@Proceedings{COLT2026,
  title     = {Proceedings of Thirty Ninth Conference on Learning Theory},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  publisher = {PMLR},
  series    = {Proceedings of Machine Learning Research},
  volume    = 336,
  year      = {2026}
}


@InProceedings{pmlr-v336-hanneke26a,
  author    = {Hanneke, Steve and Lattimore, Tor},
  title     = {Conference on Learning Theory 2026: Preface},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  publisher = {PMLR},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  pages     = {i--i},
  url       = {https://proceedings.mlr.press/v336/hanneke26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/hanneke26a/hanneke26a.pdf}
}


@InProceedings{pmlr-v336-aamand26a,
  author    = {Aamand, Anders and Aliakbarpour, Maryam and Chen, Justin Y. and Silwal, Sandeep},
  title     = {How fast can you find a good hypothesis?},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  publisher = {PMLR},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  pages     = {1--2},
  url       = {https://proceedings.mlr.press/v336/aamand26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/aamand26a/aamand26a.pdf},
  abstract  = {Hypothesis selection asks for a distribution close to an unknown $P$, given samples from $P$ and access to $n$ candidate hypotheses. We study the computational complexity of achieving statistically optimal sample complexity and approximation constants.}
}


@InProceedings{pmlr-v336-adil26a,
  author    = {Adil, Deeksha and B{\l}asiok, Jaros{\l}aw and Chen, Hongjie and Sridharan, Deepak Narayanan},
  title     = {On efficient robust regression with subquadratic samples},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  publisher = {PMLR},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  pages     = {3--74},
  url       = {https://proceedings.mlr.press/v336/adil26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/adil26a/adil26a.pdf},
  abstract  = {We revisit the problem of robust linear regression under Gaussian covariates with an unknown covariance matrix of condition number $\kappa$. For this fundamental problem, significant gaps remain in our understanding of the trade-offs among sample complexity, condition number, runtime, and prediction error for efficient algorithms. Our first result is a near-linear-time algorithm that uses $\widetilde{O}(d/\varepsilon^4)$ samples, where $d$ is the dimension and $\varepsilon$ is the corruption rate, and achieves prediction error $O(\sqrt{\varepsilon\kappa})$ under the condition $\varepsilon\kappa \lesssim 1$, improving over all prior works. We complement this result with a Statistical Query (SQ) lower bound showing that efficient SQ algorithms achieving error $o(\sqrt{\varepsilon\kappa})$ when $\varepsilon \kappa \lesssim 1$ require queries that take $\Omega(d^2)$ samples to simulate. Finally, we prove a low-degree polynomial lower bound that gives fine-grained evidence that, without assumptions such as $\varepsilon \kappa \lesssim 1$, efficient algorithms may require $\tilde{\Omega}\left(\min\{d\varepsilon^{2}\kappa^{2}, \varepsilon^{2}d^{2}\}\right)$ samples to significantly outperform the trivial estimator that always guesses $0$.}
}


@InProceedings{pmlr-v336-ahmadi26a,
  author    = {Ahmadi, Ali and Banihashem, Kiarash and Gholami, Iman and Hajiaghayi, Mohammad Taghi and Olkowski, Jan},
  title     = {Quiet Planting for $k$-SAT, Multiple Solutions of Arbitrary Geometry},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  publisher = {PMLR},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  pages     = {75--105},
  url       = {https://proceedings.mlr.press/v336/ahmadi26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/ahmadi26a/ahmadi26a.pdf},
  abstract  = {Recent work on “quiet planting” in combinatorial optimization aims to generate instances with a hidden solution that is hard to recover, typically by making the planted distribution statistically indistinguishable from uniform for specific algorithms, such as statistical queries. A prominent example is planted $k$-SAT, where $O(n^{k/2})$ clauses can be planted while maintaining indistinguishability from uniform instances, evidenced by prior hardness results which also align with findings in SAT refutation. Despite extensive research and practical use in benchmarking SAT solvers, the challenge of quietly planting multiple solutions while preserving hardness has remained an open problem.  This work initiates the study of quiet planting with an arbitrary number of solutions, proposing the first method to construct quiet planting distributions for $k$-SAT formulas that accommodate more than one solution. We provide statistical query lower bounds for distinguishing these planted instances from uniform ones, and our method allows for planting solutions with arbitrary geometric relationships, including varying Hamming distances. A key innovation facilitating multiple solutions is the ability to incorporate arbitrary correlations between variable selection in clauses and their negation patterns, departing from prior approaches. We also investigate the worst-case complexity of SAT by showing the difficulty in distinguishing satisfiable instances with numerous solutions from unsatisfiable ones, addressing an open problem of Hsieh, Mohanty, and Xu (CCC’22). 
From a technical standpoint, we generalize the concept of $(r-1)$-wise uniformness in clause distributions, proving hardness holds if the marginal distribution over negation patterns is $(r-1)$-wise uniform, and reveal a connection to binary linear codes, demonstrating how a $[k, t, r]$ code can guide the planting of up to $2^t - 1$ solutions on $k$ variables with $(r-1)$-wise uniform negation distributions.}
}


@InProceedings{pmlr-v336-aliakbarpour26a,
  author    = {Aliakbarpour, Maryam and Azizi, Alireza and Stevens, Ria},
  title     = {Optimal Prediction-Augmented Algorithms for Testing Independence of Distributions},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  publisher = {PMLR},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  pages     = {106--157},
  url       = {https://proceedings.mlr.press/v336/aliakbarpour26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/aliakbarpour26a/aliakbarpour26a.pdf},
  abstract  = {Independence testing is a fundamental problem in statistical inference: given samples from a joint distribution $p$ over multiple random variables, the goal is to determine whether $p$ is a product distribution or is $\epsilon$-far from all product distributions in total variation distance. In the non-parametric finite-sample regime, this task is notoriously expensive, as the minimax sample complexity scales polynomially with the support size. In this work, we move beyond these worst-case limitations by leveraging the framework of augmented distribution testing. We design independence testers that incorporate auxiliary, but potentially untrustworthy, predictive information. Our framework ensures that the tester remains robust, maintaining worst-case validity regardless of the prediction’s quality, while significantly improving sample efficiency when the prediction is accurate.  Our main contributions include: (i) a bivariate independence tester for discrete distributions that adaptively reduces sample complexity based on the prediction error; (ii) a generalization to the high-dimensional multivariate setting for testing the independence of $d$ random variables; and (iii) matching minimax lower bounds demonstrating that our testers achieve optimal sample complexity.}
}


@InProceedings{pmlr-v336-amsel26a,
  author    = {Amsel, Noah and Avi, Pratyush and Chen, Tyler and Duman Keles, Feyza and Hegde, Chinmay and Musco, Christopher and Musco, Cameron and Persson, David},
  title     = {Query Efficient Structured Matrix Learning},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  publisher = {PMLR},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  pages     = {158--194},
  url       = {https://proceedings.mlr.press/v336/amsel26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/amsel26a/amsel26a.pdf},
  abstract  = {We study the problem of learning a structured approximation (low-rank, sparse, banded, etc.) to an unknown matrix $\boldsymbol{\mathbf{A}}$ given access to matrix-vector product (matvec) queries of the form $\boldsymbol{\mathbf{x}} \mapsto \boldsymbol{\mathbf{A}}\boldsymbol{\mathbf{x}}$ and $\boldsymbol{\mathbf{x}} \mapsto \boldsymbol{\mathbf{A}}^\top \boldsymbol{\mathbf{x}}$. This problem is of central importance in scientific computing and machine learning, with applications to structured matrix compression, preconditioning, and as a theoretical model for operator learning.  Prior work focuses on obtaining query complexity upper and lower bounds for learning specific structured matrix families that commonly arise in applications.  We initiate the study of the problem in greater generality, aiming to understand the query complexity of learning approximations from general matrix families. Our main result focuses on finding a near-optimal approximation to $\boldsymbol{\mathbf{A}}$ from any \emph{finite-sized} family of matrices, $\mathcal{F}$. Standard results from matrix sketching show that $O(\log|\mathcal{F}|)$ matvec queries suffice in this setting. This bound can also be achieved, and is optimal, for vector-matrix-vector queries of the form $\boldsymbol{\mathbf{x}},\boldsymbol{\mathbf{y}}\mapsto \boldsymbol{\mathbf{x}}^\top\boldsymbol{\mathbf{A}}\boldsymbol{\mathbf{y}}$, which have been widely studied in work on rank-$1$ matrix sensing. Surprisingly, we show that it is possible to obtain a nearly quadratic improvement in matvec complexity, to $\tilde{O}(\sqrt{\log|\mathcal{F}|})$ and we prove that this bound is tight up to log-log factors. Via covering number arguments, our result extends to well-studied infinite families. 
For example, we show that a near-optimal approximation from any \emph{linear matrix family} of dimension $q$ can be learned with $\tilde{O}(\sqrt{q})$ matvec queries, improving on an $O(q)$ bound achievable via sketching techniques.}
}


@InProceedings{pmlr-v336-anagnostides26a,
  author    = {Anagnostides, Ioannis and Farina, Gabriele and Fishelson, Maxwell and Luo, Haipeng and Schneider, Jon},
  title     = {Swap Regret Minimization Through Response-Based Approachability},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  publisher = {PMLR},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  pages     = {195--223},
  url       = {https://proceedings.mlr.press/v336/anagnostides26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/anagnostides26a/anagnostides26a.pdf},
  abstract  = {We consider the problem of minimizing different notions of swap regret in online optimization. These forms of regret are tightly connected to correlated equilibrium concepts in games, and have been more recently shown to guarantee non-manipulability against strategic adversaries. The only computationally efficient algorithm for minimizing linear swap regret over a general convex set in $\mathbb{R}^d$ was developed recently by Daskalakis, Farina, Fishelson, Pipis, and Schneider (STOC ’25). However, it incurs a highly suboptimal regret bound of $\Omega(d^4 \sqrt{T})$ and also relies on computationally intensive calls to the ellipsoid algorithm at each iteration. In this paper, we develop a significantly simpler, computationally efficient algorithm that guarantees $O(d \sqrt{T})$ linear swap regret for a general convex set that has been preconditioned via the John ellipsoid. Our algorithm leverages the powerful response-based approachability framework of Bernstein and Shimkin (JMLR ’15)—previously overlooked in the line of work on swap regret minimization—and simultaneously minimizes profile swap regret, which was recently shown to guarantee non-manipulability. Moreover, we establish a matching information-theoretic lower bound: any learner must incur in expectation $\Omega(d \sqrt{T})$ linear swap regret for large enough $T$, even when the set is centrally symmetric. This also shows that the classic algorithm of Gordon, Greenwald, and Marks (ICML ’08) is existentially optimal for minimizing linear swap regret, although it is computationally inefficient. Finally, we extend our approach to minimize regret with respect to the set of swap deviations with polynomial dimension, unifying and strengthening recent results in equilibrium computation and online learning.}
}


@InProceedings{pmlr-v336-anderson26a,
  author    = {Anderson, Prashanti and Bafna, Mitali and Buhai, Rares-Darius and Kothari, Pravesh K. and Steurer, David},
  title     = {Dimension Reduction via Sum-of-Squares and Improved Clustering Algorithms for Non-Spherical Mixtures},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  publisher = {PMLR},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  pages     = {224--289},
  url       = {https://proceedings.mlr.press/v336/anderson26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/anderson26a/anderson26a.pdf},
  abstract  = {We develop a new approach for clustering non-spherical (i.e., arbitrary component covariances) Gaussian mixture models via a subroutine based on the sum-of-squares method that finds a low-dimensional separation-preserving projection of the input data. Our method provides a non-spherical analog of the classical dimension reduction based on singular value decomposition that, among several other applications, forms a key component of the celebrated spherical clustering algorithm of Vempala and Wang (2004). As applications, we obtain an algorithm to (1) cluster an arbitrary total-variation separated mixture of $k$ centered (i.e., zero-mean) Gaussians with $n\geq \mathrm{poly}(d) f(w_{\min}^{-1})$ samples and $\mathrm{poly}(n)$ time, and (2) cluster an arbitrary total-variation separated mixture of $k$ Gaussians with identical but arbitrary unknown covariance with $n \geq d^{O(\log w_{\min}^{-1})} f(w_{\min}^{-1})$ samples and $n^{O(\log w_{\min}^{-1})}$ time. Here, $w_{\min}$ is the minimum mixing weight of the input mixture, and $f$ does not depend on the dimension $d$. Our algorithms naturally extend to tolerate a dimension-independent fraction of arbitrary outliers. Before this work, the techniques in the state-of-the-art non-spherical clustering algorithms needed $d^{O(k)} f(w_{\min}^{-1})$ samples and time for clustering such mixtures.  Our results may come as a surprise in the context of the $d^{\Omega(k)}$ statistical query and sum-of-squares lower bounds (Diakonikolas et al. (2017, 2024)) for clustering non-spherical Gaussian mixtures. While these results are usually thought to rule out $d^{o(k)}$ cost algorithms for the problem, our results show that the lower bounds can, in fact, be circumvented for a remarkably general class of Gaussian mixtures.}
}


@InProceedings{pmlr-v336-applebaum26a,
  author    = {Applebaum, Lorne and Busa-Fekete, Robert and Chen, August and Gentile, Claudio and Koren, Tomer and Mokhtari, Aryan},
  title     = {Statistical Learning from Attribution Sets},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  publisher = {PMLR},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  pages     = {290--336},
  url       = {https://proceedings.mlr.press/v336/applebaum26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/applebaum26a/applebaum26a.pdf},
  abstract  = {We address the problem of training conversion prediction models in advertising domains under privacy constraints, where direct links between ad clicks and conversions are unavailable. Motivated by privacy-preserving browser APIs and the deprecation of third-party cookies, we study a setting where the learner observes a sequence of clicks and a sequence of conversions, but can only link a conversion to a set of candidate clicks (an attribution set) rather than a unique source. We formalize this as learning from attribution sets generated by an oblivious adversary equipped with a prior distribution over the candidates. Despite the lack of explicit labels, we construct an unbiased estimator of the population loss from these coarse signals via a novel approach. Leveraging this estimator, we show that Empirical Risk Minimization achieves generalization guarantees that scale with the informativeness of the prior and is also robust against estimation errors in the prior, despite complex dependencies among attribution sets. Simple empirical evaluations on standard datasets suggest our unbiased approach significantly outperforms common industry heuristics, particularly in regimes where attribution sets are large or overlapping.}
}


@InProceedings{pmlr-v336-armacki26a,
  author    = {Armacki, Aleksandar and Bajovi{\'c}, Dragana and Jakoveti{\'c}, Du{\v{s}}an and Kar, Soummya and Sayed, Ali H.},
  title     = {Tight Long-Term Tail Decay of (Clipped) SGD in Non-Convex Optimization},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  publisher = {PMLR},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  pages     = {337--370},
  url       = {https://proceedings.mlr.press/v336/armacki26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/armacki26a/armacki26a.pdf},
  abstract  = {The study of tail behaviour of \textbf{\texttt{SGD}}-induced processes has been attracting a lot of interest, due to offering strong guarantees with respect to individual runs of an algorithm. While many works provide high-probability guarantees, quantifying the error rate for a fixed probability threshold, there is a lack of work directly studying the probability of failure, i.e., quantifying the tail decay rate for a fixed error threshold. Moreover, existing results are of finite-time nature, limiting their ability to capture the true long-term tail decay which is more informative for modern learning models, typically trained for millions of iterations. Our work closes these gaps, by studying the long-term tail decay of \textbf{\texttt{SGD}}-based methods through the lens of large deviations theory, establishing several strong results in the process. First, we provide an upper bound on the tails of the gradient norm-squared of the best iterate produced by (vanilla) \textbf{\texttt{SGD}}, for non-convex costs and bounded noise, with long-term decay at rate $e^{-\frac{t}{\log(t)}}$. Next, we relax the noise assumption by considering clipped \textbf{\texttt{SGD}} (\textbf{\texttt{c-SGD}}) under heavy-tailed noise with bounded moment of order $p \in (1,2]$, showing an upper bound with long-term decay at rate $e^{-\frac{t^{\beta_p}}{\log(t)}}$, where $\beta_p = \frac{4(p-1)}{3p-2}$ for $p \in (1,2)$ and $e^{-\frac{t}{\log^2(t)}}$ for $p = 2$. Finally, we provide lower bounds on the tail decay, at rate $e^{-t}$, showing that our rates for both \textbf{\texttt{SGD}} and \textbf{\texttt{c-SGD}} are tight, up to poly-logarithmic factors. Notably, our results demonstrate \textit{an order of magnitude faster} long-term tail decay compared to existing work based on finite-time bounds, which show rates $e^{-\sqrt{t}}$ and $e^{-t^{\beta_p/2}}$, $p \in (1,2]$, for \textbf{\texttt{SGD}} and \textbf{\texttt{c-SGD}}, respectively. 
As such, we uncover regimes where the tails decay much faster than previously known, providing stronger long-term guarantees for individual runs.}
}


@InProceedings{pmlr-v336-arunachalam26a,
  author    = {Arunachalam, Srinivasan and Dutt, Arkopal and Gheorghiu, Alexandru and De Oliveira, Michael},
  title     = {Learning depth-3 circuits via quantum agnostic boosting},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  publisher = {PMLR},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  pages     = {371--426},
  url       = {https://proceedings.mlr.press/v336/arunachalam26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/arunachalam26a/arunachalam26a.pdf},
  abstract  = {We initiate the study of quantum agnostic learning of phase states with respect to a function class $C \subseteq \{c:\{0,1\}^n\rightarrow \{0,1\}\}$: given copies of an unknown $n$-qubit state $|\psi\rangle$ which has fidelity $\textsf{opt}$ with a phase state $|\phi_c\rangle=\frac{1}{\sqrt{2^n}}\sum_{x\in \{0,1\}^n}(-1)^{c(x)}|x\rangle$  for some $c\in C$, output  $|\phi\rangle$ which has fidelity $|\langle\phi | \psi \rangle|^2 \geq \textsf{opt}-\varepsilon$. To this end, we give agnostic learning protocols for the following classes: 1. Size-$t$ decision trees which runs in time $\textsf{poly}(n,t,1/\varepsilon)$. This also implies $k$-juntas can be agnostically learned in time $\textsf{poly}(n,2^k,1/\varepsilon)$. 2. $s$-term DNF formulas in time  $\textsf{poly}(n,(s/\varepsilon)^{\log \log (s/\varepsilon) \cdot \log(1/\varepsilon)})$. Our main technical contribution is a quantum agnostic boosting protocol which converts a “weak” agnostic learner, which outputs a parity state $|\phi\rangle$ such that $|\langle\phi|\psi\rangle|^2\geq \textsf{opt}/\textsf{poly}(n)$, into a “strong” learner which outputs a superposition of parity states $|\phi'\rangle$ such that $|\langle\phi'|\psi\rangle|^2\geq \textsf{opt} - \varepsilon$. Using quantum agnostic boosting, we give a $n^{O(\log(n/\varepsilon)\cdot \log \log n)}$-time algorithm for  $\varepsilon$-learning $\textsf{poly}(n)$-sized depth-$3$ circuits (consisting of $\textsf{AND}$, $\textsf{OR}$, $\textsf{NOT}$ gates) in the uniform $\textsf{PAC}$ model given quantum examples. Classically, obtaining an algorithm with a similar complexity has been an open question in the $\textsf{PAC}$ model and our work answers this given quantum examples.}
}


@InProceedings{pmlr-v336-asadi26a,
  author    = {Asadi, Ali and Chatterjee, Krishnendu and Goharshady, Ehsan and Karrabi, Mehrdad and Montaseri, Alipasha and Pagano, Carlo},
  title     = {Strongly Polynomial Time Complexity of Policy Iteration for $L_\infty$ Robust MDPs},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  publisher = {PMLR},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  pages     = {427--457},
  url       = {https://proceedings.mlr.press/v336/asadi26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/asadi26a/asadi26a.pdf},
  abstract  = {Markov decision processes (MDPs) are a fundamental model in sequential decision making. Robust MDPs (RMDPs) extend this framework by allowing uncertainty in transition probabilities and optimizing against the worst-case realization of that uncertainty. In particular, $(s, a)$-rectangular RMDPs with $L_\infty$ uncertainty sets form a fundamental and expressive model: they subsume classical MDPs and turn-based stochastic games. We consider this model with discounted payoffs. The existence of polynomial and strongly-polynomial time algorithms is a fundamental problem for these optimization models. For MDPs, linear programming yields polynomial-time algorithms for any arbitrary discount factor, and the seminal work of Ye established strongly-polynomial time for a fixed discount factor. The generalization of such results to RMDPs has remained an important open problem. In this work, we show that a robust policy iteration algorithm runs in strongly-polynomial time for $(s, a)$-rectangular $L_\infty$ RMDPs with a constant (fixed) discount factor, resolving an important algorithmic question.}
}


@InProceedings{pmlr-v336-ashlagi26a,
  author    = {Ashlagi, Yair and Livni, Roi and Moran, Shay and Waknine, Tom},
  title     = {Margin in Abstract Spaces},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  publisher = {PMLR},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  pages     = {458--471},
  url       = {https://proceedings.mlr.press/v336/ashlagi26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/ashlagi26a/ashlagi26a.pdf},
  abstract  = {Margin-based learning, exemplified by linear and kernel methods, is one of the few classical settings where generalization guarantees are independent of the number of parameters. This makes it a central case study in modern highly over-parameterized learning. We ask what minimal mathematical structure underlies this phenomenon. We begin with a simple margin-based problem in arbitrary metric spaces: concepts are defined by a center point and classify points according to whether their distance lies below $r$ or above $R$. We show that whenever $R>3r$, this class is learnable in \emph{any} metric space. Thus, sufficiently large margins make learnability rely only on the triangle inequality, without any linear or analytic structure being necessary. Our first main result extends this phenomenon to concepts defined by bounded linear combinations of distance functions, and reveals a sharp threshold: there exists a universal constant such that whenever the margin is larger than this constant, the class is learnable in every metric space, while below it there exist metric spaces where it is not learnable at all. We then ask whether margin-based learnability can always be explained via an embedding into a linear space – that is, reduced to linear classification in some Banach space through a kernel-type construction. We answer this negatively by demonstrating a margin learnable class that cannot be embedded into any Banach space in which linear classification with margins is learnable.}
}


@InProceedings{pmlr-v336-aznag26a,
  author    = {Aznag, Abdellah and Cummings, Rachel and Elmachtoub, Adam N.},
  title     = {A Complexity Measure for Active Learning in Multi-group Mean Estimation},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  publisher = {PMLR},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  pages     = {472--473},
  url       = {https://proceedings.mlr.press/v336/aznag26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/aznag26a/aznag26a.pdf},
  abstract  = {We study a \emph{max-risk} objective for active learning in $d$-armed bandits: a learner adaptively allocates a budget of $T$ samples across $d$ groups to minimize the worst-case per-group uncertainty index $\max_{k\in[d]}\sigma_k^2/n_k$.  We develop a local minimax framework and prove the first general lower bound for this objective, valid for any finite-variance hypothesis class $\mathcal H$. The bound separates difficulty into three orthogonal factors: a \emph{budget} term, a \emph{heteroscedasticity} index measuring how unevenly the uncertainty is spread across arms, and a model-dependent curvature functional, the \emph{Variance Local Curvature} ($\mathrm{VLC}$), which captures how much information a local change of variance creates inside $\mathcal H$.  For smooth classes, the $\mathrm{VLC}$ is a reparametrization of a variance–Fisher information, with closed-form values for common families. Benchmarking against the strongest available upper bound shows near-optimality up to logarithmic factors in broad regimes, and pinpoints a systematic gap in highly heterogeneous instances. Our proof introduces two key ingredients: a loss-induced $\ell_1$ geometry on the decision space, and a representation-based instance generator that reduces hard-instance construction to an explicit random matrix calculation.}
}


@InProceedings{pmlr-v336-bahmani26a,
  author    = {Bahmani, Sohail},
  title     = {Variational Tail Bounds for Norms of Random Vectors and Matrices},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  publisher = {PMLR},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  pages     = {474--504},
  url       = {https://proceedings.mlr.press/v336/bahmani26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/bahmani26a/bahmani26a.pdf},
  abstract  = {We propose a variational tail bound for norms of random vectors and matrices under moment assumptions on their one-dimensional marginals. A simplified version of the bound that parametrizes the “aggregating distribution” using a certain pushforward of the Gaussian distribution is also provided. We apply the proposed method to reproduce some of the well-known bounds on norms of Gaussian random vectors, and also obtain dimension-free tail bounds for the Euclidean norm of random vectors with arbitrary moment profiles. Furthermore, we reproduce a dimension-free concentration inequality for sum of independent and identically distributed positive semidefinite matrices with sub-exponential marginals, and obtain a concentration inequality for the sample covariance matrix of sub-exponential random vectors. We also obtain a tail bound for the operator norm of a random matrix series whose random coefficients may have arbitrary moment profiles. Furthermore, we use coupling to formulate an abstraction of the proposed approach that applies more broadly. As a corollary, we derive a PAC-Bayesian-style bound in terms of a certain combination of the KL and Rényi divergences between the prior and posterior distributions.}
}


@InProceedings{pmlr-v336-bansal26a,
  author    = {Bansal, Nikhil and Caro, Matthias C. and Mahajan, Gaurav},
  title     = {Cloning is as Hard as Learning for Stabilizer States},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  publisher = {PMLR},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  pages     = {505--558},
  url       = {https://proceedings.mlr.press/v336/bansal26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/bansal26a/bansal26a.pdf},
  abstract  = {The impossibility of simultaneously cloning non-orthogonal states lies at the foundations of quantum theory. Even when allowing for approximation errors, cloning an arbitrary unknown pure state requires as many initial copies as needed to fully learn the state. Rather than arbitrary unknown states, modern quantum learning theory often considers structured classes of states and exploits such structure to develop learning algorithms that outperform general-state tomography. This raises the question: How do the sample complexities of learning and cloning relate for such structured classes? We answer this question for an important class of states. Namely, for $n$-qubit stabilizer states, we show that the optimal sample complexity of cloning is $\Theta(n)$.  Thus, also for this structured class of states, cloning is as hard as learning. To prove this result, we use representation-theoretic tools in the recently proposed Abelian State Hidden Subgroup framework and a new structured version of the recently introduced random purification channel to relate stabilizer state cloning to a variant of the sample amplification problem for probability distributions that was recently introduced in classical learning theory. This allows us to obtain our cloning lower bounds by proving new sample amplification lower bounds for classes of distributions with an underlying linear structure.  Our results provide a more fine-grained perspective on No-Cloning theorems, opening up connections from foundations to quantum learning theory and quantum cryptography.}
}


@InProceedings{pmlr-v336-barzilai26a,
  title     = {Limitations of {SGD} for Multi-Index Models Beyond Statistical Queries},
  author    = {Barzilai, Daniel and Shamir, Ohad},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {559--612},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/barzilai26a/barzilai26a.pdf},
  url       = {https://proceedings.mlr.press/v336/barzilai26a.html},
  abstract  = {Understanding the limitations of gradient methods, and stochastic gradient descent (SGD) in particular, is a central challenge in learning theory. To that end, a commonly used tool is the Statistical Queries (SQ) framework, which studies performance limits of algorithms based on noisy interaction with the data. However, it is known that the formal connection between the SQ framework and SGD is tenuous: Existing results typically rely on adversarial or specially-structured gradient noise that does not reflect the noise in standard SGD, and (as we point out here) can sometimes lead to incorrect predictions. Moreover, many analyses of SGD for challenging problems rely on non-trivial algorithmic modifications, such as restricting the SGD trajectory to the sphere or using very small learning rates. To address these shortcomings, we develop a new, non-SQ framework to study the limitations of standard vanilla SGD, for single-index and multi-index models (namely, when the target function depends on a low-dimensional projection of the inputs). Our results apply to a broad class of settings and architectures, including (potentially deep) neural networks.}
}


@InProceedings{pmlr-v336-bateni26a,
  title     = {Algorithmic Thinking Theory},
  author    = {Bateni, MohammadHossein and Cohen-Addad, Vincent and Gu, Yuzhou and Lattanzi, Silvio and Meierhans, Simon and Mohri, Christopher},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {613--639},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/bateni26a/bateni26a.pdf},
  url       = {https://proceedings.mlr.press/v336/bateni26a.html},
  abstract  = {Large language models (LLMs) have proven to be highly effective for solving complex reasoning tasks. Surprisingly, their capabilities can often be improved by iterating on previously generated solutions. In this context, a reasoning plan for generating and combining a set of solutions can be thought of as an algorithm for reasoning using a probabilistic oracle. We introduce a theoretical framework for analyzing such reasoning algorithms. This framework formalizes the principles underlying popular techniques for iterative improvement and answer aggregation, providing a foundation for designing a new generation of more powerful reasoning methods. Unlike approaches for understanding models that rely on architectural specifics, our model is grounded in experimental evidence. As a result, it offers a general perspective that may extend to a wide range of current and future reasoning oracles. }
}


@InProceedings{pmlr-v336-bechavod26a,
  title     = {Omniprediction with Long-Term Constraints},
  author    = {Bechavod, Yahav and Lu, Jiuyao and Roth, Aaron},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {640--683},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/bechavod26a/bechavod26a.pdf},
  url       = {https://proceedings.mlr.press/v336/bechavod26a.html},
  abstract  = {We introduce and study the problem of online omniprediction with long-term constraints. At each round, a forecaster is tasked with generating predictions for an underlying (adaptively, adversarially chosen) state that are broadcast to a collection of downstream agents, who must each choose an action. Each of the downstream agents has both a utility function mapping actions and state to utilities, and a vector-valued constraint function mapping actions and states to vector-valued costs. The utility and constraint functions can arbitrarily differ across downstream agents. Their goal is to choose actions that guarantee themselves no regret while simultaneously guaranteeing that they do not cumulatively violate the constraints across time. We show how to make a single set of predictions so that each of the downstream agents can guarantee this by acting as a simple function of the predictions, guaranteeing each of them $\tilde{O}(|\mathcal{A}|\sqrt{T})$ regret and $O(|\mathcal{A}|)$ cumulative constraint violation for a finite action space $\mathcal{A}$. We also show how to extend our guarantees to arbitrary intersecting contextually defined \emph{subsequences}, guaranteeing each agent both regret and constraint violation bounds not just marginally, but simultaneously on each subsequence, against a benchmark set of actions simultaneously tailored to each subsequence.}
}


@InProceedings{pmlr-v336-bhaskara26a,
  title     = {Adaptive Weighted Averaging},
  author    = {Bhaskara, Aditya and Cutkosky, Ashok and Kumar, Ravi and Purohit, Manish},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {684--707},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/bhaskara26a/bhaskara26a.pdf},
  url       = {https://proceedings.mlr.press/v336/bhaskara26a.html},
  abstract  = {We study the problem of selecting the largest among $n$ unknown values $x_1,\ldots,x_n$ given only a single unbiased estimate $y_i$ for each $x_i$. We design strategies that are simultaneously admissible (not uniformly dominated by any other strategy) and also never worse than a given baseline such as uniform random selection. We provide an application to stochastic optimization, where we obtain online-to-batch conversion bounds with a desirable ``no-compromise'' guarantee: they are never worse than standard random iterate selection, and yet can be significantly better in benign settings.}
}


@InProceedings{pmlr-v336-black26a,
  title     = {Actively Learning Halfspaces without Synthetic Data},
  author    = {Black, Hadley and Larsen, Kasper Green and Mazumdar, Arya and Saha, Barna and So, Geelon},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {708--728},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/black26a/black26a.pdf},
  url       = {https://proceedings.mlr.press/v336/black26a.html},
  abstract  = {In the classic point location problem, one is given an arbitrary dataset $X$ in $d$-dimensional Euclidean space $R^d$ of $n$ points with query access to an unknown halfspace $f: R^d \to \{0,1\}$, and the goal is to learn the label of every point in $X$. This problem is extremely well-studied and a nearly-optimal $\tilde{O}(d \log n)$ query algorithm is known due to Hopkins-Kane-Lovett-Mahajan (FOCS 2020). However, their algorithm is granted the power to query arbitrary points outside of $X$ (point synthesis), and in fact without this power there is an $\Omega(n)$ query lower bound due to Dasgupta (NeurIPS 2004). Nonetheless, query access to arbitrary synthesized data points is unrealistic in many contexts. Our objective in this work is to design efficient algorithms for learning halfspaces without point synthesis. To circumvent the $\Omega(n)$ lower bound, we consider learning halfspaces whose normal vectors come from a known set of size $D$, and show tight bounds $\Theta(D + \log n)$. As a corollary, we obtain an optimal $O(d + \log n)$ query deterministic learner for the fundamental class of decision stumps (depth-one decision trees, or axis-aligned halfspaces), closing a previous gap of $O(d \log n)$ vs. $\Omega(d + \log n)$ left open in the active learning literature. In fact, our algorithm solves the more general problem of learning a Boolean function $f$ over $n$ elements which is monotone under at least one of $D$ provided orderings of these elements. Our technical insight is to exploit the structure in these orderings to essentially perform a binary search in parallel rather than considering each ordering sequentially, and we believe our approach may be of broader interest. Furthermore, we use our exact learning algorithm to obtain nearly optimal algorithms for PAC-learning. We show that $O(\min(D + \log(1/\epsilon), 1/\epsilon) \cdot \log D)$ queries suffice to learn $f$ within error $\epsilon$, even in a setting when $f$ can be adversarially corrupted on a $c\epsilon$-fraction of points, for a sufficiently small constant $c$. This bound is optimal up to a $\log D$ factor, including in the realizable setting.}
}


@InProceedings{pmlr-v336-blanchard26a,
  title     = {Characterizing Online and Private Learnability under Distributional Constraints via Generalized Smoothness},
  author    = {Blanchard, Mo{\"i}se and Shetty, Abhishek and Rakhlin, Alexander},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {729--759},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/blanchard26a/blanchard26a.pdf},
  url       = {https://proceedings.mlr.press/v336/blanchard26a.html},
  abstract  = {Understanding minimal assumptions that enable learning and generalization is perhaps the central question of learning theory. Several celebrated results in statistical learning theory, such as the VC theorem and Littlestone's characterization of online learnability, establish conditions on the hypothesis class that allow for learning under independent data and adversarial data, respectively. Building upon recent work bridging these extremes, we study sequential decision making under \emph{distributional adversaries} that can adaptively choose data-generating distributions from a fixed family $\mathcal{U}$ and ask when such problems are learnable with sample complexity that behaves like the favorable independent case. We provide a near complete characterization of families $\mathcal{U}$ that admit learnability in terms of a notion known as \emph{generalized smoothness}, i.e., a distribution family admits VC-dimension-dependent regret bounds for every finite-VC hypothesis class if and only if it is generalized smooth. Further, we give universal algorithms that achieve low regret under any generalized smooth adversary without explicit knowledge of $\mathcal{U}$. Finally, when $\mathcal{U}$ is known, we provide refined bounds in terms of a combinatorial parameter, the fragmentation number, that captures how many disjoint regions can carry nontrivial mass under $\mathcal{U}$. These results provide a nearly complete understanding of learnability under distributional adversaries. In addition, building upon the surprising connection between online learning and differential privacy, we show that the generalized smoothness also characterizes private learnability under distributional constraints.}
}


@InProceedings{pmlr-v336-block26a,
  title     = {Partition Function Estimation under Bounded $f$-Divergence},
  author    = {Block, Adam and Shetty, Abhishek},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {760--790},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/block26a/block26a.pdf},
  url       = {https://proceedings.mlr.press/v336/block26a.html},
  abstract  = {We study the statistical complexity of estimating partition functions given sample access to a proposal distribution and an unnormalized density ratio for a target distribution. While partition function estimation is a classical problem, existing guarantees typically rely on structural assumptions about the domain or model geometry. We instead provide a general, information-theoretic characterization that depends only on the relationship between the proposal and target distributions. Our analysis introduces the integrated coverage profile, a functional that quantifies how much target mass lies in regions where the density ratio is large. We show that integrated coverage tightly characterizes the sample complexity of multiplicative partition function estimation and provide matching lower bounds. We further express these bounds in terms of $f$-divergences, yielding sharp phase transitions depending on the growth rate of $f$ and recovering classical results as a special case while extending to heavy-tailed regimes. Matching lower bounds establish tightness in all regimes. As applications, we derive improved finite-sample guarantees for importance sampling and self-normalized importance sampling, and we show a strict separation between the complexity of approximate sampling and counting under the same divergence constraints. Our results unify and generalize prior analyses of importance sampling, rejection sampling, and heavy-tailed mean estimation, providing a minimal-assumption theory of partition function estimation. Along the way we introduce new technical tools including new connections between coverage and $f$-divergences as well as a generalization of the classical Paley-Zygmund inequality.}
}


@InProceedings{pmlr-v336-blondal26a,
  title     = {Tight list replicability bounds via a novel sphere covering theorem},
  author    = {Blondal, Ari and Hatami, Hamed and Hatami, Pooya and Lalov, Chavdar and Tretiak, Sivan},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {791--807},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/blondal26a/blondal26a.pdf},
  url       = {https://proceedings.mlr.press/v336/blondal26a.html},
  abstract  = {In recent years, list replicability has emerged as a framework for formalizing reproducibility in learning theory. A central question is how the required list size relates to the accuracy parameter and natural complexity measures of the hypothesis class.  To achieve sharp bounds on list replicability, we prove a novel topological sphere covering theorem, derived from the Borsuk-Ulam theorem.  Specifically, if the $d$-sphere is covered by open sets, each of which lies in an open hemisphere, then $d+1$ of these sets must have a common intersection. Using this result, we obtain a sharp bound on the relationship between list size and accuracy for VC classes. We also show that for large-margin half-spaces, provided the margin is not too large, the optimal list size equals the ambient dimension. However, when the margin is taken to be very large, we devise a replicable algorithm achieving the minimal list size of $\lceil d/2 \rceil + 1$.}
}


@InProceedings{pmlr-v336-braverman26a,
  title     = {Learning from Equivalence Queries, Revisited},
  author    = {Braverman, Mark and Livni, Roi and Mansour, Yishay and Moran, Shay and Nissim, Kobbi},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {808--836},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/braverman26a/braverman26a.pdf},
  url       = {https://proceedings.mlr.press/v336/braverman26a.html},
  abstract  = {Modern machine learning systems, such as generative models and recommendation systems, often evolve through a cycle of deploying a model, observing user interactions, and updating the model intermittently based on feedback. This mode of learning contrasts with common supervised learning frameworks, which focus on loss or regret minimization over a shared sequence of prediction tasks. Motivated by this deployment-driven learning cycle, we revisit the classical model of learning from equivalence queries, introduced by Angluin, which provides a simple abstraction of such interactions: a learner repeatedly proposes hypotheses and, whenever the deployed hypothesis is inadequate, receives a counterexample tailored to that hypothesis. Under fully adversarial counterexample generation, however, this model exhibits overly pessimistic worst-case behavior. Moreover, most existing work on learning from equivalence queries considers the \emph{full-information} setting, where the learner observes not only a counterexample but also its correct label. This is an assumption that does not always align with natural interactive settings. To address these considerations, we restrict the environment to generate counterexamples in a less adversarial manner by introducing a broad class of counterexample generators, which we call \emph{symmetric}. Informally, such symmetric counterexample generators select counterexamples based only on the symmetric difference between the hypothesis and the target, and encompass natural feedback mechanisms such as random counterexamples, as well as generators that select counterexamples minimizing a prescribed complexity measure over the instance space. Within this framework, we study learning from equivalence queries under both full-information and bandit feedback. We establish tight bounds on the number of learning rounds in both settings and outline directions for future research. 
Our techniques rely on a game-theoretic perspective on symmetric adversaries and combine adaptive weighting algorithms with minimax arguments.}
}


@InProceedings{pmlr-v336-bressan26a,
  title     = {Learning Conditional Averages},
  author    = {Bressan, Marco and Brukhim, Nataly and Cesa-Bianchi, Nicol{\`o} and Esposito, Emmanuel and Mansour, Yishay and Moran, Shay and Thiessen, Maximilian},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {837--858},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/bressan26a/bressan26a.pdf},
  url       = {https://proceedings.mlr.press/v336/bressan26a.html},
  abstract  = {We introduce the problem of learning \emph{conditional averages} in the PAC framework. The learner receives a sample labeled by an unknown target concept from a known concept class, as in standard PAC learning. However, instead of learning the target concept itself, the goal is to predict, for each instance, the average label over its \emph{neighborhood}---an arbitrary subset of points that contains the instance. In the degenerate case where all neighborhoods are singletons, the problem reduces exactly to classic PAC learning. More generally, it extends PAC learning to a setting that captures learning tasks arising in several domains, including explainability, fairness, and recommendation systems. Our main contribution is a complete characterization of when conditional averages are learnable, together with sample complexity bounds that are tight up to logarithmic factors. The characterization hinges on the joint finiteness of two novel combinatorial parameters, which depend on both the concept class and the neighborhood system, and are closely related to the independence number of the associated neighborhood graph.}
}


@InProceedings{pmlr-v336-bressan26b,
  title     = {Active Learning on Adversarially Corrupted Graphs},
  author    = {Bressan, Marco and Cesa-Bianchi, Nicol{\`o} and d'Orsi, Tommaso and Esposito, Emmanuel and Lattanzi, Silvio},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {859--895},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/bressan26b/bressan26b.pdf},
  url       = {https://proceedings.mlr.press/v336/bressan26b.html},
  abstract  = {Motivated by real-world scenarios where malicious entities tamper with existing networks, we define a model where an adversary seeks to hide a set of corrupted vertices inside a  graph $G^*$. To this end, the adversary can add edges between the corrupted vertices, as well as edges between the corrupted vertices and $G^*$, and its power is then measured by the size of the neighborhood of the corrupted vertices in $G^*$. Our goal is to design an active learning algorithm that efficiently finds the subset of corrupted vertices using a small number of label queries. We devise an efficient algorithm that approximately recovers the corrupted vertices with a query complexity that depends polynomially on both the power of the adversary and the vertex expansion of $G^*$, a fundamental measure of graph connectivity. At the heart of this result is a polynomial-time algorithm, obtained by carefully adapting sum-of-squares algorithms for approximating minimum expansion, that finds a set with small vertex expansion subject to cardinality constraints. To the best of our knowledge, this is the first time that the vertex expansion is shown to play a key role in determining the query complexity of active learning algorithms robust to structural adversarial attacks.}
}


@InProceedings{pmlr-v336-cannella26a,
  title     = {Universal priors: solving empirical {Bayes} via {Bayesian} inference and pretraining},
  author    = {Cannella, Nick and Teh, Anzo and Han, Yanjun and Polyanskiy, Yury},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {896--937},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/cannella26a/cannella26a.pdf},
  url       = {https://proceedings.mlr.press/v336/cannella26a.html},
  abstract  = {We theoretically justify the recent empirical finding of Teh et al. (2025) that a transformer pretrained on synthetically generated data achieves strong performance on empirical Bayes (EB) problems. We take an indirect approach to this question: rather than analyzing the model architecture or training dynamics, we ask why a pretrained Bayes estimator, trained under a prespecified training distribution, can adapt to arbitrary test distributions. Focusing on Poisson EB problems, we identify the existence of universal priors such that training under these priors yields a near-optimal regret bound of $\widetilde{O}(\frac{1}{n})$ uniformly over all test distributions. Our analysis leverages the classical phenomenon of posterior contraction in Bayesian statistics, showing that the pretrained Bayes estimator adapts to unknown test distributions precisely through posterior contraction. This perspective also explains the phenomenon of length generalization, in which the test sequence length exceeds the training length, as the model performs Bayesian inference using a fractional posterior.}
}


@InProceedings{pmlr-v336-carpentier26a,
  title     = {Phase Transition for Stochastic Block Model with more than $\sqrt{n}$ Communities},
  author    = {Carpentier, Alexandra and Giraud, Christophe and Verzelen, Nicolas},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {938--1000},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/carpentier26a/carpentier26a.pdf},
  url       = {https://proceedings.mlr.press/v336/carpentier26a.html},
  abstract  = {Predictions from statistical physics postulate that recovery of the communities in the Stochastic Block Model (SBM) with a fixed number $K$ of communities is possible in polynomial time above, and only above, the Kesten-Stigum (KS) threshold. This conjecture has given rise to a rich literature,  proving that non-trivial community recovery is indeed possible in SBM above the KS threshold. Failure of low-degree polynomials (LDP) below the KS threshold was also proven, as long as $K\ll \sqrt{n}$, where $n$ is the number of nodes in the observed graph. When $K\geq \sqrt{n}$, Chin et al. (2025) recently proved that,  in a \emph{sparse regime},  community recovery  in polynomial time is possible below the KS threshold by counting non-backtracking paths. This breakthrough led them to postulate a new threshold for the many-communities regime $K\geq \sqrt{n}$. In this work, we provide evidence supporting their conjecture: 1- We prove that, for \emph{any graph density}, LDP fail to recover communities below the threshold postulated by Chin et al. (2025); 2- We prove that community recovery is possible in polynomial time above the postulated threshold, not only in the \emph{sparse regime} considered in Chin et al. (2025), but also in \emph{moderately sparse regimes}, by counting occurrences of some specific motifs inspired by the LDP analysis. In particular, counting  self-avoiding paths of length $\log(n)$---which is closely related to spectral algorithms based on the Non-Backtracking operator---is optimal only in the sparse regime. More complex motifs based on the blow-up of a cycle must be considered in denser regimes. }
}


@InProceedings{pmlr-v336-cesa-bianchi26a,
  title     = {Learning Periodic Strategies in Blocking Bandits Is as Hard as Bandits with Switching Costs},
  author    = {Cesa-Bianchi, Nicol{\`o} and Honda, Junya and Kuroki, Yuko and Miyauchi, Atsushi and Zierahn, Lukas},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {1001--1021},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/cesa-bianchi26a/cesa-bianchi26a.pdf},
  url       = {https://proceedings.mlr.press/v336/cesa-bianchi26a.html},
  abstract  = {In blocking $K$-armed bandits, playing an arm renders it unavailable for a fixed number of future rounds. While this model is relatively well understood in the stochastic regime, much less is known when rewards are generated adversarially. Via a novel reduction, we first show that computing the total reward of the best dynamic policy is NP-hard, even when the blocking time $d > 1$ is identical across arms. We therefore turn to tractable comparators and study the class of $d$-periodic policies, proving that the optimal periodic policy is efficiently computable and always obtains at least a $\frac{1}{K}$ fraction of the dynamic optimum. We also show that this $\frac{1}{K}$ factor is information-theoretically tight: no algorithm can achieve sublinear $\alpha$-regret with respect to the offline optimal dynamic policy for any $\alpha > \frac{1}{K}$. Our main result shows that $T^{2/3}$ is the minimax rate for the regret (against periodic policies) for adversarial blocking bandits with identical blocking times, and that this rate is achievable by an efficient algorithm. Our main technical contribution is the lower bound, which establishes that blocking bandits are at least as hard as bandits with switching costs. The matching upper bound instead follows from a reduction to combinatorial semi-bandits over bipartite matchings. Finally, we show that $\sqrt{T}$ regret rates are efficiently achievable in the full information setting, and more generally via $\alpha$-regret with $\alpha = \frac{1}{2}$.}
}


@InProceedings{pmlr-v336-charikar26a,
  title     = {A Characterization of List Language Identification in the Limit},
  author    = {Charikar, Moses and Pabbaraju, Chirag and Tewari, Ambuj},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {1022--1053},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/charikar26a/charikar26a.pdf},
  url       = {https://proceedings.mlr.press/v336/charikar26a.html},
  abstract  = {We study the problem of language identification in the limit, where given a sequence of examples from a target language, the goal of the learner is to output a sequence of guesses for the target language such that all the guesses beyond some finite time are correct. Classical results of Gold showed that language identification in the limit is impossible for essentially any interesting collection of languages. Later, Angluin gave a precise characterization of language collections for which language identification is possible. Motivated by recent positive results for the related problem of language generation, we revisit the classic language identification problem in the setting where the learner is given the additional power of producing a list of $k$ guesses at each time step. The goal is to ensure that beyond some finite time, one of the guesses is correct at each time step. Such list learning versions of several basic learning problems have been widely studied. We give an exact characterization of collections of languages that can be $k$-list identified in the limit, based on a recursive version of Angluin's characterization (for language identification with a list of size $1$). This further leads to a conceptually appealing characterization: A language collection can be $k$-list identified in the limit if and only if the collection can be decomposed into $k$ collections of languages, each of which can be identified in the limit (with a list of size $1$). We also use our characterization to establish rates for list identification in the statistical setting where the input is drawn as an i.i.d. stream from a distribution supported on some language in the collection. Our results show that if a collection is $k$-list identifiable in the limit, then the collection can be $k$-list identified at an exponential rate, and this is best possible. On the other hand, if a collection is not $k$-list identifiable in the limit, then it cannot be $k$-list identified at any rate that goes to zero.}
}


@InProceedings{pmlr-v336-charikar26b,
  title     = {Language Identification with Succinct Machine-Independent Traces},
  author    = {Charikar, Moses and Kleinberg, Jon and Pabbaraju, Chirag},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {1054--1074},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/charikar26b/charikar26b.pdf},
  url       = {https://proceedings.mlr.press/v336/charikar26b.html},
  abstract  = {Motivated by the power of large language models, there has been renewed interest in the Gold-Angluin model of language identification in the limit, with an eye toward variants of the model that might overcome the negative results for its original formulation. Recent papers on this question have proposed looking at computational traces and annotations of training strings as a source of additional power for a learner, reflecting empirical regularities such as the way that commented source code is easier to learn from than arbitrary source code, and text annotated with algorithmically generated chain-of-thought tokens can be easier to learn from than the raw text itself. This recent work has shown positive results for language identification in the presence of such computational traces, but the traces in these positive results come from explicit automata-theoretic machine models that generate the language, where the underlying vocabulary of tokens for the traces is very large. In this paper, we address two fundamental issues left open by this line of work: can we achieve positive results with traces that use only a small alphabet, and can we define traces directly from the language itself, without requiring an underlying machine model that generates it? We establish positive results for both of these questions: for an arbitrary collection of languages, we show how to define computational traces that enable identification in the limit, using an alphabet of tokens that is linear in the size of the alphabet that the languages are defined over, and independent of any other properties of the languages.}
}


@InProceedings{pmlr-v336-chase26a,
  title     = {A Tight Lower Bound for Non-stochastic Multi-armed Bandits with Expert Advice},
  author    = {Chase, Zachary and Ito, Shinji and Mehalel, Idan},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {1075--1087},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/chase26a/chase26a.pdf},
  url       = {https://proceedings.mlr.press/v336/chase26a.html},
  abstract  = {We determine the minimax optimal expected regret in the classic non-stochastic multi-armed bandit with expert advice problem, by proving a lower bound that matches the upper bound of [Kale ’14]. The two bounds determine the minimax optimal expected regret to be $\Theta\left( \sqrt{T K \log \frac{N}{K} } \right)$, where $K$ is the number of arms, $N$ is the number of experts, and $T$ is the time horizon.}
}


@InProceedings{pmlr-v336-chen26a,
  title     = {Faster {Newton} Methods for Convex and Nonconvex Optimization in Gradient Complexity},
  author    = {Chen, Lesi and Liu, Chengchang and Luo, Luo and Zhang, Jingzhao},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {1088--1112},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/chen26a/chen26a.pdf},
  url       = {https://proceedings.mlr.press/v336/chen26a.html},
  abstract  = {Second-order optimization methods are computationally expensive for large-scale problems. Recently, Doikov, Chayti, and Jaggi (ICML 2023) proposed the LazyCRN method that reduces computation by studying the gradient complexity of second-order methods. Their method can achieve a gradient complexity of $\mathcal{O}( \bar d + \bar d^{1/2} \epsilon^{-3/2})$ and $\mathcal{O}( \bar d + \bar d^{1/2} \epsilon^{-1/2})$ for nonconvex and convex optimization, respectively, where $\bar d$ is the effective dimension and $\epsilon$ is the target precision. Very recently, Adil, Bullins, Sidford, and Zhang (NeurIPS 2025) improved the gradient complexity to $\mathcal{O}( \bar d + \bar d^{1/3} \epsilon^{-3/2} \ln^{18} \epsilon^{-1})$ for nonconvex optimization. However, the tightness of these methods remains open. In this work, we propose new methods that achieve an improved complexity of $\mathcal{O}( \bar d + \bar d^{1/3} \epsilon^{-3/2})$ and $\mathcal{O}( (\bar d + \bar d^{13/21} \epsilon^{-2/7}) \ln \bar d)$ for nonconvex and convex optimization, respectively, improving best-known results for both setups.}
}


@InProceedings{pmlr-v336-chen26b,
  title     = {Separating Oblivious and Adaptive Models of Variable Selection (Extended Abstract)},
  author    = {Chen, Ziyun and Li, Jerry and Tian, Kevin and Zhu, Yusong},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {1113--1114},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/chen26b/chen26b.pdf},
  url       = {https://proceedings.mlr.press/v336/chen26b.html},
  abstract  = {Sparse recovery is among the most well-studied problems in learning theory and high-dimensional statistics. In this work, we investigate the statistical and computational landscapes of sparse recovery with $\ell_\infty$ error guarantees. This variant of the problem is motivated by \emph{variable selection} tasks, where the goal is to estimate the support of a $k$-sparse signal in $\mathbb{R}^d$. Our main contribution is a provable separation between the \emph{oblivious} (“for each”) and \emph{adaptive} (“for all”) models of $\ell_\infty$ sparse recovery. We show that under an oblivious model, the optimal $\ell_\infty$ error is attainable in near-linear time with $\approx k\log d$ samples, whereas in an adaptive model, $\gtrsim k^2$ samples are necessary for any algorithm to achieve this bound. This establishes a surprising contrast with the standard $\ell_2$ setting, where $\approx k \log d$ samples suffice even for adaptive sparse recovery. We conclude with a preliminary examination of a \emph{partially-adaptive} model, where we show nontrivial variable selection guarantees are possible with $\approx k\log d$ measurements.}
}


@InProceedings{pmlr-v336-chen26c,
  title     = {Instance-optimal high-precision shadow tomography with few-copy measurements: A metrological approach},
  author    = {Chen, Senrui and Gong, Weiyuan and Zhou, Sisi},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {1115--1185},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/chen26c/chen26c.pdf},
  url       = {https://proceedings.mlr.press/v336/chen26c.html},
  abstract  = {We give the first instance-optimal sample complexity bounds for shadow tomography using few-copy measurements in the high-precision regime. More concretely, we study the problem of learning expectation values of a given set of observables of an unknown quantum state to precision $\epsilon$ in $L_p$-norm, using (possibly adaptive) measurements that act on one or a few copies at a time, and we are interested in the regime that $\epsilon$ is below some concrete and potentially dimension-dependent threshold. In this setup, we prove the necessary and sufficient number of copies, for any given set of observables, is characterized by a simple optimization formula involving a quadratic form of the inverse Fisher information matrix up to a logarithmic factor. Our results establish a rigorous correspondence between quantum learning and quantum metrology.}
}


@InProceedings{pmlr-v336-chen26d,
  title     = {Information-computation gaps in quantum learning via low-degree likelihood},
  author    = {Chen, Sitan and Gong, Weiyuan and Haferkamp, Jonas and Quek, Yihui},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {1186--1278},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/chen26d/chen26d.pdf},
  url       = {https://proceedings.mlr.press/v336/chen26d.html},
  abstract  = {In a variety of physically relevant settings for learning from quantum data, there is an established recipe for measuring polynomially many copies of that data such that the resulting measurement readouts contain enough information to reconstruct the underlying system. Yet designing protocols that can computationally efficiently extract that information remains largely an art, and there are important cases where we believe this to be impossible, that is, where there is an information-computation gap. While there is a large array of tools in the classical literature for giving evidence for average-case hardness of statistical inference problems, the corresponding tools in the quantum literature are far more limited. One such framework in the classical literature, the low-degree method, makes predictions about hardness of inference problems based on the failure of estimators given by low-degree polynomials. In this work, we extend this framework to the quantum setting and show a number of new information-computation gaps for quantum learning. We establish a general connection between state designs and low-degree hardness. We use this to obtain the first information-computation gaps for learning Gibbs states of random, sparse, non-local Hamiltonians. We also use it to prove hardness for learning random shallow quantum circuit states in a challenging model where states can be measured in round-based adaptively chosen bases. To our knowledge, the ability to model adaptivity within the low-degree framework was open even in classical settings. In addition, we also obtain a low-degree hardness result for quantum error mitigation against strategies with single-qubit measurements. We define a new quantum generalization of the planted biclique problem and identify the threshold at which this problem becomes computationally hard for protocols that perform local measurements. Interestingly, the complexity landscape for this problem shifts when going from local measurements to more entangled single-copy measurements. We show average-case hardness for the “standard” variant of Learning Stabilizers with Noise (Poremba et al., 2024) and for agnostically learning product states (Bakshi et al., 2024a).}
}


@InProceedings{pmlr-v336-chen26e,
  title     = {Optimal Inference Schedules for Masked Diffusion Models},
  author    = {Chen, Sitan and Cong, Kevin and Li, Jerry},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {1279--1311},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/chen26e/chen26e.pdf},
  url       = {https://proceedings.mlr.press/v336/chen26e.html},
  abstract  = {A major bottleneck of standard auto-regressive large language models is that their inference process is inherently sequential, resulting in very long and costly inference times. To circumvent this, practitioners proposed a class of language models called \emph{diffusion language models}, of which the \emph{masked diffusion model} (MDM) is the most successful. The MDM is able to sample tokens out-of-order and, ostensibly, many tokens at once and in parallel. However, there is very limited rigorous understanding of how much parallel sampling these models can perform without noticeable degradation in their sampling performance. Prior work in Li and Cai (2025) obtained some preliminary bounds, but these are not tight for many natural classes of distributions. In this work, we give a new, \emph{exact} characterization of the expected divergence between the true distribution and the sampled distribution, for any distribution and any unmasking schedule for the sampler, showing an elegant connection to the theory of \emph{univariate function approximation}. By leveraging this connection, we then attain a number of novel lower and upper bounds for this problem. While the connection to function approximation in principle gives the optimal unmasking schedule for any distribution, we show that it is in general impossible to compete with it without strong \emph{a priori} knowledge of the distribution, even in seemingly benign settings. However, we also demonstrate new upper bounds and new sampling schedules in terms of well-studied information-theoretic properties of the base distribution, namely, its \emph{total correlation} and \emph{dual total correlation}, which show that in some natural settings, one can sample in $O(\log n)$ steps without any visible loss in performance, where $n$ is the total sequence length.}
}


@InProceedings{pmlr-v336-chen26f,
  title     = {Self-Normalized Martingales and Uniform Regret Bounds for Linear Regression},
  author    = {Chen, Fan and Qian, Jian and Rakhlin, Alexander and Zhivotovskiy, Nikita},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {1312--1340},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/chen26f/chen26f.pdf},
  url       = {https://proceedings.mlr.press/v336/chen26f.html},
  abstract  = {Self-normalized martingale inequalities lie at the heart of confidence ellipsoids for online least squares and, more broadly, many bandit and reinforcement-learning results. Yet existing vector and scalar results typically rely on bounded covariates and an explicit regularization matrix, producing bounds that are \emph{not scale-invariant}: although the self-normalized quantity is scale-invariant by definition, its standard upper bounds are not. We characterize when scale-invariant upper bounds on self-normalized martingales are possible. Without further assumptions, we prove that nontrivial scale-invariant bounds exist only in dimension $d=1$; moreover, in $d=1$ we obtain $O(\log T)$ scale-invariant self-normalized bounds without any assumptions on the covariates. In contrast, for $d>1$ we show that no nontrivial scale-invariant bound can hold in full generality. We then connect this dichotomy to \emph{doubly-uniform} regret in online linear regression (i.e., regret bounds that are simultaneously independent of the covariate scale and the comparator norm) and use it to resolve the open question of Gaillard, Gerchinovitz, Huard, and Stoltz, \emph{“Uniform regret bounds over $\mathbb{R}^d$ for the sequential linear regression problem with the square loss”} (ALT 2019): in $d=1$ we give an explicit algorithm with $O(\log T)$ doubly-uniform regret, whereas for $d>1$ sublinear doubly-uniform regret is impossible. Finally, under a natural \emph{smoothness} condition (bounded Radon–Nikodym derivatives of the conditional covariate laws with respect to a fixed base measure), we recover sublinear regret for $d>1$ without bounded covariates and derive a self-normalized concentration inequality free of the usual regularization penalties, yielding arguably a first natural scale-invariant bound for adaptive, non-i.i.d. vector martingales.}
}


@InProceedings{pmlr-v336-chen26g,
  title     = {High-Accuracy Log-Concave Sampling with Stochastic Queries},
  author    = {Chen, Fan and Chewi, Sinho and Daskalakis, Constantinos and Rakhlin, Alexander},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {1341--1372},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/chen26g/chen26g.pdf},
  url       = {https://proceedings.mlr.press/v336/chen26g.html},
  abstract  = {We show that high-accuracy guarantees for log-concave sampling—that is, iteration and query complexities which scale as $\mathrm{poly}\log(1/\delta)$, where $\delta$ is the desired target accuracy—are achievable using stochastic gradients with sub-exponential tails. Notably, this exhibits a separation with the problem of convex optimization, where stochasticity (even additive Gaussian noise) in the gradient oracle incurs $\mathrm{poly}(1/\delta)$ queries. We also give an information-theoretic argument that light-tailed stochastic gradients are necessary for high accuracy: for example, in the bounded variance case, we show that the minimax-optimal query complexity scales as $\Theta(1/\delta)$. Our framework also provides similar high-accuracy guarantees under stochastic zeroth-order (value) queries, and an improved complexity result for sampling from finite-sum potentials.}
}


@InProceedings{pmlr-v336-chen26h,
  title     = {Calibeating Made Simple},
  author    = {Chen, Yurong and Huang, Zhiyi and Jordan, Michael I. and Luo, Haipeng},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {1373--1398},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/chen26h/chen26h.pdf},
  url       = {https://proceedings.mlr.press/v336/chen26h.html},
  abstract  = {We study calibeating, the problem of post-processing external forecasts online to minimize cumulative losses and match an informativeness-based benchmark. Unlike prior work, which analyzed calibeating for specific losses with specific arguments, we reduce calibeating to existing online learning techniques and obtain results for general proper losses. More concretely, we first show that calibeating is minimax-equivalent to regret minimization. This recovers the $O(\log T)$ calibeating rate of Foster and Hart (2023) for the Brier and log losses and its optimality, and yields new optimal calibeating rates for exp-concave losses and general bounded losses. Second, we prove that multi-calibeating is minimax-equivalent to the combination of calibeating and the classical expert problem. This yields new optimal multi-calibeating rates for exp-concave losses, including Brier and log losses, and general bounded losses. Finally, we obtain new bounds for achieving calibeating and calibration simultaneously for the Brier loss. For binary predictions, our result gives the first calibrated algorithm that at the same time also achieves the optimal $O(\log T)$ calibeating rate.}
}


@InProceedings{pmlr-v336-cheng26a,
  title     = {Is Memorization Helpful or Harmful? {Prior} Information Sets the Threshold},
  author    = {Cheng, Chen and Barber, Rina Foygel},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {1399--1433},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/cheng26a/cheng26a.pdf},
  url       = {https://proceedings.mlr.press/v336/cheng26a.html},
  abstract  = {We examine the connection between training error and generalization error for arbitrary estimating procedures, working in an overparameterized linear model under general priors in a Bayesian setup. We find determining factors inherent to the prior distribution $\pi$, giving explicit conditions under which optimal generalization necessitates that the training error be (i) near interpolating relative to the noise size (i.e., memorization is necessary), or (ii) close to the noise level (i.e., overfitting is harmful). Remarkably, these phenomena occur when the noise reaches thresholds determined by the Fisher information and the variance parameters of the prior $\pi$.}
}


@InProceedings{pmlr-v336-chewi26a,
  title     = {{DDPM} Score Matching and Distribution Learning (Extended Abstract)},
  author    = {Chewi, Sinho and Kalavasis, Alkis and Mehrotra, Anay and Montasser, Omar},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {1434--1435},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/chewi26a/chewi26a.pdf},
  url       = {https://proceedings.mlr.press/v336/chewi26a.html},
  abstract  = {Score estimation is the backbone of score-based generative models (SGMs), and particularly denoising diffusion probabilistic models (DDPMs). A fundamental theoretical result in this area is that, given access to accurate score estimates, SGMs can efficiently generate from any realistic data distribution (Chen, Chewi, Li, Li, Salim, and Zhang, ICLR’23; Lee, Lu, and Tan, ALT’23). This can be viewed as a result on distribution learning, where the learned distribution is implicit as the law of the output of a sampler. However, it is unclear how score estimation relates to more classical forms of distribution learning, such as parameter estimation and density estimation. We present a framework reducing the other two forms of distribution learning to score estimation, which has various implications in statistical and computational learning theory: parameter estimation, where denoising score matching in DDPMs is asymptotically efficient; density estimation, where estimated scores can be lifted to a $(\epsilon,\delta)$-PAC density estimator and yield minimax rates over Hölder classes and a quasi-polynomial PAC density estimation algorithm for Gaussian location mixtures; and lower bounds for score estimation, where PAC density estimation yields computational lower bounds for score estimation of general distribution families and cryptographic lower bounds for score estimation of general Gaussian mixture models.}
}


@InProceedings{pmlr-v336-compton26a,
  title     = {Density estimation for {Hellinger} via minimum-distance estimators: mixtures of {Gaussians}, log-concave, and more},
  author    = {Compton, Spencer and Li, Jerry},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {1436--1475},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/compton26a/compton26a.pdf},
  url       = {https://proceedings.mlr.press/v336/compton26a.html},
  abstract  = {We study the task of density estimation, where we hope to accurately estimate a probability density from $n$ samples. A textbook method for density estimation in total variation distance is the minimum-distance estimator approach, where we conclude both the algorithm and the analysis merely from bounding the VC dimension of a particular concept class (the so-called Yatracos class).  While this technique has originally yielded sharp guarantees primarily for total variation distance, in this work we extend the minimum-distance estimator approach for learning within Hellinger distance. Our main observation is that we may produce an analogous recipe for Hellinger (where we only require bounding the VC dimension of a related concept class) by drawing connections to recent results yielding reverse data processing inequalities.  This recipe is flexible enough to accommodate fast algorithms originally designed for total variation distance; by modifying the approach of Acharya et al. (2017) we conclude the first near-linear time algorithm for learning classes including univariate mixtures of log-concave densities and mixtures of Gaussians (with arbitrary variances), with near-optimal sample complexity.}
}


@InProceedings{pmlr-v336-cranston26a,
  title     = {Eigen-Spike Emergence and Quadratic Equivalents for Conjugate Kernels on Nonlinearly Separable Data},
  author    = {Cranston, Collin and Wang, Zhichao and Kemp, Todd and Mahoney, Michael W.},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {1476--1574},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/cranston26a/cranston26a.pdf},
  url       = {https://proceedings.mlr.press/v336/cranston26a.html},
  abstract  = {Recent work in random matrix theory (RMT) has developed the notion of deterministic equivalents: typically linear surrogate models that approximate the spectral behavior of large nonlinear random matrices, such as nonlinear feature maps in neural networks (NNs). Such equivalents make theoretical predictions tractable by reducing a complex model to a simpler one with properties that fall under the umbrella of classical RMT tools. However, this leaves open the question of whether this idealized linear equivalence remains meaningful when dealing with classification problems of high-dimensional nonlinearly separable data. Motivated by this, we consider the conjugate kernel (CK), which is the nonlinear feature map of a one-layer feedforward NN, under a canonical nonlinearly separable dataset for the XOR problem; and we use the study of informative outlier eigenvalues in the CK and whether their corresponding eigenvectors asymptotically align with XOR labels as a proxy for nonlinear learnability. We develop a robust quadratic equivalent of the CK matrix that enables a precise analysis of emergent informative spikes, as one modifies various knobs common in ML practice: sample complexity, signal-to-noise ratio (SNR), nonlinear activation choice, and pretrained features. We identify regimes in which these knobs move the CK beyond the linear equivalent and produce BBP-type transitions to label-aligned outlier eigenspaces. Our analysis helps bring deterministic-equivalence tools from RMT to bear on problems of practical relevance in ML.}
}


@InProceedings{pmlr-v336-crawshaw26a,
  title     = {Tight Bounds for Logistic Regression with Large Stepsize Gradient Descent in Low Dimension},
  author    = {Crawshaw, Michael and Liu, Mingrui},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {1575--1610},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/crawshaw26a/crawshaw26a.pdf},
  url       = {https://proceedings.mlr.press/v336/crawshaw26a.html},
  abstract  = {We consider the optimization problem of minimizing the logistic loss with gradient descent to train a linear model for binary classification with separable data. With a budget of $T$ iterations, it was recently shown that an accelerated $1/T^2$ rate is possible by choosing a large stepsize $\eta = \Theta(\gamma^2 T)$ (where $\gamma$ is the dataset’s margin) despite the resulting non-monotonicity of the loss. In this paper, we provide a tighter analysis of gradient descent for this problem when the data is two-dimensional: we show that GD with a sufficiently large learning rate $\eta$ finds a point with loss smaller than $\mathcal{O}(1/(\eta \gamma^2 T))$, as long as $T \geq \Omega(n/\gamma + 1/\gamma^2)$, where $n$ is the dataset size. Our improved rate comes from a tighter bound on the time $\tau$ that it takes for GD to transition from unstable (non-monotonic loss) to stable (monotonic loss), via a fine-grained analysis of the oscillatory dynamics of GD in the subspace orthogonal to the max-margin classifier. We also provide a lower bound of $\tau$ matching our upper bound up to logarithmic factors, showing that our analysis is tight.}
}


@InProceedings{pmlr-v336-dandi26a,
  title     = {Rigorous Asymptotics for First-Order Algorithms Through the Dynamical Cavity Method},
  author    = {Dandi, Yatin and Gamarnik, David and Pernice, Francisco and Zdeborov{\'a}, Lenka},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {1611--1646},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/dandi26a/dandi26a.pdf},
  url       = {https://proceedings.mlr.press/v336/dandi26a.html},
  abstract  = {Dynamical Mean Field Theory (DMFT) provides an asymptotic description of the dynamics of macroscopic observables in certain disordered systems. Originally pioneered in the context of spin glasses, it has since been used to derive asymptotic dynamical equations for a wide range of models in physics, high-dimensional statistics and machine learning. One of the main tools used by physicists to obtain these equations is the dynamical cavity method, which has remained largely non-rigorous. In contrast, existing mathematical formalizations have relied on alternative approaches, including Gaussian conditioning, large deviations over paths, or Fourier analysis. In this work, we formalize the dynamical cavity method and use it to give a new proof of the DMFT equations for General First Order Methods, a broad class of dynamics encompassing algorithms such as Gradient Descent and Approximate Message Passing.}
}


@InProceedings{pmlr-v336-daskalakis26a,
  title     = {Estimating {Ising} Models in Total Variation Distance},
  author    = {Daskalakis, Constantinos and Kandiros, Vardis and Yao, Rui},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {1647--1714},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/daskalakis26a/daskalakis26a.pdf},
  url       = {https://proceedings.mlr.press/v336/daskalakis26a.html},
  abstract  = {We consider the problem of estimating Ising models over $n$ variables in Total Variation (TV) distance, given $l$ independent samples from the model. While the statistical complexity of the problem is well-understood Devroye et al. (2020), identifying computationally and statistically efficient algorithms has been challenging. In particular, remarkable progress has occurred in several settings, such as when the underlying graph is a tree Daskalakis and Pan (2021); Bhattacharyya et al. (2021), when the entries of the interaction matrix follow a Gaussian distribution Gaitonde and Mossel (2024); Chandrasekaran and Klivans (2024), or when the bulk of its eigenvalues lie in a small interval Anari et al. (2024a); Koehler et al. (2024), but no unified framework for polynomial-time estimation in TV exists so far. Our main contribution is a unified analysis of the Maximum Pseudo-Likelihood Estimator (MPLE) for two general classes of Ising models. The first class includes models whose interaction matrix has a bounded operator norm. In particular, we focus on the subclass of models that satisfy the Modified Log-Sobolev Inequality (MLSI), a functional inequality that was introduced to study the convergence of the associated Glauber dynamics to stationarity. In the second class of models, the interaction matrix has bounded infinity norm (or bounded width), which is the most common assumption in the literature for structure learning of Ising models. We show how our general results for these classes yield polynomial-time algorithms and optimal or near-optimal sample complexity guarantees in a variety of settings. Our proofs employ a variety of tools from tensorization inequalities to measure decompositions and concentration bounds.}
}


@InProceedings{pmlr-v336-deng26a,
  title     = {Stochastic Safe Action Model Learning},
  author    = {Deng, Zihao and Juba, Brendan},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {1715--1736},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/deng26a/deng26a.pdf},
  url       = {https://proceedings.mlr.press/v336/deng26a.html},
  abstract  = {Hand-crafting models of interactive domains is challenging, especially when the dynamics of the domain are stochastic. We thus wish to automatically learn such models instead. In this work, we propose an algorithm to learn stochastic planning models where the distribution over the sets of effects for each action has a small support, but the sets may set values to an arbitrary number of attributes. This class captures many benchmark domains, in contrast to prior work that assumed independence of the effects on individual attributes. Our algorithm has polynomial time and sample complexity when the support size is bounded by a constant. Importantly, our method is safe in that we learn offline from example trajectories and we guarantee that actions are only permitted in states where our model of the dynamics is accurate. Moreover, we guarantee approximate completeness of the model, in the sense that if the examples are achieving goals from some distribution, then with high probability there will exist plans in our learned model that achieve goals from the same distribution.}
}


@InProceedings{pmlr-v336-derezinski26a,
  title     = {The matrix-vector complexity of {$Ax=b$}},
  author    = {Derezi{\'n}ski, Micha{\l} and Epperly, Ethan N. and Meyer, Raphael A.},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {1737--1770},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/derezinski26a/derezinski26a.pdf},
  url       = {https://proceedings.mlr.press/v336/derezinski26a.html},
  abstract  = {Matrix--vector algorithms, particularly Krylov subspace methods, are widely viewed as the most effective algorithms for solving large systems of linear equations. This paper establishes lower bounds on the worst-case number of matrix--vector products needed by such an algorithm to approximately solve a general linear system. The first main result is that, for any matrix--vector algorithm which is allowed the use of randomization and can perform products with both a matrix and its transpose, $\Omega(\kappa \log(1/\varepsilon))$ matrix--vector products are necessary to solve a linear system with condition number $\kappa$ to accuracy $\varepsilon$, matching an upper bound for conjugate gradient on the normal equations. The second main result is that one-sided algorithms, which lack access to the transpose, must use $n$ matrix--vector products to solve an $n \times n$ linear system, even when the problem is perfectly conditioned. Both main results include explicit constants that match known upper bounds up to a factor of four. These results rigorously demonstrate the limitations of matrix--vector algorithms and confirm the optimality of widely used Krylov subspace algorithms.}
}


@InProceedings{pmlr-v336-derezinski26b,
  title     = {Last-Iterate Convergence of Randomized {Kaczmarz} and {SGD} with Greedy Step Size},
  author    = {Derezi{\'n}ski, Micha{\l} and Dong, Xiaoyu},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {1771--1813},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/derezinski26b/derezinski26b.pdf},
  url       = {https://proceedings.mlr.press/v336/derezinski26b.html},
  abstract  = {We study last-iterate convergence of SGD with greedy step size over smooth quadratics in the interpolation regime, a setting which captures the classical Randomized Kaczmarz algorithm as well as other popular iterative linear system solvers. For these methods, we show that the $t$-th iterate attains an $O(1/t^{3/4})$ convergence rate, addressing a question posed by Attia, Schliserman, Sherman, and Koren, who gave an $O(1/t^{1/2})$ guarantee for this setting. In the proof, we introduce the family of stochastic contraction processes, whose behavior can be described by the evolution of a certain deterministic eigenvalue equation, which we analyze via a careful discrete-to-continuous reduction.}
}


@InProceedings{pmlr-v336-diakonikolas26a,
  title     = {High-Dimensional {Gaussian} Mean Estimation under Realizable Contamination},
  author    = {Diakonikolas, Ilias and Kane, Daniel M. and Pittas, Thanasis},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {1814--1856},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/diakonikolas26a/diakonikolas26a.pdf},
  url       = {https://proceedings.mlr.press/v336/diakonikolas26a.html},
  abstract  = {We study mean estimation for a Gaussian distribution with identity covariance in $\mathbb{R}^d$ under a missing data scheme termed realizable $\epsilon$-contamination. In this model, an adversary chooses a function $r(x)$ taking values in $[0,\epsilon]$, and each sample $x$ is removed independently with probability $r(x)$. Recent work introduced this model as an intermediate-strength setting between Missing Completely At Random (MCAR), where missingness is independent of the data, and Missing Not At Random (MNAR), where missingness may depend arbitrarily on the sample values and can lead to non-identifiability. Prior work established information-theoretic upper and lower bounds for mean estimation in the realizable contamination model, but the proposed estimators require runtime exponential in the dimension, leaving open the possibility of computationally efficient algorithms in high dimensions. In this work, we establish an information--computation gap in the Statistical Query model and, as a consequence, for low-degree polynomial and polynomial-threshold-function algorithms. Specifically, we show that any such algorithm must either use substantially more samples than information-theoretically necessary or incur exponential runtime. We complement our lower bound with an algorithm whose sample--time tradeoff nearly matches our lower bound. Together, these results provide a qualitative characterization of the computational complexity of Gaussian mean estimation under realizable $\epsilon$-contamination.}
}


@InProceedings{pmlr-v336-diakonikolas26b,
  title     = {Linear Regression under Missing or Corrupted Coordinates},
  author    = {Diakonikolas, Ilias and Diakonikolas, Jelena and Kane, Daniel M. and Lee, Jasper C. H. and Pittas, Thanasis},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {1857--1901},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/diakonikolas26b/diakonikolas26b.pdf},
  url       = {https://proceedings.mlr.press/v336/diakonikolas26b.html},
  abstract  = {We study multivariate linear regression under Gaussian covariates in two settings where data may be erased or corrupted by an adversary subject to a coordinate-wise budget. In the incomplete data setting, an adversary may inspect the dataset and delete entries in up to an $\eta$-fraction of samples per coordinate, yielding a strong form of the Missing Not At Random model. In the corrupted data setting, the adversary instead replaces values arbitrarily, and the corruption locations are unknown to the learner. Despite substantial work on missing data, linear regression under such adversarial missingness remains poorly understood, even from an information-theoretic perspective. Unlike the clean setting, where the estimation error vanishes as the number of samples grows, the optimal error in these models remains bounded away from zero and depends on the problem parameters. Our main contribution is a characterization of this error, up to constant factors, over essentially the entire parameter range. Specifically, we establish novel information-theoretic lower bounds on the achievable error and show that they match the guarantees of computationally efficient algorithms. A key implication of our results is that the optimal error in the missing data setting matches that in the corruption setting, indicating that knowledge of the corruption locations provides no general advantage.}
}


@InProceedings{pmlr-v336-diakonikolas26c,
  title     = {A Quasi-Polynomial Time Mean Estimator Under Mean-Shift Contamination with Unknown Covariance},
  author    = {Diakonikolas, Ilias and Gao, Jingyi and Iakovidis, Giannis and Kane, Daniel M. and Liu, Sihan and Pittas, Thanasis},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {1902--1937},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/diakonikolas26c/diakonikolas26c.pdf},
  url       = {https://proceedings.mlr.press/v336/diakonikolas26c.html},
  abstract  = {We study the algorithmic problem of robust Gaussian mean estimation in the mean-shift contamination model with unknown covariance. Specifically, we are allowed to draw samples from a statistical mixture of an unknown target Gaussian $\mathcal{N}(\mu, \Sigma)$ (with weight at least $1-\alpha$), and arbitrary unknown mean-shifts of it, i.e., $\{\mathcal{N}(\mu_i, \Sigma)\}_i$, and the goal is to estimate $\mu$ up to any desired accuracy $\epsilon$ in $\ell_2$-norm. In the special case where $\Sigma$ is known to be the identity, prior work gave an algorithm with a near-optimal sample complexity of $\mathrm{poly}(d,2^{\epsilon^{-2}})$ and sample-polynomial time. In this work, we provide a quasi-polynomial time algorithm with sample complexity $2^{\mathrm{poly}(\log d/\epsilon)}$ in the more general unknown covariance case, markedly improving upon the only previously known estimator for this setting that incurs exponential runtime.}
}


@InProceedings{pmlr-v336-di-gregorio26a,
  title     = {Online Convex Optimization with Sublinear Noisy Probes},
  author    = {{Di Gregorio}, Simone and Gupta, Anupam and Leonardi, Stefano and Russo, Matteo},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {1938--1962},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/di-gregorio26a/di-gregorio26a.pdf},
  url       = {https://proceedings.mlr.press/v336/di-gregorio26a.html},
  abstract  = {We study Online Convex Optimization (OCO) over a convex set $K\subseteq \mathbb R^d$, where in each round $t$ the learner selects $x_t\in K$ and then observes a convex loss $f_t:K\to[0,1]$, with the goal of minimizing regret to the best fixed decision in hindsight. We introduce a unified probing model that generalizes two recent lines of work: sublinear \emph{best-expert} queries in the experts setting, and pairwise (comparison-based) feedback available every round in OCO. In our framework, the learner has a budget of $k\le T$ \emph{pairwise probes}; on a probed round it may query two points and learn which one has smaller loss. Our main result shows that even a \emph{sublinear and noisy} probe budget can provably improve worst-case regret in the full feedback OCO regime. With $k$ $\delta$-noisy pairwise probes, we obtain: $ {\textup{\textsc{Reg}}}_T \le O\left(\min\left\{\sqrt{dT\ln T},\; \frac{dT\ln T}{k|1-2\delta|}\right\}\right) $, which is tight (up to logarithmic factors in $T$) across $T$, $k$ and $\delta$. Specifically regarding the noise parameter $\delta \in [0,1]$, the regret guarantee smoothly degrades as the oracle response approaches a coin flip, i.e., $\delta$ is close to $\frac{1}{2}$. When applying the same techniques to a finite $K$ for the prediction with $d$ experts setting, the resulting rates are instead completely tight in all parameters, including $d$. Our analysis gives a streamlined treatment of pairwise probing in OCO by quantifying the benefit of probing via a variance reduction effect, combined with a second-order (variance-based) analysis of Continuous Exponential Weights.}
}


@InProceedings{pmlr-v336-ding26a,
  title     = {Minimax optimal differentially private synthetic data for smooth queries},
  author    = {Ding, Rundong and He, Yiyun and Zhu, Yizhe},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {1963--1964},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/ding26a/ding26a.pdf},
  url       = {https://proceedings.mlr.press/v336/ding26a.html},
  abstract  = {Differentially private synthetic data enables the sharing and analysis of sensitive datasets while providing rigorous privacy guarantees for individual contributors. A central challenge is to achieve strong utility guarantees for meaningful downstream analysis. Many existing methods ensure uniform accuracy over broad query classes, such as all Lipschitz functions, but this level of generality often leads to suboptimal rates for statistics of practical interest. Since many common data analysis queries exhibit smoothness beyond what worst-case Lipschitz bounds capture, we ask whether exploiting this additional structure can yield improved utility. We study the problem of generating $(\varepsilon,\delta)$-differentially private synthetic data from a dataset of size $n$ supported on the hypercube $[-1,1]^d$, with utility guarantees uniformly for all smooth queries having bounded derivatives up to order $k$. We propose a polynomial-time algorithm that achieves a minimax error rate of $O_{k,d}(n^{-\min\{1, \frac{k}{d}\}})$, up to a $\log(n)$ factor. This characterization uncovers a phase transition at $k=d$. Our results generalize the Chebyshev moment matching framework of (Musco et al., 2025; Wang et al., 2016) and strictly improve the error rates for $k$-smooth queries established in (Wang et al., 2016). Moreover, we establish the first minimax lower bound for the utility of $(\varepsilon,\delta)$-differentially private synthetic data with respect to $k$-smooth queries, extending the Wasserstein lower bound for $\varepsilon$-differential privacy in (Boedihardjo et al., 2024).}
}


@InProceedings{pmlr-v336-ding26b,
  title     = {Rate-optimal community detection near the {KS} threshold via node-robust algorithms},
  author    = {Ding, Jingqiu and Hua, Yiding and Lindberg, Kasper and Steurer, David and Storozhenko, Aleksandr},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {1965--2037},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/ding26b/ding26b.pdf},
  url       = {https://proceedings.mlr.press/v336/ding26b.html},
  abstract  = {We study community detection in the \emph{symmetric $k$-stochastic block model}, where $n$ nodes are evenly partitioned into $k$ clusters with intra- and inter-cluster connection probabilities $p$ and $q$, respectively. Our main result is a polynomial-time algorithm that achieves the optimal misclassification rate $\exp(-(1 \pm o(1)) C/k)$, where $C = (\sqrt{pn} - \sqrt{qn})^2$, whenever $C \geq K k^2 \log k$ for some universal constant $K$, matching the Kesten--Stigum ({KS}) threshold up to a $\log k$ factor. Notably, this rate holds even when an adversary corrupts an $\eta \leq \exp(-(1 \pm o(1)) C/k)$ fraction of the nodes. To the best of our knowledge, this optimal error rate was previously only attainable either via computationally inefficient procedures (Zhang and Zhou, 2015) or via polynomial-time algorithms that require strictly stronger assumptions such as $C \geq K k^3$ (Gao et al., 2017). In the node-robust setting, the best known algorithm requires the substantially stronger condition $C \geq K k^{102}$ (Liu and Moitra, 2022). Our results close this gap by providing the first polynomial-time algorithm that achieves the optimal error rate near the {KS} threshold in both settings. Our work has two key technical contributions: (1) we robustify majority voting via the Sum-of-Squares framework, (2) we develop a novel graph bisectioning algorithm via robust majority voting, which allows us to significantly improve the misclassification rate to $1/\mathrm{poly}(k)$ for the initial estimation near the {KS} threshold.}
}


@InProceedings{pmlr-v336-dmitriev26a,
  title     = {Efficient Sampling with Discrete Diffusion Models: Sharp and Adaptive Guarantees},
  author    = {Dmitriev, Daniil and Huang, Zhihan and Wei, Yuting},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {2038--2104},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/dmitriev26a/dmitriev26a.pdf},
  url       = {https://proceedings.mlr.press/v336/dmitriev26a.html},
  abstract  = {Diffusion models over discrete spaces have recently shown striking empirical success, yet their theoretical foundations remain incomplete. In this paper, we study the sampling efficiency of score-based discrete diffusion models under a continuous-time Markov chain (CTMC) formulation, with a focus on $\tau$-leaping-based samplers. We establish sharp convergence guarantees for attaining $\varepsilon$ accuracy in Kullback-Leibler (KL) divergence for both uniform and masking noising processes.  For uniform discrete diffusion, we show that $\tau$-leaping achieves an iteration complexity of order $\tilde{O}(d/\varepsilon)$, with $d$ the ambient dimension of the target distribution, eliminating linear dependence on the vocabulary size $S$ and improving existing bounds by a factor of $d$; moreover, we establish a matching algorithmic lower bound showing that linear dependence on the ambient dimension is unavoidable in general.  For masking discrete diffusion, we introduce a modified $\tau$-leaping sampler whose convergence rate is governed by an intrinsic information-theoretic quantity, termed the effective total correlation, which is bounded by $d \log S$ but can be sublinear or even constant for structured data. As a consequence, the sampler provably adapts to low-dimensional structure without prior knowledge or algorithmic modification, yielding sublinear convergence rates for various practical examples (such as hidden Markov models, image data, and random graphs). Our analysis requires no boundedness or smoothness assumptions on the score estimator beyond control of the score entropy loss.}
}


@InProceedings{pmlr-v336-doron-arad26a,
  title     = {Online Realizable Regression and Applications for {ReLU} Networks},
  author    = {Doron-Arad, Ilan and Mehalel, Idan and Mossel, Elchanan},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {2105--2106},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/doron-arad26a/doron-arad26a.pdf},
  url       = {https://proceedings.mlr.press/v336/doron-arad26a.html},
  abstract  = {Realizable online regression can behave very differently from online classification. Even without any margin or stochastic assumptions, realizability may enforce horizon-free (finite) cumulative loss under metric-like losses, even when the analogous classification problem has an infinite mistake bound. We study realizable online regression in the adversarial model under losses that satisfy an approximate triangle inequality (approximate pseudo-metrics). Recent work of Attias et al., 2023 shows that the minimax realizable cumulative loss is characterized by the scaled Littlestone/online dimension $\mathbb{D}_{\mathrm{onl}}$, but this quantity can be difficult to analyze. Our main contribution is a generic potential method that upper bounds $\mathbb{D}_{\mathrm{onl}}$ by a concrete Dudley-type entropy integral that depends only on covering numbers of the hypothesis class under the induced sup pseudo-metric. For a hypothesis class $\mathcal{H}$, we define an entropy potential $\Phi(\mathcal{H})=\int_{0}^{\operatorname{diam}(\mathcal{H})} \log N(\mathcal{H},\varepsilon) d\varepsilon$, where $N(\mathcal{H},\varepsilon)$ is the $\varepsilon$-covering number of $\mathcal{H}$ under the sup pseudo metric $\sup_x \ell(f(x),g(x))$, and show that for every $c$-approximate pseudo-metric loss it holds that $\mathbb{D}_{\mathrm{onl}}(\mathcal{H})\le O(c \cdot \Phi(\mathcal{H}))$. In particular, polynomial metric entropy implies $\Phi(\mathcal{H})<\infty$ and hence a horizon-free realizable cumulative-loss bound with transparent dependence on effective dimension. We illustrate the method on two families. For the class $\mathcal{H}_L$ of all $L$-Lipschitz functions on $[-1,1]^d$ under the loss $\ell_q(y,y')=|y-y'|^q$, we establish a sharp phase transition: if $q>d$ then $\mathbb{D}_{\mathrm{onl}}(\mathcal{H}_L)=\Theta_{d,q}(L^d)$, and the bound is achievable efficiently, whereas if $q\le d$ then $\mathbb{D}_{\mathrm{onl}}(\mathcal{H}_L)=\infty$. Complementing these metric-specific results, for any continuous loss with $\ell(y,y)=0$, the loss along realizable sequences in $\mathcal{H}_L$ satisfies $\ell(\hat y_t,y_t)\to 0$ as $t\to\infty$. As a second application, we study bounded-norm $k$-ReLU networks over $[-1,1]^d$ with squared loss and highlight a regression--classification separation: realizable online classification is impossible already for $k=2,d=1$ under $0/1$ loss, yet realizable regression admits finite total loss, including a $\widetilde O(k^2)$ cumulative-loss upper bound, a matching lower bound up to polylogarithmic factors, and an efficient $O(1)$ guarantee for a single ReLU ($k=1$) independent of the input dimension $d$. Assuming Gap-ETH, we also rule out efficient proper online learners that achieve realizable accumulated loss $\widetilde o(d)$ for any constant $k\ge 2$.}
}


@InProceedings{pmlr-v336-dughmi26a,
  title     = {Relatively Smart: A New Approach for Instance-Optimal Learning},
  author    = {Dughmi, Shaddin and Pour, Alireza F.},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {2107--2144},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/dughmi26a/dughmi26a.pdf},
  url       = {https://proceedings.mlr.press/v336/dughmi26a.html},
  abstract  = {We revisit the framework of \emph{Smart PAC learning}, which seeks supervised learners which compete with semi-supervised learners that are provided full knowledge of the \emph{marginal} distribution on unlabeled data. Prior work has shown that such marginal-by-marginal guarantees are possible for ``most'' marginals, with respect to an arbitrary fixed and known measure, but not more generally. We discover that this failure can be attributed to an ``indistinguishability'' phenomenon: There are marginals which cannot be statistically distinguished from other marginals that require different learning approaches. In such settings, semi-supervised learning cannot certify its guarantees from unlabeled data, rendering them arguably non-actionable. We propose \emph{relatively smart learning}, a new framework which demands that a supervised learner compete only with the best ``certifiable'' semi-supervised guarantee. We show that such modest relaxation suffices to bypass the impossibility results from prior work. In the distribution-free setting, we show that the One-Inclusion Graph learner is relatively smart up to squaring the sample complexity, and show that no supervised learning algorithm can do better. For distribution-family settings, we show that relatively smart learning can be impossible or can require idiosyncratic learning approaches, and its difficulty can be non-monotone in the inclusion order on distribution families.}
}


@InProceedings{pmlr-v336-dutta26a,
  title     = {The Median is Easier than it Looks: Approximation with a Constant-Depth, Linear-Width {ReLU} Network},
  author    = {Dutta, Abhigyan and Safran, Itay and Valiant, Paul},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {2145--2199},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/dutta26a/dutta26a.pdf},
  url       = {https://proceedings.mlr.press/v336/dutta26a.html},
  abstract  = {We study the approximation of the median of $d$ inputs using ReLU neural networks. We present depth-width tradeoffs under several settings, culminating in a constant-depth, linear-width construction that achieves exponentially small approximation error with respect to the uniform distribution over the unit hypercube. By further establishing a general reduction from the maximum to the median, our results break a barrier suggested by prior work on the maximum function, which indicated that linear width should require depth growing at least as $\log \log d$ to achieve comparable accuracy. Our construction relies on a multi-stage procedure that iteratively eliminates non-central elements while preserving a candidate set around the median. We overcome obstacles that do not arise for the maximum to yield approximation results that are strictly stronger than those previously known for the maximum itself.}
}


@InProceedings{pmlr-v336-el-cheairi26a,
  title     = {Theoretical Compression Bounds for Wide Multilayer Perceptrons},
  author    = {El Cheairi, Houssam and Gamarnik, David and Mazumder, Rahul},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {2200--2258},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/el-cheairi26a/el-cheairi26a.pdf},
  url       = {https://proceedings.mlr.press/v336/el-cheairi26a.html},
  abstract  = {Pruning and quantization techniques have been broadly successful in reducing the number of parameters needed for large neural networks, yet theoretical justification for their empirical success falls short. We consider a randomized greedy compression algorithm for pruning and quantization post-training and use it to rigorously show the existence of pruned/quantized subnetworks of multilayer perceptrons (MLPs) with competitive performance. We further extend our results to structured pruning of MLPs and convolutional neural networks (CNNs), thus providing a unified analysis of pruning in wide networks. Our results are free of data assumptions, and showcase a tradeoff between compressibility and  network width. The algorithm we consider bears some similarities with Optimal Brain Damage (OBD) and can be viewed as a post-training randomized version of it. The theoretical results we derive bridge the gap between theory and application for pruning/quantization, and provide a justification for the empirical success of compression in wide multilayer perceptrons.}
}


@InProceedings{pmlr-v336-eldowa26a,
  title     = {Leveraging Similarities in Multi-Armed Bandits},
  author    = {Eldowa, Khaled and Rahier, Thibaud and Cablant, Augustin and Mertikopoulos, Panayotis and Gaillard, Pierre},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {2259--2306},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/eldowa26a/eldowa26a.pdf},
  url       = {https://proceedings.mlr.press/v336/eldowa26a.html},
  abstract  = {In many online learning and bandit problems, the actions we consider possess inherent similarities---for instance because they share latent traits, tags, or hierarchical structure. We study online learning with a similarity-structured action set, encoded by a rooted tree whose leaves are the actions and whose levels quantify how closely two actions are related. The loss sequence is assumed tree-compatible: losses of similar actions are constrained to be close. We establish an impossibility result showing that usual one-point bandit feedback cannot, in general, leverage range or tree-induced similarity, even under very strong similarity constraints. We then provide a unified set of algorithms which adapt to a wide range of richer feedback models, from semi-bandit feedback down to multi-point bandit protocols, including the minimal two-point feedback setting. We show these algorithms exhibit best-of-both-worlds guarantees and provably exploit action similarities by replacing the number of actions $K$ by a similarity-aware effective number of actions $K_{\mathrm{eff}}$ in the regret bounds. As an application, we show that under two-point feedback, it is possible to achieve $\sqrt{T}$ regret in Lipschitz bandits when $d \leq 2$.}
}


@InProceedings{pmlr-v336-erez26a,
  title     = {The Sample Complexity of Multiclass and Sparse Contextual Bandits},
  author    = {Erez, Liad and Chen, Fan and Cohen, Alon and Koren, Tomer and Mansour, Yishay and Moran, Shay and Rakhlin, Alexander},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {2307--2338},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/erez26a/erez26a.pdf},
  url       = {https://proceedings.mlr.press/v336/erez26a.html},
  abstract  = {We study contextual bandits in the stochastic i.i.d. setting, where a learner observes contexts drawn from an unknown distribution, selects actions from a finite set $\mathcal{A}$, and aims to identify an approximately optimal policy from a given class based on bandit feedback. Motivated by the important special case of bandit multiclass classification with zero-one rewards, we focus on the \emph{$s$-sparse} setting in which, for every context, the underlying reward vector has $L_1$-norm at most $s \ll |\mathcal{A}|$. Our main result is the design of algorithms that, with probability at least $1-\delta$, output an $\varepsilon$-optimal policy compared to policy class $\Pi$ using \begin{align*} \widetilde{O} \left( \left( \frac{s}{\varepsilon^2} + \frac{|\mathcal{A}|}{\varepsilon}\right) \log \frac{|\Pi|}{\delta}\right) \end{align*} samples.  We further extend this bound to general Natarajan classes and complement it with a matching lower bound (up to logarithmic factors), thereby closing a substantial gap left by prior work (Erez et al., 2024a,b; Erez and Koren, 2025), which incurred an additional $\Theta(|\mathcal{A}|^9)$ dependence. We obtain these results via two complementary approaches. First, we analyze contextual bandits through the lens of contextual decision making with structured observations, designing an exploration-by-optimization algorithm whose sample complexity is governed by the \emph{decision-estimation coefficient} (DEC; Foster et al., 2021, 2022). We show that, with $s$-sparse rewards, the induced model class admits a sharp DEC bound that scales with $s$ and directly yields the optimal rate. Since this approach is largely information-theoretic and involves solving complex min-max optimization problems, we also develop a second, more specialized algorithmic method based on a low-variance exploration technique. 
This approach leads to concrete, tractable algorithms and naturally extends to contextual combinatorial semi-bandits, leading to improved sample complexity guarantees for bandit multiclass list classification.}
}


@InProceedings{pmlr-v336-essakine26a,
  title     = {Tight Sample Complexity Bounds for Entropic Best Policy Identification},
  author    = {Essakine, Amer and Vernade, Claire},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {2339--2398},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/essakine26a/essakine26a.pdf},
  url       = {https://proceedings.mlr.press/v336/essakine26a.html},
  abstract  = {We study best-policy identification for finite-horizon risk-sensitive reinforcement learning under the entropic risk measure. Recent work established a constant gap in the exponential horizon dependence between lower and upper bounds on the number of samples required to identify an approximately optimal policy. Precisely, known lower bounds scale in $\Omega(\exp(|\beta|H))$ where $H$ is the horizon of the MDP, while the state-of-the-art upper bound achieves at best $O(\exp(2|\beta|H))$ (Mortensen and Talebi, 2025) using a generative model. We show that this extra exponential factor can be traced to overly loose concentration control for exponential utilities. To close this open gap, we revisit the analysis of this problem through a forward-model-based algorithm building on KL-based exploration bonuses that we adapt to the entropic criterion. The improvement we get is due to two main novel technical innovations. We leverage the smoothness properties of the exponential utility to derive sharper concentration bounds, and we propose a new stopping rule that exploits further this tightness to obtain a sample complexity that matches the lower bound.}
}


@InProceedings{pmlr-v336-farina26a,
  title     = {Defensive Generation},
  author    = {Farina, Gabriele and Perdomo, Juan Carlos},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {2399--2427},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/farina26a/farina26a.pdf},
  url       = {https://proceedings.mlr.press/v336/farina26a.html},
  abstract  = {We study the problem of efficiently producing, in an online fashion, generative models of scalar, multiclass, and  vector-valued outcomes that cannot be falsified on the basis of the observed data and a pre-specified collection of computational tests.  Our contributions are twofold. First, we expand on connections between online high-dimensional multicalibration with respect to an RKHS and recent advances in expected variational inequality problems, enabling efficient algorithms for the former. We then apply this algorithmic machinery to the problem of outcome indistinguishability. Our procedure, Defensive Generation, is the first to efficiently produce online outcome indistinguishable generative models of non-Bernoulli outcomes that are unfalsifiable with respect to infinite classes of tests, including those that examine higher-order moments of the generated distributions. Furthermore, our method runs in near-linear time in the number of samples and achieves the optimal, vanishing $1/\sqrt T$ rate for generation error.}
}


@InProceedings{pmlr-v336-filmus26a,
  title     = {Optimal Reconstruction from Linear Queries},
  author    = {Filmus, Yuval and Moran, Shay and Nesterova, Elizaveta},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {2428--2476},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/filmus26a/filmus26a.pdf},
  url       = {https://proceedings.mlr.press/v336/filmus26a.html},
  abstract  = {We study the problem of reconstructing an unknown point in $\mathbb{R}^d$ from approximate linear queries. This setting arises naturally in applications ranging from low-dimensional remote sensing and signal recovery to high-dimensional data analysis and privacy-sensitive inference. Our main goal is to characterize the optimal \emph{reconstruction error} as a function of the number of queries $T$, the ambient dimension $d$, and the noise parameter $\delta$. We first analyze the limit $T \to \infty$ and show that the optimal reconstruction error converges to the explicit value $\sqrt{2d/(d+1)}\,\delta$, which plays a role analogous to the Bayes optimal error in supervised learning. When the dimension is fixed, we show that the excess error above this limit decays \emph{doubly exponentially} fast as $T \to \infty$, a rate that is significantly faster than those typically encountered in learning curves. When the dimension grows, we show that a number of queries on the order of $\exp(d)$ is necessary and sufficient to achieve vanishing excess error. Finally, we introduce and analyze an improper variant of the reconstruction problem. From a technical perspective, our main contribution is a generalization of Jung's theorem (1901). The classical theorem bounds the maximum possible radius of a set of diameter 1 and characterizes extremal bodies. Our generalization provides a robust variant that characterizes near-extremal bodies and is proved via geometric and dynamical arguments exploiting symmetry and Lie group actions.}
}


@InProceedings{pmlr-v336-flammarion26a,
  title     = {Space-Efficient Language Generation in the Limit},
  author    = {Flammarion, Nicolas and Pabbaraju, Chirag and Papazov, Hristo and Stouras, Miltiadis and Svensson, Ola},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {2477--2502},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/flammarion26a/flammarion26a.pdf},
  url       = {https://proceedings.mlr.press/v336/flammarion26a.html},
  abstract  = {We initiate a resource-aware theory of \textit{language generation in the limit} under the minimal constraint of space efficiency. In our framework, a learner observes an adversarial positive stream from a target language $K$ and must eventually output a hallucination-free hypothesis language $L \subseteq K$ while omitting at most $\Delta$ strings of $K$. We focus on $\mathcal{C}_{s,k}$, the collection of languages recognized by DFAs with at most $s$ states over an alphabet of size $k$, as the natural hypothesis class for memory-bounded learners. In the exponential-space regime, we prove that a learner can exactly identify the target $K$. Under a stricter memory budget, we characterize the strongest possible generation guarantees. In particular, we present a streaming algorithm using $\mathrm{poly}(s,k)$ space that converges to a hypothesis with generation gap $\Delta = O(k^{2s-2})$. Moreover, the learned hypothesis captures every string in $K$ of length at least $2s-1$. We complement this result with a near-matching lower bound through a reduction from a standard communication complexity problem. Specifically, achieving generation gap $\Delta \le k^{(1-\varepsilon)s}$ requires $k^{\Omega(\varepsilon s)}$ memory. Together, these results reveal a sharp transition between polynomial-space generation and exponential-space exact identification.}
}


@InProceedings{pmlr-v336-frongillo26a,
  title     = {Toward Simultaneously Optimal Regret in U-Calibration},
  author    = {Frongillo, Rafael and Luo, Haipeng and Mehta, Nishant A. and Schneider, Jon},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {2503--2534},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/frongillo26a/frongillo26a.pdf},
  url       = {https://proceedings.mlr.press/v336/frongillo26a.html},
  abstract  = {U-calibration studies online forecasting algorithms whose predictions can be consumed by any unknown downstream agent, guaranteeing sublinear regret simultaneously for all proper loss functions. Existing U-calibration algorithms achieve worst-case optimal $O(\sqrt{T})$ regret for every bounded proper loss, but they fail to adapt to easier losses: as we show, even for smooth losses such as squared loss, they incur $\Omega(\sqrt{T})$ regret instead of the optimal $O(\log T)$ regret. In this work, we show that this limitation is not inherent. Specifically, we design a single forecast algorithm that simultaneously achieves $\tilde O(\sqrt{T})$ regret for every bounded proper loss and $O(\log T)$ regret for every bounded smooth proper loss. More generally, our algorithm also attains logarithmic regret for losses that are smooth relative to the log-barrier, which include several non-Lipschitz examples. Our approach is based on a novel variant of Follow-the-Perturbed-Leader (FTPL) in which perturbations are applied directly in the prediction space using \emph{self-concordant noise}. The resulting analysis also departs substantially from prior FTPL analyses due to the complex nature of this noise and may be of independent interest.}
}


@InProceedings{pmlr-v336-gaitonde26a,
  title     = {Learning {Ising} Models from Evolutions (Extended Abstract)},
  author    = {Gaitonde, Jason and Moitra, Ankur and Mossel, Elchanan},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {2535--2536},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/gaitonde26a/gaitonde26a.pdf},
  url       = {https://proceedings.mlr.press/v336/gaitonde26a.html},
  abstract  = {In this work, we revisit the problem of learning the structure and parameters of an Ising model from dynamics. While the problem of learning from i.i.d. samples has been intensively studied in several communities, recent work has considered learning from temporally correlated samples arising from some stochastic process. However, all prior work studied this problem in the \emph{synthetic} observation model that assumes knowledge of internal steps of the standard algorithm for generating samples, which goes far beyond what we should expect to naturally observe from the system evolution in important physics and network applications. Extending these algorithmic guarantees to more realistic observation models has been an important direction highlighted in recent work (Bresler, Gamarnik, Shah IEEE Trans. Inf. Theory 2018, Gaitonde, Moitra, Mossel STOC 2025). We give the first efficient algorithm for learning from the natural continuous-time observation model where we only observe the actual evolution of the state of the system, as opposed to usually unobservable details like failed update attempts of sites. For Ising models with maximum degree $d$, our algorithm first recovers the graph structure in $\mathsf{poly}(d)\cdot n^2\log n$ time, which qualitatively matches the state-of-the-art even in the cleaner i.i.d. setting, and then estimates the parameters in additional $\widetilde{O}(2^d\cdot  n)$ time. Our analysis is based on a new family of cycle statistics, which crucially remains measurable for \emph{any} stochastic process, and in fact succeeds more generally for a broad family of reversible, single-site Markov chains that includes both the Glauber dynamics and the Metropolis chain.}
}


@InProceedings{pmlr-v336-gamarnik26a,
  title     = {Optimal Hardness of Online Algorithms for Large Common Induced Subgraphs},
  author    = {Gamarnik, David and R{\'a}cz, Mikl{\'o}s Z. and Schoenbach, Gabe},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {2537--2560},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/gamarnik26a/gamarnik26a.pdf},
  url       = {https://proceedings.mlr.press/v336/gamarnik26a.html},
  abstract  = {We study the problem of efficiently finding large common induced subgraphs of two independent Erd{\H{o}}s--R{\'e}nyi random graphs $G_1, G_2 \sim \mathbb{G}(n,1/2)$. Recently, Chatterjee and Diaconis (2023) showed that the largest common induced subgraph of $G_1$ and $G_2$ has size $(4-o(1))\log_2 n$ with high probability. We first show that a simple greedy online algorithm finds a common induced subgraph of $G_1$ and $G_2$ of size $(2-o(1)) \log_2 n$ with high probability. Our main result shows that no online algorithm can find a common induced subgraph of $G_1$ and $G_2$ of size at least $(2+\varepsilon) \log_2 n$ with probability bounded away from $0$ as $n \to \infty$. Together, these results provide evidence that this problem exhibits a computation-to-optimization gap. To prove the impossibility result, we show that the solution space of the problem exhibits a version of the (multi) overlap gap property (OGP), and utilize an interpolation argument recently developed by Gamarnik, K{\i}z{\i}lda{\u{g}}, and Warnke (2025) that connects OGP and online algorithms.}
}


@InProceedings{pmlr-v336-genans26a,
  title     = {Fast and Large-Scale Unbalanced Optimal Transport via its Semi-Dual and Adaptive Gradient Methods},
  author    = {Genans, Ferdinand},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {2561--2600},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/genans26a/genans26a.pdf},
  url       = {https://proceedings.mlr.press/v336/genans26a.html},
  abstract  = {Unbalanced Optimal Transport (UOT) has emerged as a robust relaxation of standard Optimal Transport, particularly effective for handling outliers and mass variations. However, scalable algorithms for UOT, specifically those based on Stochastic Gradient Descent (SGD), remain largely underexplored. In this work, we address this gap by analyzing the semi-dual formulation of Entropic UOT and demonstrating its suitability for adaptive gradient methods. While the semi-dual is a standard tool for large-scale balanced OT, its geometry in the unbalanced setting appears ill-conditioned under standard analysis. Specifically, worst-case bounds on the marginal penalties using $\chi^2$ divergence suggest a condition number scaling with $n/\varepsilon$, implying poor scalability. In contrast, we show that the local condition number actually scales as $\mathcal{O}(1/\varepsilon)$, effectively removing the ill-conditioned dependence on $n$. Exploiting this property, we prove that SGD methods adapt to this local curvature, achieving a convergence rate of $\mathcal{O}(n/\varepsilon T)$ in the stochastic and online regimes, making it suitable for large-scale and semi-discrete applications. Finally, for the full batch discrete setting, we derive a nearly tight upper bound on local smoothness depending solely on the gradient. Using it to adapt step sizes, we propose a modified Adaptive Nesterov Accelerated Gradient (ANAG) method  on the semi-dual functional and prove that it achieves a local complexity of $\mathcal{O}(n^2\sqrt{1/\varepsilon}\ln(1/\delta))$.}
}


@InProceedings{pmlr-v336-ghazi26a,
  title     = {Nearly Linear-Time User-Level DP-SCO with Optimal Rates},
  author    = {Ghazi, Badih and Kumar, Ravi and Liu, Daogao and Manurangsi, Pasin},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {2601--2636},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/ghazi26a/ghazi26a.pdf},
  url       = {https://proceedings.mlr.press/v336/ghazi26a.html},
  abstract  = {User-level differentially private (DP) stochastic convex optimization has garnered significant attention due to the paramount importance of safeguarding user privacy in modern large-scale ML applications.  Current methods, such as those based on DP stochastic gradient descent (SGD), often struggle with high gradient computation complexity or suboptimal utility due to the need to privatize every intermediate iterate.  In this work, we introduce a new nearly linear-time algorithm that resolves this trade-off and achieves the optimal excess rates via an adaptive outlier removal framework. Our key innovation is integrating the sparse vector technique directly into the SGD loop, supported by a novel robust divergence analysis. This approach naturally bounds the sensitivity of gradient estimates without requiring privatization of all intermediate steps.  Specifically, our mechanism computes a local concentration score to adaptively filter out users whose updates diverge from the population geometry. Crucially, this  approach preserves the unbiasedness of the gradient estimate in well-concentrated regimes while strictly bounding sensitivity in the presence of outliers. We also explore extensions to the $\ell_\infty$ setting demonstrating the generality of our analysis.}
}


@InProceedings{pmlr-v336-ghazi26b,
  title     = {Fixed-Parameter Tractability of Private Synthetic Data Generation},
  author    = {Ghazi, Badih and Guzm{\'a}n, Crist{\'o}bal and Kamath, Pritish and Knop, Alexander and Kumar, Ravi and Manurangsi, Pasin},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {2637--2637},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/ghazi26b/ghazi26b.pdf},
  url       = {https://proceedings.mlr.press/v336/ghazi26b.html},
  abstract  = {We study the problem of generating synthetic data under differential privacy. We establish fixed-parameter tractability (FPT) for this problem where the parameter is the treewidth of the query family's incidence graph. Our algorithms attain  optimal error rates across all regimes and are realized by two different approaches: the first is based on linear programming (LP) and the FPT of the separation problem for the LP dual; the second is based on a subsampled private multiplicative weights method, where we obtain FPT for sampling from Gibbs distributions. Both approaches are unified by a dynamic programming framework over a tree decomposition.}
}


@InProceedings{pmlr-v336-gheissari26a,
  title     = {Universality of high-dimensional scaling limits of stochastic gradient descent (extended abstract)},
  author    = {Gheissari, Reza and Jagannath, Aukosh},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {2638--2638},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/gheissari26a/gheissari26a.pdf},
  url       = {https://proceedings.mlr.press/v336/gheissari26a.html},
  abstract  = {We consider statistical tasks in high dimensions whose loss depends on the data only through its projection into a fixed-dimensional subspace spanned by the parameter vectors and certain ground truth vectors. This includes classifying mixture distributions with cross-entropy loss with one and two-layer networks, and learning single and multi-index models with one and two-layer networks.  When the data is drawn from an isotropic Gaussian mixture distribution, it is known that the evolution of a finite family of summary statistics under stochastic gradient descent  converges to an autonomous ordinary differential equation (ODE), as the dimension and sample size go to $\infty$ and the step size goes to $0$ commensurately. Our main result is that these ODE limits are universal in that this limit is the same whenever the data is drawn from mixtures of arbitrary product distributions whose first two moments match the corresponding Gaussian distribution, provided the initialization and ground truth vectors are coordinate-delocalized.  We complement this by proving two corresponding non-universality results. We provide a simple example where the ODE limits are non-universal if the initialization is coordinate aligned.  We also show that the stochastic differential equation limits arising as fluctuations of the summary statistics around their ODE's fixed points are not universal.}
}


@InProceedings{pmlr-v336-giapitzakis26a,
  title     = {On the Statistical Query Complexity of Learning Semiautomata: a Random Walk Approach},
  author    = {Giapitzakis, George and Fountoulakis, Kimon and Nichani, Eshaan and Lee, Jason D.},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {2639--2678},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/giapitzakis26a/giapitzakis26a.pdf},
  url       = {https://proceedings.mlr.press/v336/giapitzakis26a.html},
  abstract  = {Semiautomata form a rich class of sequence-processing algorithms with applications in natural language processing, robotics, computational biology, and data mining. We establish the first Statistical Query hardness result for semiautomata under the uniform distribution over input words and initial states.  We show that Statistical Query hardness can be established when both the alphabet size and input length are polynomial in the number of states. Unlike the case of deterministic finite automata, where hardness typically arises through the hardness of the language they recognize (e.g., parity), our result is derived solely from the internal state-transition structure of semiautomata. Our analysis reduces the task of distinguishing the final states of two semiautomata to studying the behavior of a random walk on the group $S_{N} \times S_{N}$. By applying tools from Fourier analysis and the representation theory of the symmetric group, we obtain tight spectral gap bounds, demonstrating that after a polynomial number of steps in the number of states, distinct semiautomata become nearly uncorrelated, yielding the desired hardness result.}
}


@InProceedings{pmlr-v336-gibbs26a,
  title     = {Sample-Efficient Omniprediction for Proper Losses},
  author    = {Gibbs, Isaac and Tibshirani, Ryan J.},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {2679--2719},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/gibbs26a/gibbs26a.pdf},
  url       = {https://proceedings.mlr.press/v336/gibbs26a.html},
  abstract  = {We study the problem of constructing probabilistic predictions that lead to effective decisions when employed by downstream users to inform actions. Given a single decision maker, developing an optimal predictor is equivalent to minimizing a proper loss function corresponding to the negative utility of that individual. For multiple decision makers, our problem can be viewed as a variant of omniprediction in which the goal is to develop a single predictor which simultaneously minimizes multiple losses. Existing algorithms for achieving omniprediction broadly fall into two categories: first, boosting methods, which optimize auxiliary targets such as multicalibration and obtain omniprediction as a corollary, and second, adversarial two-player game based approaches, which estimate and respond to the worst-case loss in an online fashion. We give lower bounds which demonstrate that multicalibration is a strictly more difficult problem than omniprediction and hence the first approach must incur suboptimal sample complexity. For the latter approach, we discuss how these ideas can be used to obtain a sample-efficient algorithm for our problem through an online-to-batch conversion. This conversion has the downside of returning a complex, randomized predictor. We therefore improve on this method by designing a more direct nonrandomized algorithm that exploits structural elements of the set of proper losses.}
}


@InProceedings{pmlr-v336-gobel26a,
  title     = {Robust Algorithms for Finding Cliques in Random Intersection Graphs via Sum-of-Squares},
  author    = {G{\"o}bel, Andreas and Ruff, Janosch and Schiller, Leon},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {2720--2802},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/gobel26a/gobel26a.pdf},
  url       = {https://proceedings.mlr.press/v336/gobel26a.html},
  abstract  = {We study efficient algorithms for recovering cliques in dense random intersection graphs (RIGs). In this model, $d = n^{\Omega(1)}$ cliques of size approximately $k$ are randomly planted by choosing the vertices to participate in each clique independently with probability $\delta$. While there has been extensive work on recovering one, or multiple disjointly planted cliques in random graphs, the natural extension of this question to recovering overlapping cliques has been, surprisingly, largely unexplored. Moreover, because every vertex can be part of polynomially many cliques, this task is significantly harder than in case of disjointly planted cliques (as recently studied by Kothari, Vempala, Wein and Xu [COLT'23]). In this work we obtain the first efficient algorithms for recovering the community structure of RIGs both from the perspective of exact and approximate recovery. Our algorithms are further robust to noise, monotone adversaries, a certain, optimal number of edge corruptions, and work whenever $k \gg \sqrt{n \log(n)}$. Our techniques follow the proofs-to-algorithms framework utilizing the sum-of-squares hierarchy. An essential component are certificates for the absence of large cliques outside of the ground-truth. Instead of spectral certificates, a central ingredient are modified versions of the biclique certificates, recently used for semi-random planted clique by Buhai, Kothari and Steurer [STOC'23]. To turn these certificates into robust and efficient algorithms that do not produce ``false positives'', we rely on an extremely sharp concentration property for pseudo-distributions which might be of independent interest. Our techniques further extend to the related task of efficient \emph{refutation}, and lead to algorithms that can not only recover the ground-truth, but also certify the optimality of this clustering.}
}


@InProceedings{pmlr-v336-gobel26b,
  title     = {Information-Theoretic Thresholds for Bipartite Latent-Space Graphs Under Noisy Observations},
  author    = {G{\"o}bel, Andreas and Pappik, Marcus and Schiller, Leon},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {2803--2803},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/gobel26b/gobel26b.pdf},
  url       = {https://proceedings.mlr.press/v336/gobel26b.html},
  abstract  = {We study information-theoretic phase transitions for the detectability of latent geometry in bipartite random geometric graphs (RGGs) with Gaussian $d$-dimensional latent vectors, while only a subset of edges carries latent information, determined by a random mask with i.i.d. $\mathsf{Bern}(q)$ entries. For any fixed edge density $p \in (0,1)$, we determine essentially tight thresholds for this problem as a function of $d$ and $q$. Our results show that the detection problem is substantially easier if the mask is known up-front, compared to the case where the mask is hidden. Our analysis is built upon a novel Fourier-analytic framework for bounding signed subgraph counts in Gaussian random geometric graphs that exploits cancellations. The resulting bounds are applicable to much larger sub-graphs, which enables tight information-theoretic bounds for all noise levels instead, while the bounds considered in previous works only lead to lower bounds from the lens of low-degree polynomials. As a consequence, we identify the optimal information-theoretic thresholds and rule out computational--statistical gaps. Our bounds further improve upon the bounds on Fourier coefficients of random geometric graphs recently given by Bangachev and Bresler [STOC'24] in the dense bipartite case. We believe the techniques to extend to sparser and non-bipartite settings as well, at least if the considered sub-graphs are sufficiently small, and that they might help resolve open questions in related sparse or noisy detection problems.}
}


@InProceedings{pmlr-v336-goel26a,
  title     = {Testing Noise Assumptions of Learning Algorithms},
  author    = {Goel, Surbhi and Klivans, Adam and Stavropoulos, Konstantinos and Vasilyan, Arsen},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {2804--2853},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/goel26a/goel26a.pdf},
  url       = {https://proceedings.mlr.press/v336/goel26a.html},
  abstract  = {We pose the following question in computational learning theory: \textit{can we efficiently test whether a training set satisfies the assumptions of a given noise model?} This question has remained unaddressed despite decades of research on learning in the presence of noise. In this work, we show that this task is tractable and present the first efficient algorithm to test various noise assumptions on the training data.  To model this question, we extend the recently proposed testable learning framework of Rubinfeld and Vasilyan (2023) and require a learner to run an associated test that satisfies the following two conditions: (1) whenever the test accepts, the learner outputs a classifier along with a \textit{certificate of optimality}, and (2) the test must pass for any dataset drawn according to a specified modeling assumption on both the marginal distribution and the noise model.  We then consider the problem of learning halfspaces over Gaussian marginals with Massart noise (where each label can be flipped with probability less than $1/2$ depending on the input features), and give a fully-polynomial time testable learning algorithm.  We also show a separation between the classical setting of learning in the presence of structured noise and testable learning. In fact, for the simple case of random classification noise (where each label is flipped with fixed probability $\eta = 1/2$), we show that testable learning requires super-polynomial time while classical learning is trivial.}
}


@InProceedings{pmlr-v336-gokhale26a,
  title = 	 {Compact Geometric Representations of Hierarchies},
  author =       {Gokhale, Prashant and Indyk, Piotr and Liu, Yuhao and Silwal, Sandeep and Wang, Tony and Xu, Haike},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {2854--2877},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/gokhale26a/gokhale26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/gokhale26a.html},
  abstract = 	 {Computing geometric representations of data is a cornerstone of modern machine learning, typically achieved by training dual encoders which map queries and documents into a shared embedding space. Recent work of You et al. [NeurIPS ’25] has extended this approach to hierarchical retrieval, where relevance is determined by the ancestor-descendant relationships in a Directed Acyclic Graph (DAG). While previous work has shown that valid embeddings exist when the number of descendants is small, these bounds degrade significantly for deep hierarchies, requiring dimensions as large as the total number of nodes. In this paper, we investigate compact reachability embeddings for more general graph classes and provide theoretical guarantees for representing hierarchies using embeddings whose dimension depends on structural graph parameters. We prove that for any directed tree, there exists a reachability embedding in constant dimension 3, independent of the tree’s size or depth. We generalize this result to graphs characterized by treewidth $t$, constructing embeddings of dimension $O(t \log n)$, where $n$ is the number of nodes. Complementing these upper bounds, we provide matching or near-matching lower bounds, showing that dimension $\Omega(n)$ is necessary for general DAGs and $\Omega(t / \log(n/t))$ is required for graphs of treewidth $t$. We also obtain upper and lower bounds parameterized by the number of cross-edges in the DAG. We additionally show that our embeddings can be constructed on real-world datasets, and that they give much smaller dimensions in high recall regimes compared to prior embeddings with theoretical guarantees.}
}


@InProceedings{pmlr-v336-graur26a,
  title = 	 {Randomization for Faster Exact Optimization of Discounted {Markov} Decision Processes},
  author =       {Graur, Andrei and Sidford, Aaron and Tu, Ta-Wei},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {2878--2900},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/graur26a/graur26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/graur26a.html},
  abstract = 	 {We provide faster running times for exactly solving discounted Markov Decision Processes (DMDPs) in strongly polynomial time. We obtain our results by efficiently reducing computing optimal values and policies in DMDPs to the easier tasks of policy evaluation and computing approximately optimal values. We provide both a straightforward deterministic reduction and a more efficient randomized variant that, together with advances in approximately solving DMDPs, yield our results.}
}


@InProceedings{pmlr-v336-gribling26a,
  title = 	 {Computing {Lewis} weights to high precision using local relative smoothness},
  author =       {Gribling, Sander and Sidford, Aaron and Zhang, Chenyi},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {2901--2939},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/gribling26a/gribling26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/gribling26a.html},
  abstract = 	 {We provide algorithms that compute $\epsilon$-estimates of the $\ell_p$-Lewis weights of a matrix $A \in \mathbb{R}^{m \times n}$ for $p \geq 4$ using $O(p^2 \log(m/\epsilon))$ rounds of leverage score computation, where $\ell_p$-Lewis weights and leverage scores are both standard measures of row importance. This improves upon the state-of-the-art round complexity of $O(p^3 \log(m/\epsilon))$ due to Fazel, Lee, Padmanabha, and Sidford (2022). We obtain our results by carefully applying a local variant of relatively smooth gradient descent to primal and dual forms of the $\ell_p$-Lewis weight optimization problem and providing tools to convert between different notions of approximate $\ell_p$-Lewis weights. This work subsumes the note “On computing approximate Lewis weights” by Apers, Gribling, and Sidford.}
}


@InProceedings{pmlr-v336-gu26a,
  title = 	 {A Unified Lower Bound on the Noisy Query Complexity of {Boolean} Functions},
  author =       {Gu, Yuzhou and Li, Xin and Xu, Yinzhan},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {2940--2962},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/gu26a/gu26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/gu26a.html},
  abstract = 	 {We study the query complexity of Boolean functions $f: \{0, 1\}^n \rightarrow \{0, 1\}$ in the noisy query model introduced by Feige, Raghavan, Peleg and Upfal [SICOMP 1994]. In this model, an algorithm can adaptively query the bits of an input vector, but each query result is independently flipped with constant probability $p \in (0, 1/2)$; repeated queries are allowed. The noisy query complexity $\mathsf{N}_p(f)$ of a function $f$ is defined as the minimum expected number of queries needed to compute $f(x)$ with error probability at most $1/3$, for the worst case input $x$. We prove a general lower bound on $\mathsf{N}_p(f)$ based on degree statistics of certain subgraphs of the Boolean hypercube. This is the first general lower bound beyond those implied by the simple observation that $\mathsf{N}_p(f)$ is lower bounded by the randomized query complexity. We show that this recovers (up to a constant factor) most previously known lower bounds on the noisy query complexity of Boolean functions, providing a unified framework for understanding these results and simplifying the proofs in several cases. Furthermore, this resolves in the affirmative an open problem of Gu, Li and Xu [COLT 2025] that $\mathsf{N}_p(f) = \Omega(\mathsf{I}(f) \log \mathsf{I}(f))$, where $\mathsf{I}(f)$ denotes the total influence of $f$. We also apply our general lower bound to obtain tight bounds on the noisy query complexity for several new functions.}
}


@InProceedings{pmlr-v336-gu26b,
  title = 	 {Functional Stochastic Localization},
  author =       {Gu, Anming and Shi, Bobby and Tian, Kevin},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {2963--3004},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/gu26b/gu26b.pdf},
  url = 	 {https://proceedings.mlr.press/v336/gu26b.html},
  abstract = 	 {Eldan’s stochastic localization is a probabilistic construction that has proved instrumental to modern breakthroughs in high-dimensional geometry and the design of sampling algorithms. Motivated by sampling under non-Euclidean geometries and the mirror descent algorithm in optimization, we develop a functional generalization of Eldan’s process that replaces Gaussian regularization with regularization by any positive integer multiple of a log-Laplace transform. We further give a mixing time bound on the Markov chain induced by our localization process, which holds if our target distribution satisfies a functional Poincaré inequality. Finally, we apply our framework to differentially private convex optimization in $\ell_p$ norms for $p \in [1, 2)$, where we improve state-of-the-art query complexities in a zeroth-order model.}
}


@InProceedings{pmlr-v336-ha26a,
  title = 	 {High Probability Convergence Guarantees of Stochastic Gradient Descent Ascent in Structured Nonconvex Min-Max Games},
  author =       {Ha, Junsoo},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {3005--3075},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/ha26a/ha26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/ha26a.html},
  abstract = 	 {Nonconvex min-max optimization is a cornerstone of modern machine learning. However, its theoretical foundations remain largely limited to in-expectation convergence guarantees, which fail to capture the failure probability of individual training trajectories, particularly in the presence of heavy-tailed noise. In this work, we bridge this gap by establishing the first high-probability convergence guarantees of stochastic gradient descent-ascent (SGDA) in structured nonconvex games, specifically nonconvex-PL (NC-PL) and nonconvex-concave (NC-C) problems. We derive high-probability convergence rates of SGDA matching the best known in-expectation rates in the subgaussian noise regime. Then, we investigate the heavy-tailed noise regime and prove that SGDA cannot guarantee high-probability convergence in general. Finally, we analyze a gradient-clipped variant, SGDA\textsubscript{Clip}, and show that it recovers high-probability convergence guarantees in both NC-PL and NC-C games. Our analysis is based on novel progress quantities that simultaneously bound stationarity and primal-dual martingale terms, which yield self-bounding concentration bounds.}
}


@InProceedings{pmlr-v336-han26a,
  title = 	 {An Empirical {Bayes} Perspective on Heteroskedastic Mean Estimation},
  author =       {Han, Yanjun and Shetty, Abhishek and Shkrob, Jacob},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {3076--3108},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/han26a/han26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/han26a.html},
  abstract = 	 {Towards understanding the fundamental limits of estimation from data of varied quality, we study the problem of estimating a mean parameter from heteroskedastic Gaussian observations where the variances are unknown and may vary across observations. While, with known variances, a simple linear estimator attains the smallest mean squared error, estimation without this knowledge is challenging due to the large number of nuisance parameters. We propose a simple and principled approach based on empirical Bayes: model the observations as if they were i.i.d. from a normal scale mixture and compute the profile maximum likelihood estimator (MLE) for the mean, treating the nonparametric mixing distribution as nuisance. Our result shows that this estimator achieves near-optimal error bounds across various heteroskedastic models in the literature. In particular, for the subset-of-signals problem where an unknown subset of observations has small variance, our estimator adaptively achieves the minimax rate for all signal sizes, including the sharp phase transition, without any tuning parameters.  One of our key technical steps is a sharper metric entropy bound for normal scale mixtures, obtained via generalized moment matching and Chebyshev approximation. This approach yields an improved polylogarithmic, rather than polynomial, dependence on problem parameters, which could be of independent interest.}
}


@InProceedings{pmlr-v336-hanashiro26a,
  title = 	 {Is {Multi-Distribution Learning} as Easy as {PAC Learning}: Sharp Rates with Bounded Label Noise},
  author =       {Hanashiro, Rafael and Shetty, Abhishek and Jaillet, Patrick},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {3109--3142},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/hanashiro26a/hanashiro26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/hanashiro26a.html},
  abstract = 	 {Towards understanding the statistical complexity of learning from heterogeneous sources, we study the problem of multi-distribution learning. Given $k$ data sources, the goal is to output a classifier for each source by exploiting shared structure to reduce sample complexity. We focus on the bounded label noise setting to determine whether the fast $1/\epsilon$ rates achievable in single-task learning extend to this regime with minimal dependence on $k$. Surprisingly, we show that this is not the case. We demonstrate that learning across $k$ distributions inherently incurs slow rates scaling with $k/\epsilon^2$, even under constant noise levels, unless each distribution is learned separately. A key technical contribution is a structured hypothesis-testing framework that captures the statistical cost of certifying near-optimality under bounded noise—a cost we show is unavoidable in the multi-distribution setting. Finally, we prove that when competing with the stronger benchmark of each distribution’s optimal Bayes error, the sample complexity incurs a multiplicative penalty in $k$. This establishes a statistical separation between random classification noise and Massart noise, highlighting a fundamental barrier unique to learning from multiple sources.}
}


@InProceedings{pmlr-v336-harbuzova26a,
  title = 	 {Price of metric universality in vector quantization is at most 0.11 bit},
  author =       {Harbuzova, Alina and Ordentlich, Or and Polyanskiy, Yury},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {3143--3183},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/harbuzova26a/harbuzova26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/harbuzova26a.html},
  abstract = 	 {Fast computation of a matrix product $W^\top X$ is a workhorse of modern LLMs. To make their deployment more efficient, a popular approach is that of using a low-precision approximation $\widehat W$ in place of true $W$ (“weight-only quantization”). Information theory demonstrates that an optimal algorithm for reducing precision of $W$ depends on the (second order) statistics of $X$ and requires a careful alignment of vector quantization codebook with PCA directions of $X$ (a process known as “waterfilling allocation”). Dependence of the codebook on statistics of $X$, however, is highly impractical. This paper proves that there exists a universal codebook that is simultaneously near-optimal for all possible statistics of $X$, in the sense of being at least as good as an $X$-adapted waterfilling codebook with rate reduced by 0.11 bit per dimension in the case when $W$ is Gaussian. Such a universal codebook would be an ideal candidate for the low-precision storage format, a topic of active modern research, but alas the existence proof is non-constructive. Equivalently, our result shows existence of a net in $\mathbb{R}^n$ that is a nearly-optimal covering of a sphere simultaneously with respect to all Hilbert norms.}
}


@InProceedings{pmlr-v336-harding26a,
  title = 	 {Learning from Biased and Costly Data Sources: Minimax-optimal Data Collection under a Budget (extended abstract)},
  author =       {Harding, Michael O. and Singh, Vikas and Kandasamy, Kirthevasan},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {3184--3184},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/harding26a/harding26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/harding26a.html},
  abstract = 	 {Data collection is a critical component of modern statistical and machine learning pipelines, particularly when data must be gathered from multiple heterogeneous sources to study a target population of interest. In many use cases, such as medical studies or political polling, different sources incur different sampling costs. Observations often have associated group identities—for example, health markers, demographics, or political affiliations—and the relative composition of these groups may differ substantially, both among the source populations and between sources and target population. Moreover, while group proportions are often known at the source and population levels, individual group membership may only be revealed after data collection. In this work, we study multi-source data collection under a fixed budget, focusing on the estimation of population means and group-conditional means. We show that naive data collection strategies (e.g. attempting to “match” the target distribution) or relying on standard estimators (e.g. sample mean) can be highly suboptimal. Instead, we develop a sampling plan which maximizes the \emph{effective sample size}—the total sample size divided by $D_{\chi^2}(q\mid\mid\overline{p}) + 1$, where $q$ is the target distribution, $\overline{p}$ is the aggregated source distribution, and $D_{\chi^2}$ is the $\chi^2$-divergence. We pair this sampling plan with a classical post-stratified estimator, which is able to leverage large but systematically biased datasets, and upper bound its risk. We provide lower bounds with exactly matching leading terms, proving that our approach achieves the budgeted minimax-optimal risk up to additive lower order terms. Our techniques also extend to prediction problems when minimizing the excess risk. In this setting, we pair the effective-sample-size-maximizing sampling plan with a weighted empirical risk minimizer and upper bound its risk.
A key contribution to this end is the development of a general information-theoretic lower bound framework for prediction problems under possibly differing source and target distributions, which may be of independent interest outside of this work. We apply this framework to the case of binary classification to establish a lower bound which matches the upper bound of our proposed approach up to a $\sqrt{K/q_{\min}}$-factor, where $K$ is the number of groups and $q_{\min}$ is the smallest group identity probability under the target distribution $q\,$. Our framework enables us to respect the information geometry of the problem via dependence on $D_{\chi^2}(q\mid\mid\overline{p})$ that matches the upper bound, which was not possible with existing techniques.}
}


@InProceedings{pmlr-v336-hartline26a,
  title = 	 {A Perfectly Truthful Calibration Measure},
  author =       {Hartline, Jason and Hu, Lunjia and Wu, Yifan},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {3185--3223},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/hartline26a/hartline26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/hartline26a.html},
  abstract = 	 {Calibration requires that predictions are conditionally unbiased and, therefore, reliably interpretable as probabilities. A calibration measure quantifies how far a predictor is from perfect calibration. A calibration measure is truthful if it is minimized in expectation when a predictor outputs the ground-truth probabilities. Predicting the true probabilities guarantees perfect calibration, but in reality, when calibration is evaluated on a random sample, all known calibration measures incentivize predictors to lie in order to appear more calibrated. This lack of truthfulness motivated approximately truthful calibration measures in the sequential prediction setting, but no perfectly truthful calibration measure was known to exist even in the more basic batch setting. We design a simple, perfectly and strictly truthful, sound, and complete calibration measure in the batch setting: Averaged Two-Bin Calibration Error (ATB). ATB is quadratically related to two existing calibration measures: the smooth calibration error and the lower distance to calibration. The simplicity of our definition of ATB makes it efficient and straightforward to compute, allowing us to give the first linear-time calibration testing algorithm. We also introduce a general recipe for constructing truthful measures based on the variance additivity of independent random variables, which proves the truthfulness of ATB as a special case and allows us to construct other truthful calibration measures, such as quantile-binned $\ell_2$ Expected Calibration Error (ECE).}
}


@InProceedings{pmlr-v336-holzman26a,
  title = 	 {Uniform Laws of Large Numbers in Product Spaces},
  author =       {Holzman, Ron and Moran, Shay and Shlimovich, Alexander},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {3224--3279},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/holzman26a/holzman26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/holzman26a.html},
  abstract = 	 {Uniform laws of large numbers form a cornerstone of Vapnik–Chervonenkis theory, where they are characterized by the finiteness of the VC dimension. In this work, we study uniform convergence phenomena in \emph{cartesian product spaces}, under assumptions on the underlying distribution that are compatible with the product structure. Specifically, we assume that the distribution is absolutely continuous with respect to the product of its marginals, a condition that captures many natural settings, including product distributions, sparse mixtures of product distributions, distributions with low mutual information, and more. We show that, under this assumption, a uniform law of large numbers holds for a family of events if and only if the \emph{linear VC dimension} of the family is finite. The linear VC dimension is defined as the maximum size of a shattered set that lies on an \emph{axis-parallel line}, namely, a set of vectors that agree on all but at most one coordinate. This dimension is always at most the classical VC dimension, yet it can be arbitrarily smaller. For instance, the family of convex sets in $\mathbb{R}^d$ has linear VC dimension $2$, while its VC dimension is infinite already for $d \ge 2$. Our proofs rely on an estimator that departs substantially from the standard empirical mean estimator and exhibits a more intricate structure. We show that such deviations from the standard empirical mean estimator are unavoidable in this setting.  Throughout the paper, we propose several open questions, with a particular focus on quantitative sample complexity bounds.}
}


@InProceedings{pmlr-v336-hou26a,
  title = 	 {Recovery thresholds for hidden weighted sparse graphs (extended abstract)},
  author =       {Hou, Zhe and Liu, Jingcheng},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {3280--3284},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/hou26a/hou26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/hou26a.html},
  abstract = 	 {Recovering structural information from noisy high-dimensional data is a fundamental task in statistical inference. We investigate the information-theoretic recovery thresholds for a graph hidden in a randomly weighted complete graph. Specifically, an unknown graph $H^* \in \mathcal H_n$ is chosen uniformly at random, and hidden in a complete graph of $n$ vertices through edge weights: for an edge $e \in H^*$, its weight is distributed independently according to $\mathcal{P}_n$; otherwise the edge weight is distributed independently according to $\mathcal{Q}_n$. The goal is to recover the hidden edge set from these noisy observations. Our primary focus is on \emph{almost exact recovery}, where all but a vanishing fraction of the edges of $H^*$ must be identified. By choosing $\mathcal{P} = \mathrm{Bern}(1), \mathcal{Q} = \mathrm{Bern}(q)$, this model captures the well-studied planted Erdős-Rényi recovery setting, and other weighted formulations such as Gaussian and Exponential distributions. Assuming a local Lipschitzness of the Rényi divergence between distributions $\mathcal{P}_n$ and $\mathcal{Q}_n$, and a mild density condition for the graphs $\mathcal H_n$, we give a unified characterization of the information-theoretic limit for recovering almost all of $H^*$ (also known as almost exact recovery). Our characterization uses the KL divergence $D_{\mathrm{KL}}(\mathcal{P}_n\|\mathcal{Q}_n)$ as the signal-to-noise metric. We show that there is a critical threshold $D_c$ governed by the logarithm of the first moment threshold of $\mathcal H_n$ in the Erdős-Rényi random graph model $G(n,p)$. For any constant $\eta$, if $D_{\mathrm{KL}} \ge (1+\eta)D_c$, the maximum likelihood estimator (MLE) achieves almost exact recovery; conversely if $D_{\mathrm{KL}} \le (1-\eta)D_c$, almost exact recovery is information-theoretically impossible.
Our MLE analysis is based on an extension of (Jian Ding, Yihong Wu, Jiaming Xu, Dana Yang, 2020); while our lower bound involves deriving a distance-based Fano-type inequality based on Sibson’s $\alpha$-mutual information, and it also extends to the task of partial recovery, in which only a constant $\lambda$-fraction of the hidden graph is required to be recovered. As a result, if $D_{\mathrm{KL}} \le (\lambda-\eta)D_c$, we also proved that the recovery of any $\lambda$-fraction must fail with high probability. Another interesting phenomenon exhibited by some of these models is the “All-or-Nothing” (AoN), where one can either recover almost all of the hidden graph or essentially nothing at all. For several natural distributional families, including certain Gaussian, Bernoulli and Exponential regimes, we establish an All-or-Nothing (AoN) threshold phenomenon at the exponential scale. For Gaussians, we obtained an AoN by lifting the sharp almost exact recovery threshold through the I-MMSE relation; for the others, we combine a second-moment argument inspired by planted subgraph recovery (Elchanan Mossel, Jonathan Niles-Weed, Youngtak Sohn, Nike Sun, Ilias Zadik, 2023) with the second-order Rényi divergence. We also provide distributional examples where AoN is not universal at this level of generality, and our partial recovery lower bound is already the best possible.}
}


@InProceedings{pmlr-v336-hu26a,
  title = 	 {Near-optimal Swap Regret Minimization for Convex Losses},
  author =       {Hu, Lunjia and Schneider, Jon and Wu, Yifan},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {3285--3313},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/hu26a/hu26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/hu26a.html},
  abstract = 	 {We give a randomized online algorithm that guarantees near-optimal $\widetilde{O}(\sqrt{T})$ expected swap regret against any sequence of $T$ adaptively chosen Lipschitz convex losses on the unit interval. This improves the previous best bound of $\widetilde{O}(T^{2/3})$ and answers an open question from prior work. In addition, our algorithm is efficient: it runs in polynomial time. A key technical idea we develop to obtain this result is to discretize the unit interval into bins at multiple scales of granularity and simultaneously use all scales to make randomized predictions, which we call multi-scale binning and may be of independent interest. A direct corollary of our result is an efficient online algorithm for minimizing the calibration error for general elicitable properties. This result does not require the Lipschitzness assumption of the identification function needed in prior work, making it applicable to median calibration, for which we achieve the first $\widetilde{O}(\sqrt{T})$ calibration error guarantee.}
}


@InProceedings{pmlr-v336-hu26b,
  title = 	 {Efficient Swap Multicalibration of Elicitable Properties},
  author =       {Hu, Lunjia and Luo, Haipeng and Senapati, Spandan and Sharan, Vatsal},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {3314--3348},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/hu26b/hu26b.pdf},
  url = 	 {https://proceedings.mlr.press/v336/hu26b.html},
  abstract = 	 {Multicalibration (Hébert-Johnson et al., 2018) is an algorithmic fairness perspective which demands that the predictions of a predictor are correct conditional on themselves and membership in a collection of potentially overlapping subgroups of a population. The work of Noarov and Roth (2023) established a surprising connection between multicalibration for an arbitrary property $\Gamma$ (e.g., mean or median) and property elicitation: a property $\Gamma$ can be multicalibrated if and only if it is elicitable, where elicitability is the notion that the true property value of a distribution can be obtained by solving a regression problem over the distribution. In the adversarial (online) setting, Noarov and Roth (2023) proposed an inefficient algorithm that achieves $\tilde{\mathcal{O}}(\sqrt{T})$ $\ell_2$-multicalibration error for a hypothesis class of group membership functions and an elicitable property $\Gamma$, after $T$ rounds of interaction between a forecaster and adversary. In this paper, we generalize multicalibration for an elicitable property $\Gamma$ from group membership functions to arbitrary bounded hypothesis classes and introduce a stronger notion—swap multicalibration, following Gopalan et al. (2023b). Subsequently, we propose an oracle-efficient algorithm which when given access to an online agnostic learner, achieves $\tilde{\mathcal{O}}(T^{\frac{1}{r+1}})$ $\ell_r$-swap multicalibration error with high probability ($r \ge 2$) for a hypothesis class with bounded sequential Rademacher complexity and an elicitable property $\Gamma$. For the special case of $r = 2$, this implies an oracle-efficient algorithm that achieves $\tilde{\mathcal{O}}(T^{\frac{1}{3}})$ $\ell_2$-swap multicalibration error, which significantly improves on the previously established bounds for the problem (Noarov and Roth, 2023; Ghuge et al., 2025; Luo et al., 2025a), and completely resolves an open question raised in Garg et al. 
(2024) on the possibility of an oracle-efficient algorithm that achieves $\tilde{\mathcal{O}}(\sqrt{T})$ $\ell_2$-mean multicalibration error by answering it in a strongly affirmative sense.}
}


@InProceedings{pmlr-v336-huang26a,
  title = 	 {Wasserstein Policy Learning for Distributional Outcomes},
  author =       {Huang, Yiyan and Leung, Cheuk Hang and Wu, Qi and Zhang, Zhiheng},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {3349--3350},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/huang26a/huang26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/huang26a.html},
  abstract = 	 {Offline policy learning has received growing attention in causal inference. The primary objective is to learn a policy (individualized treatment rule) as a mapping from covariates to treatment that maximizes the empirical welfare defined as the mean of scalar-valued potential outcomes. In this paper, we study offline policy learning with distribution-valued outcomes, where each potential outcome is a probability measure on $\mathbb{R}$ and the reward is defined through a utility functional applied to the Wasserstein barycenter of induced outcome distributions. We establish statistical guarantees for the policy learning framework based on both Inverse Probability Weighting (IPW) and Doubly Robust (DR) estimators. By handling the challenging uniform deviation over the product of the combinatorial policy class and the infinite-dimensional quantile domain, we prove that the finite-sample regret has leading dependence $\widetilde{\mathcal{O}}(\sqrt{\mathrm{N\text{-}dim}(\Pi)/N})$. In the one-dimensional Wasserstein setting and under the stated regularity conditions, the leading regret rate is still governed by the policy-class complexity. Moreover, we provide a minimax lower bound establishing the sharpness of the leading dependence on $N$ and $\mathrm{N\text{-}dim}(\Pi)$.}
}


@InProceedings{pmlr-v336-huang26b,
  title = 	 {Reconstructing {Riemannian} Metrics From Random Geometric Graphs},
  author =       {Huang, Han and Jiradilok, Pakawut and Mossel, Elchanan},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {3351--3440},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/huang26b/huang26b.pdf},
  url = 	 {https://proceedings.mlr.press/v336/huang26b.html},
  abstract = 	 {Random geometric graphs are random graph models defined on metric measure spaces. A random geometric graph is generated by first sampling points from a metric space and then connecting each pair of sampled points independently with a probability that depends on their distance. In recent work of Huang, Jiradilok, and Mossel, the authors study the problem of reconstructing an embedded manifold from a random geometric graph sampled from the manifold, where edge probabilities depend monotonically on the Euclidean distance between the embedded points. They show that, under mild regularity assumptions on the manifold, the sampling measure, and the connection probability function, it is possible to recover the pairwise Euclidean distances of the embedded sampled points up to a vanishing error as the number of vertices grows. In this work we consider a similar and arguably more natural problem where the metric is the Riemannian metric on the manifold. Again points are sampled from the manifold and a random graph is generated where the connection probability is monotone in the Riemannian distance. Perhaps surprisingly we obtain stronger results in this setup. Unlike the previous work that only considered dense graphs, we provide reconstruction algorithms from sparse graphs with average degree $n^{1/2}\mathrm{polylog}(n)$, where $n$ denotes the number of vertices. Our algorithm is also a more efficient algorithm for distance reconstruction with improved error bounds. The local distance-estimation part runs in $O(n^2\mathrm{polylog}(n))$ time, and the final all-pairs shortest-path step gives an overall running time of $O(n^3)$. Our distance error also nearly matches the volumetric lower bounds for distance estimation.}
}


@InProceedings{pmlr-v336-huang26c,
  title = 	 {Almost Linear Convergence under Minimal Score Assumptions: Quantized Transition Diffusion},
  author =       {Huang, Xunpeng and Lin, Yingyu and Kuang, Lijing and Dong, Hanze and Zou, Difan and Ma, Yian and Zhang, Tong},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {3441--3487},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/huang26c/huang26c.pdf},
  url = 	 {https://proceedings.mlr.press/v336/huang26c.html},
  abstract = 	 {Continuous diffusion models have demonstrated remarkable generative performance across diverse domains but are often constrained by the computational cost of simulating reverse Ornstein–Uhlenbeck processes via SDE/ODE solvers.  Existing theoretical results typically establish query complexities that scale polynomially with both the dimension $d$ and the error tolerance $\epsilon$ (e.g., $\tilde{\mathcal{O}}(d/\epsilon)$).  This mirrors the limitations of unadjusted Langevin algorithm, where standard first-order score solvers lack access to zeroth-order density information, precluding natural error-correction mechanisms and thus preventing the fast $\ln(1/\epsilon)$ convergence attainable by Metropolis-adjusted methods. In this paper, we develop an improved generative modeling method by introducing Quantized Transition Diffusion (QTD), a framework that reformulates continuous diffusion into a discrete generation problem through spatial quantization and the parameterization of zeroth-order information (e.g., density ratios).  To sample from this discrete target, we propose a truncated uniformization algorithm that simulates the underlying continuous-time Markov chain of the discrete diffusion process without discretization error, while eliminating the restrictive bounded-score assumption required by prior uniformization-based approaches.  Consequently, QTD attains $\epsilon$-accuracy in total variation distance with a query complexity of $\mathcal{O}(d \ln^2(d/\epsilon))$, yielding a notable improvement in $\epsilon$-dependence compared to existing continuous diffusion samplers.  Crucially, our analysis capitalizes on a novel proof technique based on the infinitesimal chain rule of KL divergence, providing a fresh perspective on unifying continuous and discrete diffusion paradigms.}
}


@InProceedings{pmlr-v336-huleihel26a,
  title = 	 {Recovery of Planted Subgraphs},
  author =       {Huleihel, Wasim},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {3488--3592},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/huleihel26a/huleihel26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/huleihel26a.html},
  abstract = 	 {Understanding the fundamental limits of recovering planted subgraphs in random graphs is a central challenge in high-dimensional statistics and theoretical computer science. While existing work has largely focused on special subgraph families such as cliques, bicliques, or dense blocks, the exact recovery of a general planted subgraph in Erdős–Rényi random graphs remains poorly understood. In this paper, we study the exact recovery of an arbitrary planted subgraph $\Gamma = \Gamma_n$ embedded in a dense Erdős–Rényi random graph $\mathcal{G}(n,q_n)$, where edges within $\Gamma$ are present independently with probability $p_n > q_n$. Our main results identify sharp conditions under which exact recovery is possible with high probability, and we establish matching lower bounds showing the necessity of these conditions. The resulting statistical threshold is characterized by a new graph-theoretic quantity, which we term the \emph{minimal maximum subgraph density}. This quantity is defined as the maximum subgraph density of the smallest induced balanced subgraph of $\Gamma$. We then turn to the problem of recovery under polynomial-time constraints. We propose a computationally efficient recovery algorithm that applies to arbitrary planted subgraphs and analyze its performance in terms of certain spectral properties of the adjacency matrix. In addition, we derive computational lower bounds for recovery using the low-degree polynomial framework, establishing regimes where recovery is statistically possible but computationally hard. Finally, we consider several extensions of our setting, including recovery in semi-random models and weaker notions of recovery.}
}


@InProceedings{pmlr-v336-hu26c,
  title = 	 {Simultaneous {Blackwell} Approachability and Applications to Multiclass Omniprediction},
  author =       {Hu, Lunjia and Tian, Kevin and Yang, Chutong},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {3593--3634},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/hu26c/hu26c.pdf},
  url = 	 {https://proceedings.mlr.press/v336/hu26c.html},
  abstract = 	 {Omniprediction is a learning problem that requires suboptimality bounds for each of a family of losses $\mathcal{L}$ against a family of comparator predictors $\mathcal{C}$. We initiate the study of omniprediction in a multiclass setting, where the comparator family $\mathcal{C}$ may be infinite. Our main result is an extension of the recent binary omniprediction algorithm of Okoroafor et al. (2025) to the multiclass setting, with sample complexity (in statistical settings) or regret horizon (in online settings) $\approx \varepsilon^{-(k+1)}$, for $\varepsilon$-omniprediction in a $k$-class prediction problem. En route to proving this result, we design a framework of potential broader interest for solving Blackwell approachability problems where multiple sets must simultaneously be approached via coupled actions.}
}


@InProceedings{pmlr-v336-hutton26a,
  title = 	 {On Randomized Algorithms in Online Strategic Classification},
  author =       {Hutton, Chase and Melrod, Adam and Shao, Han},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {3635--3665},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/hutton26a/hutton26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/hutton26a.html},
  abstract = 	 {Online strategic classification studies settings in which agents strategically modify their features to obtain favorable predictions. For example, given a classifier that determines loan approval based on credit scores, applicants may open or close credit cards and bank accounts to obtain a positive prediction. The learning goal is to achieve low mistake or regret bounds despite such strategic behavior. While randomized algorithms have the potential to offer advantages to the learner in strategic settings, they have been largely underexplored. In the realizable setting, no lower bound is known for randomized algorithms, and existing lower bound constructions for deterministic learners can be circumvented by randomization. In the agnostic setting, the best known regret upper bound is $O(T^{3/4}\log^{1/4}(T|\mathcal{H}|))$ due to Ahmadi et al. (2023), which is far from the standard online learning rate of $O(\sqrt{T\log|\mathcal{H}|})$. In this work, we provide refined upper and lower bounds for online strategic classification in both the realizable and agnostic settings; our bounds depend on the Littlestone dimension $\mathrm{Ldim}(\mathcal{H})$ of the hypothesis class $\mathcal{H}$ and the maximum degree $\Delta$ of the manipulation graph. In the realizable setting, using a new construction, we prove a lower bound that, for $T > \mathrm{Ldim}(\mathcal{H}) \Delta^2$, extends the existing deterministic lower bound of $\Omega(\mathrm{Ldim}(\mathcal{H}) \Delta)$ to all algorithms. This is the first lower bound that applies to randomized algorithms, resolving an open question of Ahmadi et al. (2023). We also give the first randomized algorithm that improves on the known deterministic upper bound of $O(\mathrm{Ldim}(\mathcal{H})\cdot\Delta\log\Delta)$, achieving $O(\sqrt{T\cdot\mathrm{Ldim}(\mathcal{H})\log\Delta})$ expected mistakes, in the regime $T < \mathrm{Ldim}(\mathcal{H})\,\Delta^2\log\Delta$. 
In the agnostic setting, we give an improper randomized algorithm with expected regret $O(\sqrt{T\log|\mathcal{H}|})$ against adaptive adversaries, improving upon the previous $O(T^{3/4}\log^{1/4}(T|\mathcal{H}|))$ bound and matching the standard online learning rate. We also prove that this optimal rate requires improper learning.}
}


@InProceedings{pmlr-v336-ito26a,
  title = 	 {Adversarial Learning in Games with Bandit Feedback: {Logarithmic} Pure-Strategy Maximin Regret},
  author =       {Ito, Shinji and Luo, Haipeng and Maiti, Arnab and Tsuchiya, Taira and Wu, Yue},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {3666--3692},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/ito26a/ito26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/ito26a.html},
  abstract = 	 {Learning to play zero-sum games is a fundamental problem in game theory and machine learning. While significant progress has been made in minimizing external regret in the self-play settings or with full-information feedback, real-world applications often force learners to play against unknown, arbitrary opponents and restrict learners to bandit feedback where only the reward of the realized action is observable. In such challenging settings, it is well-known that $\Omega(\sqrt T)$ external regret is unavoidable (where $T$ is the number of rounds). To overcome this barrier, we investigate adversarial learning in zero-sum games under bandit feedback, aiming to minimize the deficit against the maximin pure strategy — a metric we term Pure-Strategy Maximin Regret. We analyze this problem under two bandit feedback models: \emph{uninformed} (only the realized reward is revealed) and \emph{informed} (both the reward and the opponent’s action are revealed). For uninformed bandit learning of normal-form games, we show that the Tsallis-INF algorithm achieves $\mathcal{O}(c \log T)$ instance-dependent regret with a game-dependent parameter $c$. Crucially, we prove an information-theoretic lower bound showing that the dependence on $c$ is necessary. To overcome this hardness, we turn to the informed setting and introduce Maximin-UCB, which obtains another regret bound of the form $\mathcal{O}(c' \log T)$ for a different game-dependent parameter $c'$ that could potentially be much smaller than $c$. Finally, we generalize both results to bilinear games over an arbitrary, large action set, proposing Tsallis-FTRL-SPM and Maximin-LinUCB for the uninformed and informed settings respectively and establishing similar game-dependent logarithmic regret bounds.}
}


@InProceedings{pmlr-v336-iverson26a,
  title = 	 {On the Importance of Randomization in Discriminative Feature Feedback},
  author =       {Iverson, Valentio and Lechner, Tosca and Sabato, Sivan},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {3693--3715},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/iverson26a/iverson26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/iverson26a.html},
  abstract = 	 {Discriminative Feature Feedback (DFF) (Dasgupta et al., 2018) is an interactive learning protocol in which a learner attempts to predict labels based on online feedback about previous correct labels, as well as discriminative features. Recent work (Bar Oz et al., 2025) studied DFF learning with general teacher classes and defined a dimension that characterizes the optimal mistake bound for deterministic algorithms in the realizable setting. In this work, we show that in sharp contrast to Online Learning, in DFF there can be an unbounded ratio between the optimal mistake bound of deterministic algorithms and that of randomized algorithms, even in the realizable setting. We further show that in this case, also non-realizable learning can have a mistake bound that does not depend on the dimension at all. This result relies on a new algorithmic technique that allows introducing new candidate hypotheses incrementally and could be of independent interest. We further show that in DFF, there can be a significant difference between the obtainable mistake bounds against an oblivious adversary and against an adaptive adversary, again in contrast to Online Learning. Our work shows that once richer feedback than labels is allowed, the landscape of randomized versus deterministic algorithms becomes significantly more involved, and raises new questions on characterizing the optimal mistake bound under differing randomization regimes. }
}


@InProceedings{pmlr-v336-janz26a,
  title = 	 {Sharp analysis of linear ensemble sampling},
  author =       {Janz, David and Akhavan, Arya and Szepesv{\'a}ri, Csaba},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {3716--3750},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/janz26a/janz26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/janz26a.html},
  abstract = 	 {We analyse linear ensemble sampling (ES) with standard Gaussian perturbations in stochastic linear bandits. We show that for ensemble size $m=\Theta(d\log n)$, ES attains $\tilde O(d^{3/2}\sqrt n)$ high-probability regret, closing the gap to the Thompson sampling benchmark while keeping computation comparable. The proof brings a new perspective on randomized exploration in linear bandits by reducing the analysis to a time-uniform exceedance problem for $m$ independent Brownian motions. This continuous-time lens appears particularly natural here: it yields an exact representation of the relevant discrete-time processes, and we do not know another route to a sharp ES bound.}
}


@InProceedings{pmlr-v336-jia26a,
  title = 	 {Low-Degree Method Fails to Predict Robust Subspace Recovery},
  author =       {Jia, He and Vijayaraghavan, Aravindan},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {3751--3781},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/jia26a/jia26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/jia26a.html},
  abstract = 	 {The low-degree polynomial framework has been highly successful in predicting computational versus statistical gaps for high-dimensional problems in average-case analysis and machine learning. This success has led to the low-degree conjecture, which posits that this method captures the power and limitations of efficient algorithms for a wide class of high-dimensional statistical problems. We identify a natural and basic hypothesis testing problem in $\mathbb{R}^n$ which is polynomial time solvable, but for which the low-degree polynomial method fails to predict its computational tractability even up to degree $k=n^{\Omega(1)}$. Moreover, the low-degree moments match exactly up to degree $k=O(\sqrt{\log n/\log\log n})$. Our problem is a special case of the well-studied robust subspace recovery problem. The lower bounds suggest that there is no polynomial time algorithm for this problem. In contrast, we give a simple and robust polynomial time algorithm that solves the problem (and noisy variants of it), leveraging anti-concentration properties of the distribution. Our results suggest that the low-degree method and low-degree moments fail to capture algorithms based on anti-concentration, challenging their universality as a predictor of computational barriers.}
}


@InProceedings{pmlr-v336-jiang26a,
  title = 	 {Adaptive Matrix Online Learning through Smoothing with Guarantees for Nonsmooth Nonconvex Optimization},
  author =       {Jiang, Ruichen and Mhammedi, Zakaria and Mohri, Mehryar and Mokhtari, Aryan},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {3782--3824},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/jiang26a/jiang26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/jiang26a.html},
  abstract = 	 {We study online linear optimization with matrix variables constrained by the operator norm, a setting where the geometry renders designing data-dependent and efficient adaptive algorithms challenging. The best-known adaptive regret bounds are achieved by Shampoo-like methods, but they require solving a costly quadratic projection subproblem. To address this, we extend the gradient-based prediction scheme to adaptive matrix online learning and cast algorithm design as constructing a family of smoothed potentials for the nuclear norm. We define a notion of admissibility for such smoothings and prove any admissible smoothing yields a regret bound matching the best-known guarantees of one-sided Shampoo. We instantiate this framework with two efficient methods that avoid quadratic projections. The first is an adaptive Follow-the-Perturbed-Leader (FTPL) method using Gaussian stochastic smoothing. The second is Follow-the-Augmented-Matrix-Leader (FAML), which uses a deterministic hyperbolic smoothing in an augmented matrix space. By analyzing the admissibility of these smoothings, we show both methods admit closed-form updates and match one-sided Shampoo’s regret up to a constant factor, while significantly reducing computational cost. Lastly, using the online-to-nonconvex conversion, we derive two matrix-based optimizers, Pion (from FTPL) and Leon (from FAML). We prove convergence guarantees for these methods in nonsmooth nonconvex settings, a guarantee that the popular Muon optimizer lacks.}
}


@InProceedings{pmlr-v336-jin26a,
  title = 	 {Avoiding exp({$k^*$}) Scaling for {Thompson} Sampling in Combinatorial Semi-Bandits: From Multiple Seeds to a Single Seed},
  author =       {Jin, Tianyuan and Zhao, Heyang and Tan, Vincent Y. F. and Gu, Quanquan},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {3825--3855},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/jin26a/jin26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/jin26a.html},
  abstract = 	 {The Combinatorial Multi-Armed Bandit (CMAB) framework extends classical multi-armed bandit theory to complex decision-making settings where agents select super arms to maximize a collective reward. While Thompson Sampling (TS) is widely favored for its robust empirical performance in these settings, its theoretical guarantees have historically suffered from a significant bottleneck: standard Combinatorial Thompson Sampling (\texttt{CTS}) incurs a regret bound with an exponential dependence on the size $k^*$ of the optimal super-arm. This exponential term arises because standard independent posterior sampling fails to coordinate optimism across the base arms of the optimal super arm, causing the probability of exploration to vanish as $k^*$ increases. Although recent advances have achieved polynomial regret for \emph{linear} rewards, designing an efficient TS algorithm for general, non-linear CMABs remains an open challenge. In this paper, we resolve this open question by proposing \emph{Combinatorial Thompson Sampling with a Single Seed} (\texttt{CTS$^3$}). Unlike standard approaches that sample base arms independently, \texttt{CTS$^3$} employs a comonotonic coupling strategy: it generates parameters for all base arms using a single shared random seed via the inverse CDF transform. This mechanism synchronizes sampling fluctuations across arms, ensuring concerted optimism and preventing the exploration probability from decaying exponentially. We prove that \texttt{CTS$^3$} achieves a regret bound of ${O}\left( \frac{m kk^*B^2}{\Delta_{\min}}\operatorname{poly}(\log(T,m,\Delta_{\max}/\Delta_{\min}))\right)$ for general reward functions satisfying monotonicity and bounded smoothness, where $m$ is the number of total base arms, $k$ is the largest super arm size, and $k^*$ is the size of the optimal arm. To the best of our knowledge, this is the first polynomial regret bound for Thompson Sampling in general CMAB settings. 
Empirical evaluations confirm that \texttt{CTS$^3$} significantly outperforms standard independent TS, particularly in regimes with large super arms.}
}


@InProceedings{pmlr-v336-joseph26a,
  title = 	 {Ripple Mechanisms for Discrete and Private Statistics},
  author =       {Joseph, Matthew and Kulesza, Alex and Wang, Yuyan and Yu, Alexander},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {3856--3903},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/joseph26a/joseph26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/joseph26a.html},
  abstract = 	 {We study \emph{ripple mechanisms} for pure differentially private computation of discrete statistics. For each of three natural statistics – sum, count, and vote – we construct an efficient instance of the ripple mechanism and show that it is often more accurate than the previous state of the art. We also prove that ripple mechanisms are, in some settings, optimal among all discrete pure differentially private additive noise mechanisms.}
}


@InProceedings{pmlr-v336-kalavasis26a,
  title = 	 {Can {SGD} Select Good Fishermen? Local Convergence under Self-Selection Biases (Extended Abstract)},
  author =       {Kalavasis, Alkis and Mehrotra, Anay and Zhou, Felix},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {3904--3905},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/kalavasis26a/kalavasis26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/kalavasis26a.html},
  abstract = 	 {We revisit the problem of estimating $k$ linear regressors in $d$ dimensions from samples affected by self-selection bias under the maximum selection rule. Our main result is an algorithm with sample complexity $O(d)\cdot \operatorname{poly}(k,1/\varepsilon)$ and running time $\operatorname{poly}(d,k,1/\varepsilon)+(k\log k)^{O(k)}$ for recovering the regressors up to joint squared error $\varepsilon^2$. The key ingredient is the first local-convergence algorithm for the maximum self-selection model. Our approach reduces self-selection to estimation from coarse observations, where the learner observes only the cell of a partition containing the latent sample. The self-selection reduction induces a structured non-convex partition. We prove that this partition preserves enough information locally and that the resulting negative log-likelihood is locally convex around the true parameters. These two geometric properties allow projected stochastic gradient descent, initialized from an existing warm start, to obtain the stated end-to-end guarantee.}
}


@InProceedings{pmlr-v336-karmarkar26a,
  title = 	 {Fast, Parallel, Query-Efficient Binary Classification},
  author =       {Karmarkar, Ishani and O'Carroll, Liam and Sidford, Aaron},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {3906--3949},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/karmarkar26a/karmarkar26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/karmarkar26a.html},
  abstract = 	 {We study the fundamental classification problem of computing a separating hyperplane for a binary-labeled dataset of size $n$ with normalized $d$-dimensional features. Letting $\Phi \in \mathbb{R}^{n \times d}$ denote the feature matrix and $\gamma$ the margin of the maximum-margin separating hyperplane, we present a randomized algorithm that solves this problem in $\tilde{O}(\gamma^{-2/3}\, \operatorname{nnz}(\Phi) + \gamma^{-2(\omega+1)/3})$-sequential running time (work), $\tilde{O}(\gamma^{-2/3})$-parallel (computational) depth, and accesses $\Phi$ only through $\tilde{O}(\gamma^{-2/3})$-matrix-vector queries (matvecs). We also present a second, faster randomized algorithm with a $\tilde{O}(\gamma^{-2/3}\, \operatorname{nnz}(\Phi) + \gamma^{-2})$-sequential running time that uses $\tilde{O}(\gamma^{-2/3})$-matvecs to $\Phi$, but achieves only $\tilde{O}(\gamma^{-4/3})$-parallel depth. Both algorithms match the near-optimal deterministic matvec complexity recently established by Kornowski and Shamir [2025], Karmarkar et al. [2026] and achieve improved sequential runtime and parallel depth, albeit at the expense of using randomness.}
}


@InProceedings{pmlr-v336-kattermann26a,
  title = 	 {Recursively {Enumerably} {Representable} {Classes} and {Computable} {Versions} of the {Fundamental Theorem} of {Statistical Learning}},
  author =       {Kattermann, David and Krapp, Lothar Sebastian},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {3950--3969},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/kattermann26a/kattermann26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/kattermann26a.html},
  abstract = 	 {We study computable probably approximately correct (CPAC) learning, where learners are required to be computable functions. It had been previously observed that the Fundamental Theorem of Statistical Learning, which characterizes PAC learnability by finiteness of the Vapnik-Chervonenkis (VC-)dimension, no longer holds in this framework. Recent works recovered analogs of the Fundamental Theorem in the computable setting, for instance by introducing an effective VC-dimension. In this work, we investigate the relationship between CPAC learning and recursively enumerable representable (RER) classes, hypothesis classes whose members can be algorithmically listed, in the context of the Fundamental Theorem. We demonstrate that the RER property is deeply connected to CPAC learning by characterizing several notions of CPAC learnability via the existence of certain RER classes realizing the same samples. We further establish that the RER property alone is sufficient to guarantee nonuniform CPAC learnability and give a sufficient condition for CPAC learnable classes to be RER. Other results show that the effective VC-dimension can take arbitrary values above the traditional one and we note that the two dimensions coincide given the existence of a computable empirical risk minimizer. This recovers classical PAC bounds for most practically relevant classes and establishes a family of examples separating several notions of learnability.}
}


@InProceedings{pmlr-v336-khanna26a,
  author    = {Khanna, Rajiv},
  title     = {Spectral Valleys and Sharp Failures in Greedy Determinant Maximization},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  pages     = {3970--3992},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/khanna26a/khanna26a.pdf},
  url       = {https://proceedings.mlr.press/v336/khanna26a.html},
  abstract  = {The classic greedy algorithm is a widely used method for determinant maximization. Although worst-case theory predicts exponentially poor performance, greedy methods are often observed to perform substantially better in practice. This work explains this discrepancy through a finer spectrum-dependent analysis of the greedy algorithm. Specifically, we develop a sharp spectrum-dependent characterization of the greedy vs optimal determinant gap by analyzing greedy selection over structured spectral landscapes. Our main result is an upper bound that decomposes this gap in terms of stable-rank windows. When the target cardinality lies within a wide spectral valley, greedy admits guarantees exponentially stronger than the classical bound; when such valleys vanish due to sharp spectral drops, greedy necessarily encounters failure cliffs, matching known worst-case constructions. This yields a spectral-landscape-dependent characterization that explains sharp regime changes in greedy performance as the target cardinality varies for the input matrix. Finally, we show that several practical statistical models—including isotropic random features, near-identity kernels, and spiked-plus-noise spectra—provably induce these spectral success valleys, yielding strictly stronger guarantees than the worst-case theory. Together, our results provide a tight, beyond-worst-case understanding of greedy determinant maximization.}
}


@InProceedings{pmlr-v336-klivans26a,
  author    = {Klivans, Adam and Stavropoulos, Konstantinos and Vasilyan, Arsen},
  title     = {Sandwiching Polynomials for Geometric Concepts with Low Intrinsic Dimension},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  pages     = {3993--4021},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/klivans26a/klivans26a.pdf},
  url       = {https://proceedings.mlr.press/v336/klivans26a.html},
  abstract  = {Recent work has shown the surprising power of low-degree \emph{sandwiching} polynomial approximators in the context of challenging learning settings such as learning with distribution shift, testable learning, and learning with contamination. A pair of sandwiching polynomials approximate a target function in expectation while also providing \emph{pointwise} upper and lower bounds on the function’s values. In this paper, we give a new method for constructing low-degree sandwiching polynomials that yield greatly improved degree bounds for several fundamental function classes and marginal distributions. In particular, we obtain degree $\mathrm{poly}(k)$ sandwiching polynomials for functions of $k$ halfspaces under the Gaussian distribution, improving exponentially over the prior $2^{O(k)}$ bound. More broadly, our approach applies to function classes that are low-dimensional and have smooth boundary. In contrast to prior work, our proof is relatively simple and directly uses the smoothness of the target function’s boundary to construct sandwiching Lipschitz functions, which are amenable to results from high-dimensional approximation theory. For low-dimensional polynomial threshold functions (PTFs) with respect to Gaussians, we obtain doubly exponential improvements without applying the FT-mollification method of Kane used in the best previous result.}
}


@InProceedings{pmlr-v336-patel26a,
  author    = {Patel, Shyamal and Klivans, Adam and Stavropoulos, Konstantinos and Vasilyan, Arsen},
  title     = {Equivalence of Coarse and Fine-Grained Models for Learning with Distribution Shift},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  pages     = {4022--4049},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/patel26a/patel26a.pdf},
  url       = {https://proceedings.mlr.press/v336/patel26a.html},
  abstract  = {Recent work on provably efficient algorithms for learning with distribution shift has focused on two models: PQ learning (Goldwasser et al., 2020) and TDS learning (Klivans et al., 2024). Algorithms for TDS learning are allowed to reject a test set entirely if distribution shift is detected. In contrast, PQ learners may only reject points that are deemed out-of-distribution on an individual basis. Our main result is a surprising equivalence between these two models in the distribution-free setting. In particular, we give an efficient black-box reduction from PQ learning to TDS learning for any Boolean concept class. This equivalence implies the first hardness results for distribution-free TDS learning of basic concept classes such as halfspaces. The main technical contribution underlying our equivalence is a method for boosting, via branching programs, the weak distinguishing power of TDS learners that have rejected the target domain. We also show that giving a learner access to \emph{membership queries} sidesteps these hardness results and allows for efficient, distribution-free PQ learnability of halfspaces. Our algorithm iteratively recovers large-margin separators obtained by applying successive Forster transforms on the training data.}
}


@InProceedings{pmlr-v336-koehler26a,
  author    = {Koehler, Frederic and Shin, Joonhyung},
  title     = {Overlap Analysis of the Shortest Path Problem: Local Search, Landscapes, and {Franz-Parisi} Potential},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  pages     = {4050--4228},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/koehler26a/koehler26a.pdf},
  url       = {https://proceedings.mlr.press/v336/koehler26a.html},
  abstract  = {Two directions in algorithms and complexity involve: (1) classifying which optimization problems can be solved in polynomial time, and (2) understanding which computational problems are hard to solve \emph{on average} in addition to the worst case. For many average-case problems, there does not currently exist strong evidence via reductions that they are hard. However, we can still attempt to predict their polynomial time tractability by proving lower bounds against restricted classes of algorithms. Geometric approaches to predicting tractability typically study the \emph{optimization landscape}. For optimization problems with random objectives or constraints, ideas originating in statistical physics suggest we should study the \emph{overlap} between approximately-optimal solutions. Formally, properties of \emph{Gibbs measures} and the \emph{Franz–Parisi potential} imply lower bounds against natural local search algorithms, such as Langevin dynamics. A related theory, the \emph{Overlap Gap Property (OGP)}, proves rigorous lower bounds against classes of algorithms which are stable functions of their input. A remarkable recent work of Li and Schramm [COLT 2025] showed that the shortest path problem in random graphs, which is polynomial-time tractable, admits lower bounds against a class of stable algorithms via the OGP. We further investigate and find that: (1) via the OGP and FPP, we can show stable algorithms and natural MCMC chains fail in the optimization landscape of shortest paths, but (2) stable algorithms and local search succeed in the optimization landscape for shortest path \emph{trees}, which agrees with OGP and FPP predictions.}
}


@InProceedings{pmlr-v336-kosoy26a,
  author    = {Kosoy, Vanessa},
  title     = {Ambiguous Online Learning},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  pages     = {4229--4266},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/kosoy26a/kosoy26a.pdf},
  url       = {https://proceedings.mlr.press/v336/kosoy26a.html},
  abstract  = {We propose a new variant of online learning that we call “ambiguous online learning”. In this setting, the learner is allowed to produce multiple predicted labels. Such an “ambiguous prediction” is considered correct when at least one of the labels is correct, and none of the labels are “predictably wrong”. The definition of “predictably wrong” comes from a hypothesis class in which hypotheses are also multi-valued. Thus, a prediction is “predictably wrong” if it’s not allowed by the (unknown) true hypothesis. In particular, this setting is natural in the context of multivalued dynamical systems, recommendation algorithms and lossless compression. It is also strongly related to so-called “apple tasting”. We show that in this setting, the asymptotic minimax mistake bound is controlled by a combination of the classical Littlestone dimension $\mathrm{L}$ and a new parameter that we call “ambiguous Littlestone dimension” (denoted $\mathrm{AL}$). There is a trichotomy of behaviors: up to logarithmic factors, any hypothesis class has a mistake bound of either $O(1)$ (when both $\mathrm{AL}$ and $\mathrm{L}$ are finite), $\tilde{\Theta}(\sqrt{N})$ (when $\mathrm{AL}$ is infinite but $\mathrm{L}$ is finite) or $\Theta(N)$ (when both are infinite).}
}


@InProceedings{pmlr-v336-kreisler26a,
  author    = {Kreisler, Itai and Carmon, Yair and Hinder, Oliver},
  title     = {Clipping the Price of Adaptivity at the Tail},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  pages     = {4267--4307},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/kreisler26a/kreisler26a.pdf},
  url       = {https://proceedings.mlr.press/v336/kreisler26a.html},
  abstract  = {Adaptive stochastic convex optimization (SCO) methods face a fundamental “price of adaptivity” barrier: under the standard set of assumptions, they cannot efficiently adapt to large uncertainty in both the initial distance to optimality and the Lipschitz constant. We circumvent this barrier by requiring a small amount of additional structure common to many learning problems. Specifically, we assume that the objective decomposes into a model and a loss function, enabling us to intervene by modifying the model’s output before it passes to the loss function. Under this assumption, we design a method that clips the learned model output in tail events where it deviates too much from the output of a fixed reference model. Our method matches the optimal bounds for known-parameter SCO up to logarithmic factors in the uncertainty in the distance and Lipschitz parameters, thus efficiently adapting to large uncertainty in both.}
}


@InProceedings{pmlr-v336-kumar26a,
  author    = {Kumar, Gunjan and Pote, Yash and Scarlett, Jonathan},
  title     = {A Distribution Testing Approach to Clustering Distributions},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  pages     = {4308--4348},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/kumar26a/kumar26a.pdf},
  url       = {https://proceedings.mlr.press/v336/kumar26a.html},
  abstract  = {We study the following distribution clustering problem: Given a hidden partition of $k$ distributions into $2$ groups, such that the distributions within each group are the same, and the distributions associated with the clusters are pairwise $\varepsilon$-far in total variation, the goal is to recover the partition. We establish upper and lower bounds on the sample complexity for two fundamental cases: (1) when one of the cluster’s distributions is known, and (2) when both are unknown. Our upper and lower bounds characterize the sample complexity’s dependence on the domain size $n$, number of distributions $k$, size $r$ of one of the clusters, and distance $\varepsilon$. In particular, we achieve tightness with respect to $(n,k,r,\varepsilon)$ (up to an $O(\log k)$ factor) for all regimes.  In addition, we show that this result extends to the case of $d$-clustering for any constant number of clusters $d$.}
}


@InProceedings{pmlr-v336-kumar26b,
  author    = {Kumar, Syamantak and Pandey, Shourya and Sarkar, Purnamrita and Tian, Kevin},
  title     = {On the Curse of Dimensionality in Private Sparse Covariance Estimation and {PCA}},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  pages     = {4349--4400},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/kumar26b/kumar26b.pdf},
  url       = {https://proceedings.mlr.press/v336/kumar26b.html},
  abstract  = {We study high-dimensional differentially private (DP) covariance estimation in the operator norm, and principal component analysis (PCA), under $k$-row-column sparsity ($k$-RCS) of the covariance matrix. In the non-private setting, it is known that $\operatorname{poly}(k,\log d)$ samples suffice to solve both of these problems. However, the only comparable result known under DP (Wang and Xu, 2021) requires $\Omega(d)$ samples under standard parameterizations of the problem. We investigate when this curse of dimensionality is inherent for sparse covariance estimation tasks under DP. On the upper bound front, we show that a $\operatorname{poly}(k,\log d)$ sample complexity for PCA is possible under DP, if we also posit sparsity of the leading eigenvector. We complement this result with $\operatorname{poly}(d)$ lower bounds under DP for both sparse covariance estimation and PCA, establishing an exponential gap between the private and non-private variants of these problems when $k=\operatorname{polylog}(d)$. To our knowledge, no such separation has previously been demonstrated for any sparse estimation problem in private high-dimensional statistics. Our techniques are flexible enough that they imply stronger lower bounds even for the well-studied problem of standard DP PCA, without sparsity assumptions.}
}


@InProceedings{pmlr-v336-lai26a,
  author    = {Lai, Kuo-Wei and Wang, Guanghui and Tao, Molei and Muthukumar, Vidya},
  title     = {How Does the {ReLU} Activation Affect the Implicit Bias of Gradient Descent on High-dimensional Neural Network Regression?},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  pages     = {4401--4477},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/lai26a/lai26a.pdf},
  url       = {https://proceedings.mlr.press/v336/lai26a.html},
  abstract  = {Overparameterized ML models, including neural networks, typically induce underdetermined training objectives with multiple global minima. The implicit bias refers to the limiting global minimum that is attained by a common optimization algorithm, such as gradient descent (GD). In this paper, we characterize the implicit bias of GD for training a shallow ReLU model with the squared loss on high-dimensional random features. Prior work (Vardi and Shamir, 2021) showed that the implicit bias does not exist in the worst-case, or corresponds exactly to the minimum-$\ell_2$-norm interpolating solution under exactly orthogonal data (Boursier et al., 2022). Our work interpolates between these two extremes and shows that, for sufficiently high-dimensional random data, the implicit bias approximates the minimum-$\ell_2$-norm solution with high probability with a gap on the order $\Theta(\sqrt{n/||\lambda||_1})$, where $n$ is the number of training examples and $\lambda$ denotes the spectrum of the data covariance matrix. Our results are obtained through a novel primal-dual analysis that carefully tracks the evolution of predictions, data-span coefficients, as well as their interactions, and show that the ReLU activation pattern quickly stabilizes with high probability over random data.}
}


@InProceedings{pmlr-v336-lee26a,
  author    = {Lee, Jongyeong and Honda, Junya and Ito, Shinji and Kim, Chansoo},
  title     = {Adaptive Learning Rates with Surrogate Probability for Follow-the-Perturbed-Leader},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  pages     = {4478--4519},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/lee26a/lee26a.pdf},
  url       = {https://proceedings.mlr.press/v336/lee26a.html},
  abstract  = {Follow-the-regularized-leader framework has shown effectiveness and flexibility in online learning problems, where the choice of learning rates are known to be crucial. Recently, adaptive learning rates defined in terms of the arm-selection probabilities, obtained by solving convex optimization, have achieved improved best-of-both-worlds (BOBW) guarantees in various bandit problems. In contrast, BOBW guarantees for its computationally efficient alternative, follow-the-perturbed-leader (FTPL), remain relatively limited since its optimization-free nature ironically makes the design of adaptive, probability-dependent learning rates non-trivial. To address this challenge, we propose an adaptive learning rate for FTPL by introducing surrogate probability functions that can be computed only from the available quantities, without requiring the exact probabilities. Based on these learning rates with surrogate functions, we provide the BOBW guarantee for FTPL with Pareto perturbations for any shape parameter $\alpha >1$, generalizing prior results restricted to specific choices of $\alpha=2$. We further show the BOBW guarantees for FTPL with adaptive learning rates in the bandit problem with expert advices. Our approach preserves the computational simplicity of FTPL while enabling probability-dependent adaptivity, and the surrogate-based methodology may be of independent interest in other algorithmic frameworks beyond FTPL and learning rate designs.}
}


@InProceedings{pmlr-v336-lee26b,
  author    = {Lee, Harin and Oh, Min{-}hwan},
  title     = {Unified Framework of Distributional Regret in Multi-Armed Bandits and Reinforcement Learning},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  pages     = {4520--4584},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/lee26b/lee26b.pdf},
  url       = {https://proceedings.mlr.press/v336/lee26b.html},
  abstract  = {We study the distribution of regret in stochastic multi-armed bandits and episodic reinforcement learning through a unified framework. We formalize a \emph{distributional regret bound} as a probabilistic guarantee that holds \emph{uniformly} over all confidence levels $\delta \in (0,1]$, thereby characterizing the regret distribution across the full range of $\delta$. We present a simple UCBVI-style algorithm with exploration bonus $\min\{c_{1,k}/N, c_{2,k}/\sqrt{N}\}$, where $N$ denotes the visit count and $(c_{1,k},c_{2,k})$ are user-specified parameters. For arbitrary parameter sequences, we derive general gap-independent and gap-dependent distributional regret bounds, yielding a principled characterization of how the parameters control the trade-off between expected performance, tail risk, and instance-dependent behavior. In particular, our bounds achieve optimal trade-offs between expected and distributional regret in both minimax and instance-dependent regimes. As a special case, for multi-armed bandits with $A$ arms and horizon $T$, we obtain a distributional regret bound of order $\mathcal{O}\big(\sqrt{AT}\log(1/\delta)\big)$, confirming the conjecture of Lattimore and Szepesvári (2020, Section 17.1) for the first time.}
}


@InProceedings{pmlr-v336-lee26c,
  author    = {Lee, Brian W. and Haghtalab, Nika and Jordan, Michael I. and Tibshirani, Ryan J.},
  title     = {Blackwell Approachability and Gradient Equilibrium are Equivalent},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  pages     = {4585--4587},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/lee26c/lee26c.pdf},
  url       = {https://proceedings.mlr.press/v336/lee26c.html},
  abstract  = {Gradient equilibrium (GEQ) is a recently introduced online optimization framework that generalizes first-order stationarity from offline optimization, and abstracts problems like online conformal prediction. While GEQ has curious similarities with known online learning frameworks, such as regret minimization, prior work has shown that GEQ error and regret are incomparable as objectives, leaving open a precise understanding of how GEQ fits into the broader online learning landscape. In this work, we show that GEQ is equivalent to Blackwell approachability in the algorithmic sense. That is, a Blackwell approachability problem can always be solved using queries to a black-box GEQ oracle, with no asymptotic loss in the oracle’s error rate, and vice versa. Taken together with known equivalences between approachability, regret minimization, and calibration, these results imply an equivalence between GEQ and these frameworks, as well. Hence, while GEQ guarantees are semantically different from known online learning guarantees, GEQ algorithms are equally powerful primitives as classical regret minimization and calibration algorithms. Our reductions are efficient and can be used to transfer refined guarantees, such as optimism and strong adaptivity, from regret minimization to GEQ. Our techniques can also be used to identify necessary and sufficient conditions for GEQ, and to establish reductions between different notions of GEQ with unconstrained and constrained decision sets.}
}


@InProceedings{pmlr-v336-lee26d,
  author    = {Lee, Wei-Cheng and Orabona, Francesco},
  title     = {A Single Stepsize Suffices for Unprojected Linear {TD(0)}: Simultaneous Robust and Fast Rates via {Polyak--Ruppert} Averaging},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  pages     = {4588--4634},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/lee26d/lee26d.pdf},
  url       = {https://proceedings.mlr.press/v336/lee26d.html},
  abstract  = {We study linear TD(0) under Markovian sampling, where data are generated along a single trajectory. We provide high-probability guarantees for a plain \emph{unprojected} TD(0) algorithm with Polyak–Ruppert (PR) averaging, using a \emph{single} stepsize schedule $\eta_t \propto 1/(\tau_{\mathrm{mix}}\log(t)\,\sqrt{t})$ that depends on mixing time but requires \emph{no prior knowledge of the curvature parameter $\omega$}. Our first result shows that such a choice of the stepsize guarantees that the TD(0) iterates are automatically and uniformly bounded \emph{with high probability}, without projections and without any stability argument based on $\omega$. Building on this result, we establish a simultaneous high-probability convergence guarantee for the PR average: the same stepsize yields both a robust curvature-free $\widetilde{\mathcal O}(\tau_{\mathrm{mix}}/\sqrt{T})$ rate and a fast curvature-dependent $\widetilde{\mathcal O}(\tau_{\mathrm{mix}}^2/(\omega T))$ rate, with the bound taking the minimum of the two. The core technical ingredient is a Poisson-equation toolkit for geometrically mixing Markov chains, which decomposes Markov noise into a martingale term plus a controlled remainder and enables a new self-bounding inductive argument for pathwise stability.}
}


@InProceedings{pmlr-v336-levy26a,
  author    = {L{\'e}vy, Lucas and Valeau, Jean{-}Lou and Akhavan, Arya and Rebeschini, Patrick},
  title     = {Self-Concordant Perturbations for Linear Bandits},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  pages     = {4635--4673},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/levy26a/levy26a.pdf},
  url       = {https://proceedings.mlr.press/v336/levy26a.html},
  abstract  = {We consider the adversarial linear bandits setting and present a unified algorithmic framework that bridges Follow-the-Regularized-Leader (FTRL) and Follow-the-Perturbed-Leader (FTPL) methods, extending the known connection between them from the full-information setting. Within this framework, we introduce self-concordant perturbations, a family of probability distributions that mirror the role of self-concordant barriers previously employed in the FTRL-based SCRiBLe algorithm. Using this idea, we design a novel FTPL-based algorithm that combines self-concordant regularization with efficient stochastic exploration. Our approach achieves a regret of $\mathcal{O}(d\sqrt{n \ln n})$ on both the $d$-dimensional hypercube and the $\ell_2$ ball. On the $\ell_2$ ball, this matches the rate attained by SCRiBLe. For the hypercube, this represents a $\sqrt{d}$ improvement over these methods and matches the optimal bound up to logarithmic factors.}
}


@InProceedings{pmlr-v336-li26a,
  author    = {Li, Yinan and Yoon, Sungjoon and Huang, Ethan and Jun, Kwang-Sung},
  title     = {Second-Order Bounds for $[0,1]$-Valued Regression via Betting Loss},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  pages     = {4674--4721},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/li26a/li26a.pdf},
  url       = {https://proceedings.mlr.press/v336/li26a.html},
  abstract  = {We consider the $[0,1]$-valued regression problem in the stochastic setting. In a related problem called cost-sensitive classification, Foster and Krishnamurthy (2021) have shown that the log loss minimizer achieves an improved generalization bound compared to that of the squared loss minimizer in the sense that the bound scales with the cost of the best classifier, which can be arbitrarily small depending on the problem instance. Such a result is often called a first-order bound. For $[0,1]$-valued regression, we first show that the log loss minimizer leads to a similar first-order bound. We then ask if there exists a loss function that achieves a variance-dependent bound, also known as a second-order bound, which is a strict improvement upon first-order bounds. We answer this question in the affirmative by proposing a novel loss function called betting loss. Our result is variance-adaptive in the sense that the bound is attained by an algorithm without any knowledge about the variance, which is in contrast to the existing works such as weighted least squares with known variances or those that model label variance or its distribution such as distributional reinforcement learning.}
}


@InProceedings{pmlr-v336-li26b,
  author    = {Li, Binghui and Wang, Zilin and Chen, Fengling and Zhao, Shiyang and Zheng, Ruiheng and Wu, Lei},
  title     = {Optimal Learning Rate Schedules under Functional Scaling Laws: Power Decay and Warmup--Stable--Decay (Extended Abstract)},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  pages     = {4722--4723},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/li26b/li26b.pdf},
  url       = {https://proceedings.mlr.press/v336/li26b.html},
  abstract  = {We study optimal learning rate (LR) schedules under the functional scaling law (FSL) framework (Li et al., 2025), where loss dynamics are controlled by a source exponent $s>0$ for signal learning and a capacity exponent $\beta>1$ for noise forgetting. For a fixed training horizon $N$, we characterize the schedules that minimize the final-step loss under natural stability constraints and reveal a sharp phase transition. In the easy-task regime $s \ge 1 - 1/\beta$, the optimal schedule takes the power-decay form $\eta^*(z) = \eta_{\mathrm{peak}}(1 - z/N)^{2\beta - 1}$ with $\eta_{\mathrm{peak}}\asymp N^{-(s-1+1/\beta)/(s+1/\beta)}$. In contrast, in the hard-task regime $s < 1 - 1/\beta$, the optimal schedule exhibits a warmup–stable–decay (WSD)-like (Hu et al., 2024) structure: it maintains the largest admissible LR for most of training and decays only near the end, with the decay phase occupying a vanishing fraction of the horizon. We next study the practical setting where the decay shape is fixed and only the peak LR is tuned. To separate these two design choices, we introduce a family of fractional LR schedules that decouple peak-LR tuning from decay-shape design. We prove that fixed-shape schedules suffer from capacity saturation: each shape can adapt to the capacity exponent only up to a shape-dependent threshold, beyond which the achievable convergence rate no longer improves. This yields a principled criterion for evaluating commonly used schedules such as cosine and linear decay, revealing both their strengths and limitations. We then apply the FSL-optimal power-decay schedule to one-pass stochastic gradient descent (SGD) for kernel regression and show that the last iterate attains the exact minimax-optimal convergence rate, eliminating the logarithmic gap in prior analyses. Finally, experiments validate our theoretical predictions in controlled settings and illustrate their usefulness for practical LR-schedule design in neural network training.}
}


@InProceedings{pmlr-v336-liu26a,
  title     = {Fast algorithms for learning a {Gaussian} under halfspace truncation with optimal sample complexity},
  author    = {Liu, Haitong and Sridharan, Deepak Narayanan and Steurer, David and Wiedmer, Manuel},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {4724--4818},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/liu26a/liu26a.pdf},
  url       = {https://proceedings.mlr.press/v336/liu26a.html},
  abstract  = {We study the fundamental problem of learning a high-dimensional Gaussian truncated to an unknown halfspace. Lee, Mehrotra and Zampetakis (FOCS'24) recently obtained the first polynomial time algorithm for this problem, but their resulting sample and time complexity bounds are not optimal. Under non-trivial truncation, for any target accuracy $\varepsilon > 0$ and dimension $d$ we give an efficient algorithm that uses $n = \tilde{O}(d^2/\varepsilon^2)$ samples and learns the underlying Gaussian to error $\varepsilon$ in total variation distance. Our algorithm is also fast: its runtime is dominated by the cost of computing the empirical covariance matrix. Both our sample and time complexity are optimal in terms of $d$ and $\varepsilon$ even \emph{without} truncation: in this regard, we can learn a Gaussian under halfspace truncation for free. The key ingredient behind our result is a novel reinterpretation of the low-degree moments of the truncated Gaussian in terms of a relative truncation parameter. This relative truncation parameter uniquely determines the parameters of the untruncated Gaussian and enables direct parameter recovery. This reinterpretation allows us to circumvent the time intensive projected stochastic gradient descent procedure that is widely used in learning under truncation.}
}


@InProceedings{pmlr-v336-liu26b,
  title     = {Online Learning for Uninformed {Markov} Games: Empirical {Nash}-Value Regret and Non-Stationarity Adaptation},
  author    = {Liu, Junyan and Luo, Haipeng and Zhang, Zihan and Ratliff, Lillian J.},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {4819--4856},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/liu26b/liu26b.pdf},
  url       = {https://proceedings.mlr.press/v336/liu26b.html},
  abstract  = {We study online learning in two-player uninformed Markov games, where the opponent's actions and policies are unobserved. In this setting, Tian et al. (2021) show that achieving no-external-regret is impossible without incurring an exponential dependence on the episode length $H$. They then turn to the weaker notion of Nash-value regret and propose a V-learning algorithm with regret $\widetilde{O}(K^{2/3})$ after $K$ episodes. However, their algorithm and guarantee do not adapt to the difficulty of the problem: even in the case where the opponent follows a fixed policy and thus $\widetilde{O}(\sqrt{K})$ external regret is well-known to be achievable, their result is still the \textit{worse} rate $\widetilde{O}(K^{2/3})$ on a \textit{weaker} metric. In this work, we fully address both limitations. First, we introduce \textit{empirical Nash-value regret}, a new regret notion that is strictly stronger than Nash-value regret and naturally reduces to external regret when the opponent follows a fixed policy. Moreover, under this new metric, we propose a parameter-free algorithm that achieves an $\widetilde{O} \big(\min\lbrace\sqrt{K} + (CK)^{1/3}, \sqrt{LK}\rbrace\big)$ regret bound, where $C$ quantifies the ``variance'' of the opponent's policies and $L$ denotes the number of policy switches (both at most $O(K)$). Therefore, our results not only recover the two extremes---$\widetilde{O}(\sqrt{K})$ external regret when the opponent is fixed and $\widetilde{O}(K^{2/3})$ Nash-value regret in the worst case---but also smoothly interpolate between these extremes by automatically adapting to the opponent's non-stationarity. We achieve so by first providing a new analysis of the epoch-based V-learning algorithm by Mao et al. (2022), establishing an $\widetilde{O}(\eta C + \sqrt{K/\eta})$ regret bound, where $\eta$ is the epoch incremental factor. Next, we show how to adaptively restart this algorithm with an appropriate $\eta$ in response to the potential non-stationarity of the opponent, eventually achieving our final results.}
}


@InProceedings{pmlr-v336-liu26c,
  title     = {Regret Minimization with Adaptive Opponents in Repeated Games},
  author    = {Liu, Mingyang and Ozdaglar, Asuman and Yu, Tiancheng and Zhang, Kaiqing},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {4857--4858},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/liu26c/liu26c.pdf},
  url       = {https://proceedings.mlr.press/v336/liu26c.html},
  abstract  = {In this paper, we study regret minimization in repeated games with \emph{adaptive} opponents whose strategies may depend on the histories of play. The classical online learning metric of \emph{external regret} does not fully capture such adaptivity, since it compares against decisions while treating the loss sequence as fixed. To account for the counterfactual reasoning of the players, we introduce a new metric, \texttt{Repeated Policy Regret (RP-Regret)}, specific to this game-theoretic setting, which measures the difference between the \emph{realized} and the \emph{best-in-hindsight} accumulated utility when all players can \emph{respond} to the history of play. Compared with existing regret notions in adaptive environments, \texttt{RP-Regret} allows stronger dynamic comparators and less restricted opponents, while still enabling the learning of better equilibria when all players minimize it. We first identify necessary conditions for achieving sublinear \texttt{RP-Regret}. The comparator strategies must have sublinear accumulated variation, and both the comparator and the opponents must have imperfect recall. Without these conditions, sublinear \texttt{RP-Regret} is impossible to achieve in general. We then provide additional sufficient conditions and algorithms for minimizing \texttt{RP-Regret}. A key challenge is that \texttt{RP-Regret} is \emph{nonconvex} in the strategy space by definition. We address this challenge through three approaches. The first approach uses a nonconvex optimization oracle, as in prior work on online nonconvex learning. The second approach minimizes a convex \emph{linearized} surrogate at each iteration, which yields the minimization of a local variant of \texttt{RP-Regret}. The third approach directly minimizes \texttt{RP-Regret} when the opponents change their strategies slowly, by reformulating the repeated game as a Markov game and optimizing over occupancy measures. Finally, we show that when all players can run algorithms to minimize the \texttt{RP-Regret} or its linearized variant, certain subgame-perfect equilibria of the repeated game can be learned. We also provide experiments to show that these regret notions can lead to more cooperative outcomes with higher utility in games such as the Stag Hunt.}
}


@InProceedings{pmlr-v336-liu26d,
  author    = {Liu, Zijian},
  title     = {Random Reshuffling Dominates Stochastic Gradient Descent},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  publisher = {PMLR},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  pages     = {4859--4882},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/liu26d/liu26d.pdf},
  url       = {https://proceedings.mlr.press/v336/liu26d.html},
  abstract  = {Stochastic Gradient Descent ({\textsf{SGD}}) is one of the most classical optimization algorithms with favorable theoretical guarantees, yet the practical implementation of {\textsf{SGD}} differs subtly from its well-known form and is often referred to as Shuffling Stochastic Gradient Descent ({\textsf{Shuffling SGD}}). A particularly popular strategy in {\textsf{Shuffling SGD}} is Random Reshuffling ({\textsf{RR}}), which has achieved great empirical success across numerous experiments. Despite its strong performance, {\textsf{RR}} has long been considered a heuristic due to a lack of theoretical support. Over the last decade, people have finally established provable convergence rates for {\textsf{RR}}, thus justifying its observed superiority. However, for smooth convex optimization, two clouds over the convergence theory of {\textsf{RR}} remain to this day. More precisely, according to the current theory, {\textsf{Shuffling SGD}} under {\textsf{RR}} converges only when the stepsize is smaller than a threshold proportional to $1/n$, where $n$ is the number of summands in the objective (or the number of data points). Consequently, the optimally tuned theoretical rate of {\textsf{Shuffling SGD}} under {\textsf{RR}} is strictly worse than that of {\textsf{SGD}} when the number of epochs is smaller than another threshold proportional to $n$. These two restrictions heavily limit the applicability of existing theories and leave a critical mismatch with practice. In this work, for the first time, we prove that {\textsf{RR}} dominates {\textsf{SGD}} in smooth convex optimization under any reasonable stepsize after any finite number of epochs, thereby addressing a longstanding open question.}
}


@InProceedings{pmlr-v336-luo26a,
  title     = {Wedge Sampling: Efficient Tensor Completion with Nearly-Linear Sample Complexity},
  author    = {Luo, Hengrui and Ma, Anna and Stephan, Ludovic and Zhu, Yizhe},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {4883--4884},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/luo26a/luo26a.pdf},
  url       = {https://proceedings.mlr.press/v336/luo26a.html},
  abstract  = {We introduce \emph{Wedge Sampling}, a new non-adaptive sampling scheme for low-rank tensor completion. We study recovery of an order-$k$ low-rank tensor of dimension $n\times\cdots\times n$ from a subset of its entries. Unlike the standard uniform entry model (i.e., i.i.d. samples from $[n]^k$), wedge sampling allocates observations to structured length-two patterns (wedges) in an associated bipartite sampling graph. By directly promoting these length-two connections, the sampling design strengthens the spectral signal that underlies efficient initialization, in regimes where uniform sampling is too sparse to generate enough informative correlations. Our main result shows that this change in sampling paradigm enables polynomial-time algorithms to achieve both weak and exact recovery with nearly linear sample complexity in $n$. The approach is also plug-and-play: wedge-sampling--based spectral initialization can be combined with existing refinement procedures (e.g., spectral or gradient-based methods) using only an additional $\tilde O(n)$ uniformly sampled entries, substantially improving over the $\tilde O(n^{k/2})$ sample complexity typically required under uniform entry sampling for efficient methods. Overall, our results suggest that the statistical-to-computational gap highlighted by Barak and Moitra [Mathematical Programming, 193(2):513--548, 2022] is, to a large extent, a consequence of the uniform entry sampling model for tensor completion, and alternative non-adaptive measurement designs that guarantee a strong initialization can overcome this barrier.}
}


@InProceedings{pmlr-v336-ma26a,
  title     = {Polynomial-time sampling despite disorder chaos},
  author    = {Ma, Eric and Schramm, Tselil},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {4885--4910},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/ma26a/ma26a.pdf},
  url       = {https://proceedings.mlr.press/v336/ma26a.html},
  abstract  = {A distribution over instances of a sampling problem is said to exhibit transport disorder chaos if perturbing the instance by a small amount of random noise dramatically changes the stationary distribution (in Wasserstein distance). Seeking to provide evidence that some sampling tasks are hard on average, a recent line of work has demonstrated that disorder chaos is sufficient to rule out ``stable'' sampling algorithms, such as gradient methods and some diffusion processes. We demonstrate that disorder chaos does not preclude polynomial-time sampling by canonical algorithms in canonical models. We show that with high probability over a random graph $\mathbf{G} \sim G(n,1/2)$: (1) the hardcore model (at fugacity $\lambda = 1$) on $\mathbf{G}$ exhibits disorder chaos, and (2) Glauber dynamics run for $O(n)$ time can approximately sample from the hardcore model on $\mathbf{G}$ (in Wasserstein distance).}
}


@InProceedings{pmlr-v336-maiti26a,
  title     = {On the Power of Adaptivity for {$\varepsilon$}-Best Arm Identification in Linear Bandits},
  author    = {Maiti, Arnab and Xu, Yunbei and Jamieson, Kevin},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {4911--4968},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/maiti26a/maiti26a.pdf},
  url       = {https://proceedings.mlr.press/v336/maiti26a.html},
  abstract  = {We study the minimax sample complexity of $\varepsilon$-best arm identification in linear bandits, a classical pure-exploration problem. Given a compact action set $\mathcal{X}$ that spans $\mathbb{R}^d$ and an unknown reward vector $\theta\in\mathbb{R}^d$, the goal is to output an arm $\widehat{x}\in\mathcal{X}$ such that $\langle\widehat{x},\theta\rangle\ge \max_{x\in\mathcal{X}} \langle x,\theta\rangle- \varepsilon$ with probability at least $1-\delta$, using as few samples as possible. Our aim is to better understand the power and limitations of adaptivity in this setting. We begin with non-adaptive algorithms. We present a non-adaptive fixed-design method with sample complexity $\mathcal{O}\!\left(\frac{d\log(1/\delta)}{\varepsilon^2}+\frac{w(\mathcal{X})^2}{\varepsilon^2}\right)$, where $w(\mathcal{X})$ is a Gaussian width term dependent on $\mathcal{X}$, and we prove a matching lower bound $\Omega\!\left(\frac{d\log(1/\delta)}{\varepsilon^2}+\frac{w(\mathcal{X})^2}{\varepsilon^2}\right)$ for all non-adaptive fixed-design methods. Moreover, $w(\mathcal{X})\le \mathcal{O}(d)$ for general $\mathcal{X}$, which is tight for sets such as the unit $\ell_2$ ball, and $w(\mathcal{X})\le \mathcal{O}(\sqrt{d\log|\mathcal{X}|})$ when $\mathcal{X}$ is finite, which is tight for the canonical basis $\lbrace e_1,\ldots,e_d\rbrace$. We then turn to adaptive sampling. For any finite action set $\mathcal{X}$, we prove the existence of an adaptive algorithm with sample complexity $\mathcal{O}\!\left(\frac{d\log(1/\delta)}{\varepsilon^2}+\frac{d\log(|\mathcal{X}|/d)}{\varepsilon^2}\right)$ via a generalization of Median Elimination, which is known to yield a $\log d$ improvement for the canonical basis. This raises a structural question: beyond the canonical basis, are there structured action sets for which adaptivity yields only logarithmic-factor improvements over the optimal non-adaptive rate? We answer in the affirmative for several natural action sets, namely the hypercube, the $\ell_2$ ball, $m$-sets, and multi-task multi-armed bandits. Finally, we show that logarithmic improvements are not the whole story. To our knowledge, we provide the first construction of an action set $\mathcal{X}$ for which adaptivity yields a \emph{polynomial-factor improvement} over every non-adaptive algorithm. A key ingredient behind this separation is an $\ell_2$-norm estimation subroutine: we design an adaptive algorithm that uses $\mathcal{O}\!\left(\frac{d\log(1/\delta)}{\varepsilon^2}\right)$ samples from the unit $\ell_2$ ball in $\mathbb{R}^d$ and outputs an estimate $\widehat r$ satisfying $|\widehat r-\|\theta\|_2|\le \varepsilon$ with probability at least $1-\delta$, where $\theta$ is the unknown reward vector. Taken together, these results illustrate when adaptivity can offer only modest savings and when it can enable genuine polynomial gains, sharpening our understanding of the role of adaptivity and geometry in pure exploration and experimental design.}
}


@InProceedings{pmlr-v336-maran26a,
  title     = {Online Market Making and the Value of Observing the Order Book},
  author    = {Maran, Davide and Restelli, Marcello},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {4969--4998},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/maran26a/maran26a.pdf},
  url       = {https://proceedings.mlr.press/v336/maran26a.html},
  abstract  = {We study an online market-making problem in which a learner sequentially posts bid and ask prices for a single asset while interacting with traders holding private valuations. Unlike existing online learning formulations that assume fully censored feedback, we introduce an action-dependent feedback model inspired by real limit order books: when a trade occurs, the trader's valuation remains hidden, whereas when no trade occurs, informative feedback about supply and demand is revealed. We show that this additional information fundamentally changes the learnability of the problem. In the stochastic setting with i.i.d. market prices, we propose an elimination-based algorithm that achieves $\widetilde O(\sqrt{T})$ regret with high probability, without requiring any smoothness assumptions on the distribution of trader valuations. We then extend this result to a broad class of mean-reverting price processes by considering both local, autoregressive dynamics and a weaker global drift condition based on cumulative deviations from the mean. Under either assumption, we establish high-probability $\widetilde O(\sqrt{T})$ regret bounds, relying on a new concentration inequality of independent interest. Finally, in the adversarial setting with oblivious prices, we design an explore-then-perturb algorithm that guarantees $\tilde O(T^{2/3})$ regret in expectation. Our results quantify the value of observing the order book in online market making and demonstrate that even limited, action-dependent feedback can substantially improve regret guarantees compared to standard bandit feedback models.}
}


@InProceedings{pmlr-v336-massoulie26a,
  title     = {Phase Transition in Convex Relaxations for Graph Alignment},
  author    = {Massouli{\'e}, Laurent and Varma, Sushil Mahavir and Vassaux, Louis and Waldspurger, Ir{\`e}ne},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {4999--5020},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/massoulie26a/massoulie26a.pdf},
  url       = {https://proceedings.mlr.press/v336/massoulie26a.html},
  abstract  = {We study the graph alignment problem for correlated Gaussian Orthogonal Ensemble (GOE) matrices, where the goal is to recover a hidden vertex permutation given two correlated symmetric Gaussian matrices $(A,B)$ with correlation $1/\sqrt{1+\sigma^2}$. While the maximum likelihood estimator is information-theoretically optimal, its computation, which reduces to a quadratic assignment problem, is intractable. Motivated by this, we analyze convex relaxations based on minimizing $\|AX - XB\|_F$ over the set of doubly stochastic matrices and the unit hypercube. We show that when the correlation parameter satisfies $\sigma = o(n^{-1/2}/\log^4 n)$, the solution of either relaxation ($X^\star$) concentrates around the ground-truth permutation matrix ($\Pi^\star$), i.e., $\|X^\star - \Pi^\star\|_F^2 = o(n)$, implying recovery of all but a vanishing fraction of vertices after simple post-processing. Combined with existing lower bounds, our results precisely characterize that $\|X^\star - \Pi^\star\|_F^2$ transitions from $o(n)$ for $\sigma = \tilde{o}(n^{-1/2})$ to $\Omega(n)$ for $\sigma = \tilde{\Omega}(n^{-1/2})$. In doing so, our analysis significantly tightens prior results and extends them beyond doubly stochastic relaxations.}
}


@InProceedings{pmlr-v336-maynard-zhang26a,
  author    = {Maynard-Zhang, Leo and Xiong, Zhihan and Jamieson, Kevin and Fazel, Maryam},
  title     = {On The Complexity of Best-Arm Identification in Non-Stationary Linear Bandits},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  publisher = {PMLR},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  pages     = {5021--5052},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/maynard-zhang26a/maynard-zhang26a.pdf},
  url       = {https://proceedings.mlr.press/v336/maynard-zhang26a.html},
  abstract  = {We study the fixed-budget best-arm identification (BAI) problem in non-stationary linear bandits.  Concretely, given a fixed time budget $T\in \mathbb{N}$, finite arm set $\mathcal{X} \subset \mathbb{R}^d$, and a potentially adversarial sequence of unknown parameters $\lbrace \theta_t\rbrace_{t=1}^{T}$ (hence non-stationary), a learner aims to identify the arm with the largest cumulative reward $x_* = \arg\max_{x \in \mathcal{X}} x^\top\sum_{t=1}^T \theta_t$ with high probability. In this setting, it is well-known that i.i.d. sampling arms from the G-optimal design yields a minimax-optimal error probability of $\exp\left(-\Theta\left(T /  H_{G}\right)\right)$, where $H_{G}$ scales proportionally with the dimension $d$. However, this notion of complexity is overly pessimistic, as it is derived from a lower bound in which the arm set consists only of the standard basis vectors, thus masking any potential advantages arising from arm sets with richer geometric structure. To address this, we establish an \textit{arm-set-dependent} lower bound that, in contrast, holds for any arm set. Motivated by the ideas underlying our lower bound, we propose the \textit{Adjacent-optimal design}, a specialization of the well-known $\mathcal{XY}$-optimal design, and develop the \textsf{Adjacent-BAI} algorithm. We prove that the error probability of \textsf{Adjacent-BAI} matches our lower bound up to constants, verifying the tightness of our lower bound, and establishing the arm-set-dependent complexity of this setting.}
}


@InProceedings{pmlr-v336-mehrotra26a,
  title     = {Language Generation with Infinite Contamination},
  author    = {Mehrotra, Anay and Velegkas, Grigoris and Yu, Xifan and Zhou, Felix},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {5053--5112},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/mehrotra26a/mehrotra26a.pdf},
  url       = {https://proceedings.mlr.press/v336/mehrotra26a.html},
  abstract  = {A recent line of work studies language generation in the limit, a formal model of language learning where an algorithm observes an adversarially generated enumeration of strings from an unknown target language $K$ and must eventually generate new, unseen strings from $K$. In this model, Kleinberg and Mullainathan (2024) proved that generation is achievable in surprisingly general settings; whenever $K$ belongs to a known countable collection of languages. However, their generator, while quite general, suffers from ``mode collapse:'' it generates from an ever-smaller subset of the target. To address this, Kleinberg and Wei (2025a) introduced a stronger notion of dense generation, requiring the output to asymptotically cover a positive fraction of the target, and showed it remains achievable for all countable collections. Both of these works rely on the crucial assumption of \textit{perfect} data: the adversary can neither insert strings from outside the target language (i.e., noise) nor omit strings from it (i.e., omissions). In practice, training data for language models is notoriously noisy, raising the fundamental question: \begin{center} \emph{How much contamination (either omissions or insertions) can language generation tolerate?} \end{center} Recent works have made partial progress on this question by studying (non-dense) generation with either finite amounts of noise (but no omissions) (Raman and Raman, 2025) or omissions (but no noise) (Bai et al., 2026). We characterize the contamination tolerance of both types of generation by proving the following results: \begin{itemize} \item \textbf{Generation under Contamination:} Language generation in the limit is achievable for all countable collections if and only if the fraction of contaminated examples converges to zero. When this condition fails, we characterize the collections which remain generable. \item \textbf{Dense Generation under Contamination:} Dense generation is achievable for all countable collections if and only if the amount of contamination is finite. For an infinite amount of contamination, we provide several characterizations of when dense generation is possible, showing it is strictly less robust than standard generation. \end{itemize} As a byproduct, we also resolve an open question of (Raman and Raman, 2025) on generation with membership oracle access under finite contamination.}
}


@InProceedings{pmlr-v336-mehrotra26b,
  author    = {Mehrotra, Anay and Velegkas, Grigoris and Yu, Xifan and Zhou, Felix},
  title     = {Differentially Private Language Generation and Identification in the Limit (Extended Abstract)},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  publisher = {PMLR},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  pages     = {5113--5114},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/mehrotra26b/mehrotra26b.pdf},
  url       = {https://proceedings.mlr.press/v336/mehrotra26b.html},
  abstract  = {We initiate the study of language generation in the limit, a model recently introduced by Kleinberg and Mullainathan (2024), under the constraint of differential privacy. We consider the \emph{continual release} model, where a generator must eventually output a stream of valid strings while protecting the privacy of the entire input sequence. Our first main result is that for countable collections of languages, privacy comes at no qualitative cost: we provide an $\varepsilon$-differentially-private algorithm that generates in the limit from \emph{any} countable collection. This stands in contrast to many learning settings where privacy renders learnability impossible. However, privacy does impose a quantitative cost: there are finite collections of size $k$ for which uniform private generation requires $\Omega(k/\varepsilon)$ samples, whereas just one sample suffices non-privately. We then turn to the harder problem of language \emph{identification} in the limit. Here, we show that privacy creates fundamental barriers. We prove that no $\varepsilon$-DP algorithm can identify a collection containing two languages with an infinite intersection and a finite set difference, a condition far stronger than the classical non-private characterization of identification. Next, we turn to the \emph{stochastic} setting where the sample strings are sampled i.i.d. from a distribution (instead of being generated by an adversary). Here, we show that private identification is possible if and only if the collection is identifiable in the adversarial model. Together, our results establish new dimensions along which generation and identification differ and, for identification, a separation between adversarial and stochastic settings induced by privacy constraints.}
}


@InProceedings{pmlr-v336-menart26a,
  title     = {On the Gradient Complexity of Private Optimization with Private Oracles},
  author    = {Menart, Michael and Nikolov, Aleksandar},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {5115--5158},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/menart26a/menart26a.pdf},
  url       = {https://proceedings.mlr.press/v336/menart26a.html},
  abstract  = {We study the running time, in terms of first order oracle queries, of differentially private empirical/population risk minimization of Lipschitz convex losses. We primarily consider the setting where the loss is non-smooth and the optimizer interacts with a private proxy oracle, which sends only private messages about a minibatch of gradients. In this setting, we show that expected running time $\Omega(\min\lbrace\frac{\sqrt{d}}{\alpha^2}, \frac{d}{\log(1/\alpha)}\rbrace)$ is necessary to achieve $\alpha$ excess risk on problems of dimension $d$ when $d \geq 1/\alpha^2$. Upper bounds via DP-SGD show these results are tight when $d>\tilde{\Omega}(1/\alpha^4)$. In fact, the lower bound nearly matches the best known upper bound for general private optimizers in this regime. A consequence of our results is that, in high dimensions, the ubiquitous DP-SGD algorithm necessarily suffers a dimension dependent runtime slowdown and further that DP-SGD is optimal among the subclass of DP optimizers that use private oracles. We further show our lower bound can be strengthened to $\Omega(\min\lbrace\frac{d}{\bar{m}\alpha^2}, \frac{d}{\log(1/\alpha)} \rbrace)$ for algorithms which use minibatches of size at most $\bar{m} < \sqrt{d}$. We next consider smooth losses, where we relax the private oracle assumption and give lower bounds under only the condition that the optimizer is private. Here, we lower bound the expected number of first order oracle calls by $\tilde{\Omega}\big(\frac{\sqrt{d}}{\alpha} + \min\lbrace\frac{1}{\alpha^2}, n\rbrace\big)$, where $n$ is the size of the dataset. Modifications to existing algorithms show this bound is nearly tight. To our knowledge, ours are the first oracle complexity lower bounds to leverage differential privacy beyond the local privacy model. Compared to non-private lower bounds, our results show that differentially private optimizers pay a dimension dependent runtime penalty. Finally, as a natural extension of our proof technique, we show lower bounds in the non-smooth setting for optimizers interacting with information limited oracles. If the proxy oracle transmits at most $\Gamma$-bits of information about the gradients in the minibatch, then $\Omega\big(\min\lbrace\frac{d}{\alpha^2\Gamma}, \frac{d}{\log(1/\alpha)}\rbrace\big)$ oracle calls are needed. This result shows fundamental limitations of gradient quantization techniques in optimization.}
}


% "Langevin" is brace-protected in the title so that sentence-casing
% bibliography styles keep the proper noun capitalized.
@InProceedings{pmlr-v336-menon26a,
  title     = {On the implicit regularization of {Langevin} dynamics with projected noise},
  author    = {Menon, Govind and Stromme, Austin and Vacher, Adrien},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {5159--5187},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/menon26a/menon26a.pdf},
  url       = {https://proceedings.mlr.press/v336/menon26a.html},
  abstract  = {We study Langevin dynamics with noise projected onto the directions orthogonal to an isometric group action. This mathematical model is introduced to shed new light on the effects of symmetry on stochastic gradient descent for over-parametrized models. Our main result identifies a novel form of implicit regularization: when the initial and target density are both invariant under the group action, Langevin dynamics with projected noise is equivalent in law to Langevin dynamics with isotropic diffusion but with an additional drift term proportional to the negative log volume of the group orbit. We prove this result by constructing a coupling of the two processes via a third process on the group itself, and identify the additional drift as the mean curvature of the orbits.}
}


@InProceedings{pmlr-v336-moitra26a,
  author    = {Moitra, Ankur and Risteski, Andrej and Rohatgi, Dhruv},
  title     = {Steering diffusion models with quadratic rewards: a fine-grained analysis},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  pages     = {5188--5209},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  url       = {https://proceedings.mlr.press/v336/moitra26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/moitra26a/moitra26a.pdf},
  abstract  = { Inference-time algorithms are an emerging paradigm in which pre-trained models are used as subroutines to solve downstream tasks. Such algorithms have been proposed for tasks ranging from inverse problems and guided image generation to reasoning. However, the methods currently deployed in practice are heuristics with a variety of failure modes—and we have very little understanding of when these heuristics can be efficiently improved.    In this paper, we consider the task of sampling from a reward-tilted diffusion model—that is, sampling from $p^{\star}(x) \propto p(x) \exp(r(x))$—given a reward function $r$ and pre-trained diffusion oracle for $p$. We provide a fine-grained analysis of the computational tractability of this task for quadratic rewards $r(x) = x^\top A x + b^\top x$. We show that linear-reward tilts are always efficiently sampleable—a simple result that seems to have gone unnoticed in the literature. We use this as a building block, along with a conceptually new ingredient—the Hubbard-Stratonovich transform—to provide an efficient algorithm for sampling from low-rank positive-definite quadratic tilts, i.e. $r(x) = x^\top A x$ where $A$ is positive-definite and of rank $O(1)$. For negative-definite tilts, i.e. $r(x) = - x^\top A x$ where $A$ is positive-definite, we prove that the problem is intractable even if $A$ is of rank 1 (albeit with exponentially-large entries). }
}


% The acronyms "GD" and "SGD" are brace-protected in the title so that
% sentence-casing bibliography styles do not lowercase them.
@InProceedings{pmlr-v336-mulayoff26a,
  title     = {On the Stability of Nonlinear Dynamics in {GD} and {SGD}: Beyond Quadratic Potentials},
  author    = {Mulayoff, Rotem and Stich, Sebastian U.},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {5210--5243},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/mulayoff26a/mulayoff26a.pdf},
  url       = {https://proceedings.mlr.press/v336/mulayoff26a.html},
  abstract  = {The dynamical stability of the iterates during training plays a key role in determining the minima obtained by optimization algorithms. For example, stable solutions of gradient descent (GD) correspond to flat minima, which have been associated with favorable features. While prior work often relies on linearization to determine stability, it remains unclear whether linearized dynamics faithfully capture the full nonlinear behavior. Recent work has shown that GD may stably oscillate near a linearly unstable minimum and still converge once the step size decays, indicating that linear analysis can be misleading. In this work, we explicitly study the effect of nonlinear terms. Specifically, we derive an exact criterion for stable oscillations of GD near minima in the multivariate setting. Our condition depends on high-order derivatives, generalizing existing results. Extending the analysis to stochastic gradient descent (SGD), we show that nonlinear dynamics can diverge in expectation even if a single batch is unstable. This implies that stability can be dictated by a single batch that oscillates unstably, rather than an average effect, as linear analysis suggests. Finally, we prove that if all batches are linearly stable, the nonlinear dynamics of SGD are stable in expectation.}
}


@InProceedings{pmlr-v336-nagler26a,
  author    = {Nagler, Thomas and Langer, Sophie},
  title     = {Optimal Neural Network Approximation of Smooth Compositional Functions on Sets with Low Intrinsic Dimension},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  pages     = {5244--5272},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  url       = {https://proceedings.mlr.press/v336/nagler26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/nagler26a/nagler26a.pdf},
  abstract  = {We study approximation and statistical learning properties of deep ReLU networks under structural assumptions that mitigate the curse of dimensionality. We prove minimax-optimal uniform approximation rates for $s$-Hölder smooth functions defined on sets with low Minkowski dimension using fully connected networks with flexible width and depth, improving existing results by logarithmic factors even in classical full-dimensional settings. A key technical ingredient is a new memorization result for deep ReLU networks that enables efficient point fitting with dense architectures. We further introduce a class of compositional models in which each component function is smooth and acts on a domain of low intrinsic dimension. This framework unifies two common assumptions in the statistical learning literature, structural constraints on the target function and low dimensionality of the covariates, within a single model. We show that deep networks can approximate such functions at rates determined by the most difficult function in the composition. As an application, we derive improved convergence rates for empirical risk minimization in nonparametric regression that adapt to smoothness, compositional structure, and intrinsic dimensionality.}
}


@InProceedings{pmlr-v336-nerem26a,
  author    = {Nerem, Robert R. and Chen, Samantha and Dasgupta, Sanjoy and Wang, Yusu},
  title     = {Graph neural networks extrapolate out-of-distribution for shortest paths},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  pages     = {5273--5331},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  url       = {https://proceedings.mlr.press/v336/nerem26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/nerem26a/nerem26a.pdf},
  abstract  = {Neural networks (NNs), despite their success and wide adoption, still struggle to extrapolate out-of-distribution (OOD), i.e., to inputs that are not well-represented by their training dataset. Addressing the OOD generalization gap is crucial when models are deployed in environments significantly different from the training set, such as applying Graph Neural Networks (GNNs) trained on small graphs to large, real-world graphs. One promising approach for achieving robust OOD generalization is the framework of neural algorithmic alignment, which incorporates ideas from classical algorithms by designing neural architectures that resemble specific algorithmic paradigms (e.g. dynamic programming). The hope is that trained models of this form would have superior OOD capabilities, in much the same way that classical algorithms work for all instances. We employ sparsity regularization as a tool for analyzing the role of algorithmic alignment in achieving OOD generalization, focusing on graph neural networks (GNNs) applied to the canonical shortest path problem. We prove that if a trained GNN minimizes a sparsity-regularized loss over a small set of shortest-path instances, then the GNN implements $K$ steps of the Bellman-Ford algorithm for shortest paths. In fact, if a trained GNN minimizes this loss within an error of $\epsilon$, it computes $K$-step shortest path distances up to error $O(\epsilon)$. Our empirical results support our theory by showing that NNs trained by gradient descent are able to minimize this loss and extrapolate in practice.}
}


@InProceedings{pmlr-v336-peng26a,
  author    = {Peng, Pan and Wang, Yuyang and Yang, Joy Qiping and Yang, Yichun},
  title     = {An Exponential Lower Bound for Spectral Density Estimation on Unweighted Graphs},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  pages     = {5332--5357},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  url       = {https://proceedings.mlr.press/v336/peng26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/peng26a/peng26a.pdf},
  abstract  = {We study lower bounds for estimating the spectral density of the normalized adjacency matrix of a graph. Previously, Cohen-Steiner et al. [KDD 2018] proposed an algorithm for $\varepsilon$-approximate spectral density estimation in the Wasserstein-1 distance, using $2^{O(1/\varepsilon)}$ random walks initiated from uniformly random nodes in the graph. Later, Jin et al. [COLT 2023] established a nearly matching exponential lower bound for \emph{weighted} graphs, assuming the algorithm has access to samples from random walks started at random nodes. It was left open whether this lower bound could be extended to \emph{unweighted} graphs. In this paper, we answer this question in the affirmative by proving an exponential lower bound for unweighted graphs. Specifically, we show that no algorithm can compute an $\varepsilon$-approximation to the spectrum of a normalized graph adjacency matrix with constant success probability, even when given the full transcripts of $2^{\Omega(1/\varepsilon^{1/6})}$ random walks, each of length $2^{\Omega(1/\varepsilon^{1/6})}$, started from uniformly random nodes.}
}


@InProceedings{pmlr-v336-garg26a,
  title     = {How Many Features Can a Language Model Store Under the Linear Representation Hypothesis?},
  author    = {Garg, Nikhil and Kleinberg, Jon and Peng, Kenny},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {5358--5376},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/garg26a/garg26a.pdf},
  url       = {https://proceedings.mlr.press/v336/garg26a.html},
  abstract  = {We introduce a mathematical framework for the linear representation hypothesis (LRH), which asserts that intermediate layers of language models store features linearly. We separate the hypothesis into two claims: linear \textit{representation} (features are linearly embedded in neuron activations) and linear \textit{accessibility} (features can be linearly decoded). We then ask: How many neurons $d$ suffice to both linearly represent and linearly access $m$ features? Classical results in compressed sensing imply that for $k$-sparse inputs, $d = O(k\log (m/k))$ suffices if we allow non-linear decoding algorithms (Candes and Tao, 2006; Candes et al., 2006; Donoho, 2006). However, the additional requirement of linear decoding takes the problem out of the classical compressed sensing, into \textit{linear} compressed sensing. Our main theoretical result establishes nearly-matching upper and lower bounds for linear compressed sensing. We prove that $d = \Omega_\epsilon(\frac{k^2}{\log k}\log (m/k))$ is required while $d = O_\epsilon(k^2\log m)$ suffices. The lower bound establishes a quantitative gap between classical and linear compressed sensing, illustrating how linear accessibility is a meaningfully stronger hypothesis than linear representation alone. The upper bound confirms that neurons can store an exponential number of features under the LRH, giving theoretical evidence for the “superposition hypothesis” (Elhage et al., 2022). The upper bound proof uses standard random constructions of matrices with approximately orthogonal columns. The lower bound proof uses rank bounds for near-identity matrices (Alon, 2003) together with Turán’s theorem (bounding the number of edges in clique-free graphs). We also show how our results do and do not constrain the geometry of feature representations and extend our results to allow decoders with an activation function and bias.}
}


@InProceedings{pmlr-v336-prairie26a,
  author    = {Prairie, Addison and Tan, Li-Yang},
  title     = {Boosting with List-Decodable Codes},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  pages     = {5377--5396},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  url       = {https://proceedings.mlr.press/v336/prairie26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/prairie26a/prairie26a.pdf},
  abstract  = {Boosting is a fundamental technique for generically improving the accuracy of learning algorithms (Schapire 1989). Existing boosting algorithms construct a strong learner  using $O(\log(\frac{1}{\epsilon})/\gamma^2)$ calls to a $\gamma$-advantage weak learner, and this round complexity is known to be optimal for generic boosters that succeed on all concept classes (Freund 1995). We show that this lower bound can be circumvented for concept classes that satisfy a mild closure property. Specifically, we present a new boosting algorithm that, for any class $\mathcal{F}$ closed under $O(\log \frac{1}{\gamma})$-\textsc{Xor}, strong learns $\mathcal{F}$ using $O(\log \frac{1}{\epsilon})$ calls to a $\gamma$-advantage weak learner and a single batch of $\Tilde{O}(\log(\frac{1}{\epsilon})/\gamma^2)$ additional  samples. Our algorithm arises from a new and simple connection between boosting and list-decodable codes. Viewing the target function as a message, we run the weak learner on its encoding and view the resulting weak hypothesis as a corrupted codeword. Feeding this corrupted codeword to a list decoder, we obtain a small list of candidate hypotheses, at least one of which is a strong hypothesis for the original function. Using additional samples, we identify and output this strong hypothesis.}
}


% Title: the non-ASCII letter is written as a LaTeX escape, matching how this
% file encodes accented author names, and "Q" / "Hoelder" (proper noun) are
% brace-protected so sentence-casing styles keep their capitals.
@InProceedings{pmlr-v336-qi26a,
  title     = {Deep {Q}-Learning on {H{\"o}lder} Spaces},
  author    = {Qi, Qian},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {5397--5398},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/qi26a/qi26a.pdf},
  url       = {https://proceedings.mlr.press/v336/qi26a.html},
  abstract  = {We study the operator-theoretic core of Q-learning in continuous-time stochastic control with continuous states and actions. In value-based reinforcement learning, each Q-learning or DQN update is built from a Bellman optimality target; our analysis isolates this target in a uniformly elliptic diffusion setting and studies its regularity and approximation complexity. Under Hölder-regular coefficients, we show that a Bellman update maps bounded inputs into an anisotropic regularity class: it smooths the state variable through parabolic regularization while preserving only Lipschitz dependence on the action variable. This identifies a compact family of Bellman iterates and motivates tensor-product neural-operator approximators adapted to the mixed regularity of the problem. We derive explicit approximation and resource bounds, including a stiffness–complexity trade-off as the time step $\delta \to 0$. The result is an operator-level theory for the Bellman targets underlying Q-learning in continuous stochastic control, rather than a convergence theorem for practical sampled DQN training.}
}


@InProceedings{pmlr-v336-qin26a,
  author    = {Qin, Hao and Zhang, Chicheng},
  title     = {Taming the Monster Every Context: Complexity Measure and Unified Framework for Offline-Oracle Efficient Contextual Bandits},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  pages     = {5399--5464},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  url       = {https://proceedings.mlr.press/v336/qin26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/qin26a/qin26a.pdf},
  abstract  = {We propose an algorithmic framework, Offline Estimation to Decisions (OE2D), that reduces contextual bandit learning with general reward function approximation to offline regression. The framework allows near-optimal regret for contextual bandits with large action spaces with $O(\log T)$ calls to an offline regression oracle over $T$ rounds, and makes $O(\log\log T)$ calls when $T$ is known. The design of OE2D generalizes Falcon and its linear-reward version in that it chooses an action distribution that we term the “exploitative F-design” that simultaneously guarantees low regret and good coverage that trades off exploration and exploitation. Central to our regret analysis is a new complexity measure, the Decision-Offline Estimation Coefficient (DOEC), which we show is bounded in the bounded Eluder dimension per-context and smoothed regret settings. We also establish a relationship between DOEC and the Decision Estimation Coefficient (DEC), bridging the design principles of offline- and online-oracle efficient contextual bandit algorithms for the first time.}
}


@InProceedings{pmlr-v336-qiu26a,
  author    = {Qiu, Hao and Zhang, Mengxiao and Cesa-Bianchi, Nicol{\`o}},
  title     = {Near-Optimal Regret for Distributed Adversarial Bandits: A Black-Box Approach},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  pages     = {5465--5517},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  url       = {https://proceedings.mlr.press/v336/qiu26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/qiu26a/qiu26a.pdf},
  abstract  = {We study distributed adversarial bandits, where $N$ agents cooperate to minimize the global average loss while observing only their own local losses. We show that the minimax regret for this problem is $\widetilde{\Theta}\Big(\sqrt{\left(\rho^{-1/2} + \frac{K}{N}\right)T}\Big)$, where $T$ is the horizon, $K$ is the number of actions, and $\rho$ is the spectral gap of the communication matrix. Our algorithm, based on a novel black-box reduction to bandits with delayed feedback, requires agents to communicate only through gossip. It achieves an upper bound that significantly improves over the previous best bound $\widetilde{\mathcal{O}}\left(\rho^{-1/3}(KT)^{2/3}\right)$ of Yi et al. We complement this result with a matching lower bound, showing that the problem’s difficulty decomposes into a communication cost $\rho^{-1/4}\sqrt{T}$ and a bandit cost $\sqrt{KT/N}$. We further demonstrate the versatility of our approach by deriving first-order and best-of-both-worlds bounds in the distributed adversarial setting. Finally, we extend our framework to distributed linear bandits in $\mathbb{R}^d$, obtaining a regret bound of $\widetilde{\mathcal{O}}\Big(\sqrt{\left(\rho^{-1/2} + \frac{1}{N}\right)dT}\Big)$, achieved with only $O(d)$ communication cost per agent and per round via a volumetric spanner.}
}


% The roman numeral "I" is brace-protected in the title so that
% sentence-casing bibliography styles do not lowercase it.
@InProceedings{pmlr-v336-rajaraman26a,
  title     = {Learning to Reason with Curriculum {I}: Provable Benefits of Autocurriculum},
  author    = {Rajaraman, Nived and Huang, Audrey and Dudik, Miro and Schapire, Rob and Foster, Dylan and Krishnamurthy, Akshay},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {5518--5555},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/rajaraman26a/rajaraman26a.pdf},
  url       = {https://proceedings.mlr.press/v336/rajaraman26a.html},
  abstract  = {Chain-of-thought reasoning, where language models expend additional computation by producing thinking tokens prior to final responses, has driven significant advances in model capabilities. However, training these reasoning models is extremely costly in terms of both data and compute, as it involves collecting long traces of reasoning behavior from humans or synthetic generators and further post-training the model via reinforcement learning. Are these costs fundamental, or can they be reduced through better algorithmic design? We show that \textit{autocurriculum}—where the model uses its own performance to decide which problems to focus training on—provably improves upon standard training recipes for both supervised fine-tuning (SFT) and reinforcement learning (RL). For SFT, we show that autocurriculum requires \textit{exponentially} fewer reasoning demonstrations than non-adaptive fine-tuning (Joshi et al., 2025), by focusing teacher supervision on prompts where the current model struggles. For RL fine-tuning, autocurriculum \textit{decouples} the computational cost from the quality of the reference model, reducing the latter to a burn-in cost that is nearly independent of the target accuracy. These improvements arise purely from adaptive data selection, drawing on classical techniques from boosting (Freund and Schapire, 1997) and learning from counterexamples (Angluin, 1987), and requiring no assumption on the distribution or difficulty of prompts.}
}


@InProceedings{pmlr-v336-ren26a,
  author    = {Ren, Yunwei and Dandi, Yatin and Krzakala, Florent and Lee, Jason D.},
  title     = {Provable Learning of Random Hierarchy Models and Hierarchical Shallow-to-Deep Chaining},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  pages     = {5556--5597},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  url       = {https://proceedings.mlr.press/v336/ren26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/ren26a/ren26a.pdf},
  abstract  = { The empirical success of deep learning is often attributed to deep networks’ ability to exploit hierarchical structure in data, constructing increasingly complex features across layers. Yet despite substantial progress in deep learning theory, most optimization results still focus on networks with only two or three layers, leaving the theoretical understanding of hierarchical learning in genuinely deep models limited. This leads to a natural question: can we prove that deep networks, trained with gradient-based methods and standard input-label pairs, can efficiently exploit hierarchical structure? In this work, we consider Random Hierarchy Models — a hierarchical context-free grammar introduced by Cagnetta et al. (2024) and conjectured to separate deep and shallow networks. We prove that, under mild conditions, a deep convolutional network can be efficiently trained to learn this function class. Our proof builds on a general observation: if intermediate layers can receive clean signal from the labels and the relevant features are weakly identifiable, then layerwise training each individual layer suffices to hierarchically learn the target function. }
}


@InProceedings{pmlr-v336-robertson26a,
  title     = {Continuous time policy evaluation is easier with noisy dynamics},
  author    = {Robertson, Samuel and Newton, Thomas and Szepesv{\'a}ri, Csaba},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {5598--5624},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/robertson26a/robertson26a.pdf},
  url       = {https://proceedings.mlr.press/v336/robertson26a.html},
  abstract  = {In this work, we study continuous-time stochastic control problems governed by controlled stochastic differential equations with unknown dynamics. We focus on the discounted infinite-horizon setting and restrict attention to feedback controllers. In general, the continuous time value function is the solution to the nonlinear Hamilton-Jacobi-Bellman (HJB) equation, which typically only admits viscosity solutions with no regularity. Our first contribution is to establish sharp regularity results for value functions using elliptic partial differential equation theory. Under mild growth and regularity assumptions on the controlled dynamics and a uniform ellipticity condition on the diffusion, we show that the value function belongs to a Matérn reproducing kernel Hilbert space (RKHS) that is strictly smoother than the running reward. Building on this analysis, we develop a kernel-based policy evaluation method that estimates value functions directly from online trajectory rollouts of a fixed policy. The resulting algorithm exploits the RKHS structure with a kernel ridge regression technique, reducing the infinite-dimensional learning problem to a finite-dimensional one. Our results establish a direct connection between stochastic control, elliptic regularity theory, and kernel methods, and provide a foundation for online policy evaluation and policy improvement in continuous time.}
}


@InProceedings{pmlr-v336-eaton26a,
  author    = {Eaton, Eric and Goel, Surbhi and Hussing, Marcel and Kearns, Michael and Roth, Aaron and Sengupta, Sikata Bela and Sorrell, Jessica},
  title     = {Model Agreement via Anchoring},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  pages     = {5625--5661},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  url       = {https://proceedings.mlr.press/v336/eaton26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/eaton26a/eaton26a.pdf},
  abstract  = {Numerous lines of work aim to control \emph{model disagreement} — the extent to which two machine learning models disagree in their predictions. We adopt a simple and standard notion of model disagreement in real-valued prediction problems, namely the expected squared difference in predictions between two models trained on independent samples, without any coordination of the training processes. We would like to be able to drive disagreement to zero with some natural parameter(s) of the training procedure using analyses that can be  applied to existing training methodologies. We develop a simple general technique for proving bounds on independent model disagreement  based on \emph{anchoring} to the average of two models within the analysis. We then apply this technique to prove disagreement bounds for four commonly used machine learning algorithms: (1) stacked aggregation over an arbitrary model class (where disagreement is driven to 0 with the number of models $k$ being stacked) (2) gradient boosting (where disagreement is driven to 0 with the number of iterations $k$) (3) neural network training with architecture search (where disagreement is driven to 0 with the size $n$ of the architecture being optimized over) and (4) regression tree training over all  regression trees of fixed depth (where disagreement is driven to 0 with the depth $d$ of the tree architecture).  For clarity, we work out our initial bounds in the setting of one-dimensional regression with squared error loss — but then show that all of our results generalize to multi-dimensional regression with any strongly convex loss. }
}


@InProceedings{pmlr-v336-rubinstein26a,
  author    = {Rubinstein, Ittai and Ge, Chris and Hopkins, Samuel B.},
  title     = {Private Linear Regression via a Down-Sensitivity to Privacy Reduction},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  series    = {Proceedings of Machine Learning Research},
  volume    = {336},
  pages     = {5662--5720},
  year      = {2026},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  url       = {https://proceedings.mlr.press/v336/rubinstein26a.html},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/rubinstein26a/rubinstein26a.pdf},
  abstract  = {We present a sample- and time-efficient $(\varepsilon,\delta)$-differentially private (DP) algorithm for $d$-dimensional linear regression with a sample complexity of \[ n_{\mathrm{STAR}} = \widetilde{O}\left(\frac{d}{\alpha^2} + \frac{d \log(1/\delta)}{\alpha \varepsilon} + \frac{d \log(1/\delta)}{\varepsilon}\right) + o(d). \]{This} improves upon prior polynomial-time algorithms whose sample complexity either depends on the condition number of the design matrix $\kappa$ (for DP-SGD with gradient clipping), scales quadratically with the dimension (for Sum-of-Squares algorithms) or with the inverse of the privacy parameter (for outlier removal algorithms such as insufficient statistics perturbation or ISSP), \[ n_{\mathrm{SoS}} = \widetilde{\Omega}\left(\frac{d^2}{\alpha^2}\right), \quad n_{\mathrm{DP\mbox{-}SGD}} = \widetilde{\Omega}\left(\frac{d \sqrt{\kappa}}{\varepsilon}\right), \quad n_{\mathrm{ISSP}} = \widetilde{\Omega}\left(\frac{d}{\varepsilon^2}\right). \]{Our} algorithm is based on a novel \emph{subsample-test-aggregate} (STA) approach for ensuring privacy given only bounded \emph{down-sensitivity} – robustness to removal, but not addition, of a small number of samples. The intuition that down-sensitivity should be related to privacy is not new, but STA formalizes this by providing an \emph{efficient black-box reduction from down-sensitivity to privacy} which we expect to be applicable beyond the setting of linear regression.}
}


% Title: "ReLU" is brace-protected so sentence-casing bibliography styles keep its capitalisation.
@InProceedings{pmlr-v336-safran26a,
  title = 	 {A Depth Hierarchy for Computing the Maximum in {ReLU} Networks via Extremal Graph Theory},
  author =       {Safran, Itay},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {5721--5742},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/safran26a/safran26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/safran26a.html},
  abstract = 	 {We consider the problem of exact computation of the maximum function over $d$ real inputs using ReLU neural networks. We prove a depth hierarchy, wherein width $\Omega\big(d^{1+\frac{1}{2^{k-2}-1}}\big)$ is necessary to represent the maximum for any depth $3\le k\le \log_2(\log_2(d))$. This is the first unconditional super-linear lower bound for this fundamental operator at depths $k\ge3$, and it holds even if the depth scales with $d$. Our proof technique is based on a combinatorial argument and associates the non-differentiable ridges of the maximum with cliques in a graph induced by the first hidden layer of the computing network, utilizing Turán’s theorem from extremal graph theory to show that a sufficiently narrow network cannot capture the non-linearities of the maximum. This suggests that despite its simple nature, the maximum function possesses an inherent complexity that stems from the geometric structure of its non-differentiable hyperplanes, and provides a novel approach for proving lower bounds for deep neural networks.}
}


@InProceedings{pmlr-v336-schliserman26a,
  title     = {Convergence of Continual Learning in Homogeneous Deep Networks},
  author    = {Schliserman, Matan and Buzaglo, Gon and Evron, Itay and Soudry, Daniel},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {5743--5784},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/schliserman26a/schliserman26a.pdf},
  url       = {https://proceedings.mlr.press/v336/schliserman26a.html},
  abstract  = {We characterize weakly regularized continual classification in homogeneous models as sequential projections onto task margin sets. This result generalizes prior analyses restricted to either stationary (single-task) deep models or continual linear models.  We show that global convergence generally fails, even for simple models linear in data but nonlinear in parameters.  Nevertheless, by leveraging results from nonconvex projection theory, we identify regularity properties of homogeneous deep networks that guarantee local linear convergence under random and cyclic task sequences. Finally, we extend our analysis to continual regression, unifying the framework for homogeneous models.}
}


@InProceedings{pmlr-v336-schlisselberg26a,
  title     = {The Hidden Cost of Approximation in Online Mirror Descent},
  author    = {Schlisselberg, Ofir and Sherman, Uri and Koren, Tomer and Mansour, Yishay},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {5785--5827},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/schlisselberg26a/schlisselberg26a.pdf},
  url       = {https://proceedings.mlr.press/v336/schlisselberg26a.html},
  abstract  = {Online mirror descent (OMD) is a fundamental algorithmic paradigm that underlies many algorithms in optimization, machine learning and sequential decision-making. The OMD iterates are defined as solutions to optimization subproblems which, oftentimes, can be solved only approximately, leading to an \emph{inexact} version of the algorithm. Nonetheless, existing OMD analyses typically assume an idealized error free setting, thereby limiting our understanding of performance guarantees that should be expected in practice. In this work we initiate a systematic study into inexact OMD, and uncover an intricate relation between regularizer smoothness and robustness to approximation errors. When the regularizer is uniformly smooth, we establish a tight bound on the excess regret due to errors. Then, for barrier regularizers over the simplex and its subsets, we identify a sharp separation: negative entropy requires exponentially small errors to avoid linear regret, whereas log-barrier and Tsallis regularizers remain robust even when the errors are only polynomial. Finally, we show that when the losses are stochastic and the domain is the simplex, negative entropy regains robustness - but this property does not extend to all subsets, where exponentially small errors are again necessary to avoid suboptimal regret.}
}


@InProceedings{pmlr-v336-seyfried26a,
  title     = {Optimal Sample Complexity Lower Bounds on Conditional Independence Testing},
  author    = {Seyfried, Jan and Mishra, Neelkanth and Sen, Sayantan and Tomamichel, Marco},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {5828--5873},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/seyfried26a/seyfried26a.pdf},
  url       = {https://proceedings.mlr.press/v336/seyfried26a.html},
  abstract  = {We study the sample complexity of conditional independence testing. In this problem, given i.i.d. samples from a discrete distribution $P_{ABC}$, the goal is to distinguish whether $A$ and $C$ are conditionally independent with respect to $B$, i.e., $P_{ABC}=P_{A|B}P_BP_{C|B}$, or whether $A$ and $C$ are conditionally dependent, $\Delta(P_{ABC},P_{A|B}P_BP_{C|B})\geq \varepsilon$ for some fixed threshold $\varepsilon$ and distance measure $\Delta$. We are interested in the cases where $\Delta$ is either the $\ell_1$ distance or the KL-divergence. The study for the case of $\ell_1$ distance was initiated by (Canonne et al., STOC 2018), and the KL-divergence was recently studied by (Seyfried et al., COLT 2025). Both works design algorithms whose sample complexities scale sublinearly in the dimensions of the subsystems, and showed tight lower bounds in some parameter regimes. While Canonne et al. derived partial lower bounds for the remaining regimes as well, the problem of fully resolving the sample complexity in all parameters remained open. In this work, we settle these open questions and prove optimal sample complexity lower bounds for both of these problems, thereby completely settling the sample complexities up to polylogarithmic factors.}
}


@InProceedings{pmlr-v336-silber26a,
  title     = {Testing for a Hidden Geometry in Random Graphs},
  author    = {Silber, Amit and Oren-Loberman, Mor and Huleihel, Wasim},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {5874--5927},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/silber26a/silber26a.pdf},
  url       = {https://proceedings.mlr.press/v336/silber26a.html},
  abstract  = {In this work, we investigate the fundamental problem of detecting a faint geometric signal hidden within an otherwise random graph. We formulate this task as a hypothesis testing problem: under the null hypothesis, the observed graph is an Erdős–Rényi random graph $\mathcal{G}(n,q)$ with edge density $q\in(0,1)$; under the alternative, a high-dimensional geometric structure is clandestinely embedded. Specifically, a random geometric graph $\mathcal{G}(k,q,d)$ on $k\le n$ vertices is planted inside $\mathcal{G}(n,q)$, where each of the $k$ vertices corresponds to an independent random point drawn uniformly from the unit sphere $\mathbb{S}^{d-1}$, and edges are formed according to latent proximity, resulting in the same edge probability $q$. Our objective is to characterize the limits of detectability of this hidden geometry, from both statistical and computational perspectives. We derive sharp information-theoretic lower bounds that characterize the regimes in which detection is fundamentally impossible, expressed explicitly in terms of the problem parameters. Complementing these impossibility results, we propose and analyze several algorithms that provably attain these limits whenever detection is feasible. We also explore the algorithmic landscape of the problem and investigate which regimes admit efficient, polynomial-time testing procedures. As in many other structured high-dimensional inference problems, our model exhibits a pronounced \emph{easy–hard–impossible} phase transition: there exist regimes in which detection is statistically possible yet computationally prohibitive, as well as regimes in which detection is impossible even with unbounded computational resources. As concrete evidence of this computational barrier, we show that the entire class of low-degree polynomial algorithms fails in the conjecturally hard regime, highlighting a sharp separation between statistical possibility and algorithmic feasibility.}
}


@InProceedings{pmlr-v336-smedira26a,
  title     = {Finite Sample Bounds for Learning with Score Matching},
  author    = {Smedira, Devin and Jayakumar, Abhijith and Misra, Sidhant and Vuffray, Marc and Lokhov, Andrey Y.},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {5928--5949},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/smedira26a/smedira26a.pdf},
  url       = {https://proceedings.mlr.press/v336/smedira26a.html},
  abstract  = {Learning of continuous exponential family distributions with unbounded support remains an important area of research for both theory and applications in high-dimensional statistics. In recent years, score matching has become a widely used method for learning exponential families with continuous variables due to its computational ease when compared against maximum likelihood estimation. However, theoretical understanding of the statistical properties of score matching is still lacking. In this work, we provide a non-asymptotic sample complexity analysis for learning the structure of exponential families of polynomials with score matching. The derived sample bounds show a polynomial dependence on the model dimension. These bounds are the first of its kind, as all prior work has shown only asymptotic bounds on the sample complexity. }
}


@InProceedings{pmlr-v336-soleymani26a,
  title     = {Efficient Learning and Symmetry Discovery under Exact Invariances},
  author    = {Soleymani, Ashkan and Tahmasebi, Behrooz and Jaillet, Patrick and Jegelka, Stefanie},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {5950--5979},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/soleymani26a/soleymani26a.pdf},
  url       = {https://proceedings.mlr.press/v336/soleymani26a.html},
  abstract  = {Learning with group invariances is central to many scientific and geometric learning problems, yet its computational foundations remain poorly understood. Even for classical supervised regression settings, it has been unclear whether one can efficiently compute a regression function that is \emph{exactly invariant} to a given group action. Recent work showed that exact invariance can be enforced in polynomial time when the underlying group is finite and known, but left open the cases of infinite groups and unknown symmetries. In this paper, we resolve both challenges. First, we present the first polynomial-time algorithm for learning with exact group invariances that applies uniformly to finite and infinite groups. The runtime is polynomial in the data dimension and sample size, and independent of the group, while achieving strong generalization guarantees. This provides a computational explanation for the empirical success of invariant and equivariant methods in geometric machine learning and partially answers a recent open question in the literature. Second, we study learning in the \emph{symmetry discovery} setting, where the invariance group is unknown. Focusing on the subgroup lattice of a finite group, we show that exact symmetries can be identified from data and exploited for learning in polynomial time. For regression over finite-dimensional feature spaces, our algorithm provably recovers the underlying symmetry, matches the minimax-optimal sample complexity of the known-symmetry setting, and runs in time polynomial in the data dimension and sample size. Our analysis relies on tools from random Cayley graphs and expander theory, which may be of independent interest. }
}


% Title: the symbol "N" in Best-of-N is brace-protected so sentence-casing styles do not lowercase it.
@InProceedings{pmlr-v336-sriraman26a,
  title = 	 {Revisiting the (Sub)Optimality of Best-of-{N} for Inference-Time Alignment},
  author =       {Sriraman, Ved and Block, Adam},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {5980--6028},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/sriraman26a/sriraman26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/sriraman26a.html},
  abstract = 	 {Best-of-$N$ (BoN) sampling is a widely used inference-time alignment method for language models, whereby $N$ candidate responses are sampled from a reference model and the one with the highest predicted reward according to a learned reward model is selected.  Despite its widespread practical use, recent theoretical work has suggested that it is statistically suboptimal and vulnerable to reward hacking, the process by which models exploit weaknesses in the learned reward model to achieve high estimated reward without genuinely improving performance. We revisit this question under assumptions that more closely reflect practice than that of prior work.  In particular, in contradistinction to earlier analyses that focused on expected true reward, which may not be meaningful in many practical settings, we investigate how inference-time alignment affects the \emph{win-rate}, a pairwise comparison-based metric more closely aligned with how reward models are trained and evaluated in practice.  We demonstrate that, under minimal conditions on the quality of the reference model and learned reward model, properly tuned BoN is both computationally and statistically optimal in achieving high win-rate, partially explaining its widespread practical success.  Because BoN remains susceptible to reward-hacking in this setting, we propose a simple and practical variant that provably eliminates reward-hacking while maintaining optimal statistical performance.  Finally, we show that prior approaches are provably \emph{suboptimal} when considering win-rate, highlighting the importance of choosing appropriate objectives when analyzing inference-time alignment methods.}
}


% Author: the umlaut is written as the brace group {\"u} so BibTeX treats it as one letter for sorting and labels.
@InProceedings{pmlr-v336-steinke26a,
  title = 	 {Privately Estimating Black-Box Statistics},
  author =       {Steinke, G{\"u}nter and Steinke, Thomas},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {6029--6074},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/steinke26a/steinke26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/steinke26a.html},
  abstract = 	 {Standard techniques for differentially private estimation, such as Laplace or Gaussian noise addition, require guaranteed bounds on the sensitivity of the estimator in question. But such sensitivity bounds are often large or simply unknown. Thus we seek differentially private methods that can be applied to arbitrary black-box functions. A handful of such techniques exist, but all are either inefficient in their use of data or require evaluating the function on exponentially many inputs. In this work we present a scheme that trades off between statistical efficiency (i.e., how much data is needed) and oracle efficiency (i.e., the number of evaluations). We also present lower bounds showing the near-optimality of our scheme.}
}


% Title: the acronym "MABs" is brace-protected so sentence-casing styles keep its capitals.
@InProceedings{pmlr-v336-stradi26a,
  title = 	 {Truly Adapting to Adversarial Constraints in Constrained {MABs}},
  author =       {Stradi, Francesco Emanuele and Kalupahana, Kalana and Castiglioni, Matteo and Marchesi, Alberto and Gatti, Nicola},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {6075--6113},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/stradi26a/stradi26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/stradi26a.html},
  abstract = 	 {We study the constrained variant of the multi-armed bandit (MAB) problem, in which the learner aims not only at minimizing the total loss incurred during the learning dynamic, but also at controlling the violation of multiple unknown constraints, under both full and bandit feedback. We consider a non-stationary environment that subsumes both stochastic and adversarial models and where, at each round, both losses and constraints are drawn from distributions that may change arbitrarily over time. In such a setting, it is provably not possible to guarantee both sublinear regret and sublinear violation. Accordingly, prior work has mainly focused either on settings with stochastic constraints or on relaxing the benchmark with fully adversarial constraints (e.g., via competitive ratios with respect to the optimum). We provide the first algorithms that achieve optimal rates of regret and positive constraint violation when the constraints are stochastic while the losses may vary arbitrarily, and that simultaneously yield guarantees that degrade smoothly with the degree of adversariality of the constraints. Specifically, under full feedback we propose an algorithm attaining $\widetilde{\mathcal{O}}(\sqrt{T}+C)$ regret and $\widetilde{\mathcal{O}}(\sqrt{T}+C)$ positive violation, where $C$ quantifies the amount of non-stationarity in the constraints. We then show how to extend these guarantees when only bandit feedback is available for the losses. Finally, when bandit feedback is available for the constraints, we design an algorithm achieving $\widetilde{\mathcal{O}}(\sqrt{T}+C)$ positive violation and $\widetilde{\mathcal{O}}(\sqrt{T}+C\sqrt{T})$ regret.}
}


% Title: the proper noun "Fourier" is brace-protected so sentence-casing styles keep its capital.
@InProceedings{pmlr-v336-tahmasebi26a,
  title = 	 {Data Augmentation: A {Fourier} Analysis Perspective},
  author =       {Tahmasebi, Behrooz and Weber, Melanie and Jegelka, Stefanie},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {6114--6155},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/tahmasebi26a/tahmasebi26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/tahmasebi26a.html},
  abstract = 	 {Data augmentation is a simple and model-agnostic approach for exploiting known invariances in learning problems. Given a group acting on the input space, one augments the training set with transformed copies of each sample.  Because it exploits symmetries without modifying the underlying learning algorithm, data augmentation can be applied broadly across learning methods. However, this universality comes at a computational cost: when the group is large, full group-sized augmentation quickly becomes computationally infeasible. This raises a fundamental question: \emph{Can partial data augmentation achieve the same statistical benefits as full augmentation in terms of generalization and sample complexity?} We develop a general framework for investigating this question using Fourier analysis and the representation theory of finite groups. We show that, for a broad class of classical learning problems, partial data augmentation based on a randomly sampled subset of group elements achieves the same minimax rates as full augmentation, up to an approximation error that vanishes as the subset size increases. Our results provide a theoretical explanation for why partial augmentation can retain the statistical benefits of full augmentation despite enforcing symmetry only approximately, and shed light on a recently raised question in learning with symmetries: whether statistically optimal learning under general group invariances can be achieved using computationally scalable methods. Moreover, we prove a complementary impossibility result: enforcing \emph{exact} invariance via data augmentation requires averaging over the entire group, and cannot be achieved by any strict subset when the hypothesis space is sufficiently expressive. Together, these results provide a unified perspective on full and partial data augmentation, as well as exact and approximate symmetry enforcement.}
}


% Title: stored in Title Case (it was ALL CAPS) so bibliography styles can apply their own casing.
@InProceedings{pmlr-v336-thurin26a,
  title = 	 {Convergence Rates for Distribution Matching with Sliced Optimal Transport},
  author =       {Thurin, Gauthier and Boyer, Claire and Nadjahi, Kimia},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {6156--6196},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/thurin26a/thurin26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/thurin26a.html},
  abstract = 	 { We study the slice-matching scheme, an efficient iterative method for distribution matching based on sliced optimal transport. We investigate convergence to the target distribution and derive quantitative non-asymptotic rates. To this end, we establish Lojasiewicz-type inequalities for the Sliced-Wasserstein objective. A key challenge is to control along the trajectory the constants in these inequalities. We show that this becomes tractable for Gaussian distributions.  Specifically,  eigenvalues are controlled when matching along random orthonormal bases at each iteration.  We complement our theory with numerical experiments and illustrate the predicted dependence on dimension and step-size, as well as the stabilizing effect of orthonormal-basis sampling.}
}


% Title: the "M" of M-Estimation is brace-protected so sentence-casing styles do not render it as "m-estimation".
@InProceedings{pmlr-v336-tinati26a,
  title = 	 {On the Asymptotics of Self-Supervised Pre-training: Two-Stage {M}-Estimation and Representation Symmetry},
  author =       {Tinati, Mohammad and Tu, Stephen},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {6197--6309},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/tinati26a/tinati26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/tinati26a.html},
  abstract = 	 {Self-supervised pre-training, where large corpora of unlabeled data are used to learn representations for downstream fine-tuning, has become a cornerstone of modern machine learning. While a growing body of work has begun to analyze this paradigm, existing bounds leave open the question of how sharp current rates are, and whether they accurately capture the complex interaction between pre-training and fine-tuning. In this paper, we address this gap by developing an asymptotic theory of pre-training via two-stage $M$-estimation. A key challenge is that the pre-training estimator is often identifiable only up to a group symmetry, a feature common in representation learning that requires careful treatment. We address this issue using tools from Riemannian geometry to study the \emph{intrinsic} parameters of the pre-training representation, which we link with the downstream predictor through a notion of \emph{orbit-invariance}, precisely characterizing the limiting distribution of the downstream test risk. We apply our results to spectral pre-training, factor models, and Gaussian mixture models, obtaining substantial improvements in problem-specific factors over prior art when applicable.}
}


% Title: "ReLU" is brace-protected against sentence-casing styles.
% Abstract: the parenthetical around "simultaneously training both layers" uses em-dashes (---) in place of bare hyphens.
@InProceedings{pmlr-v336-tinaz26a,
  title = 	 {When Both Layers Learn: Training Dynamics of Representing Linear Models via {ReLU} Networks},
  author =       {Tinaz, Berk and Xie, Changzhi and Soltanolkotabi, Mahdi},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {6310--6371},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/tinaz26a/tinaz26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/tinaz26a.html},
  abstract = 	 {In this paper, we study the gradient descent dynamics for jointly training both layers of a one-hidden-layer ReLU network to fit a linear target function. Concretely, we consider a realizable setting where inputs are drawn i.i.d. from a Gaussian distribution and labels follow a planted linear model. This stylized framework captures salient features of end-to-end training in inverse problems and certain auto-encoder models. Despite its apparent simplicity, the dynamics remain poorly understood, in part because the loss landscape contains multiple non-strict saddle points, making it unclear why gradient descent from random initialization reliably escapes bad stationary regions. We provide a detailed characterization of the optimization landscape and prove that gradient descent from a moderately small random initialization---simultaneously training both layers---converges to a global minimizer at a linear rate with order-wise optimal sample complexity. Our analysis tracks the trajectory through three phases: an alignment phase in which hidden weights progressively align with the planted direction while the output weights maintain the correct sign pattern; a growth phase in which the norms of both layers increase while preserving alignment; and a local refinement phase in which the aligned neurons rapidly converge to the planted direction, yielding fast local convergence. To rigorously show that GD avoids non-strict saddles, we develop trajectory-level control arguments for the end-to-end dynamics. In addition, we establish novel uniform concentration results that hold along the entire trajectory, and are essential for obtaining order-wise optimal sample complexity. We corroborate our theory with extensive experiments across a range of configurations.}
}


% Title: the acronym "RL" and the inline math are brace-protected against sentence-casing styles.
% Author: the accent is written as the brace group {\'a} so BibTeX treats it as one letter for sorting and labels.
@InProceedings{pmlr-v336-tkachuk26a,
  title = 	 {Trajectory Data Suffices for Statistically Efficient Policy Evaluation in Fixed-Horizon Offline {RL} with Linear {$q^\pi$}-Realizability and Concentrability},
  author =       {Tkachuk, Volodymyr and Szepesv{\'a}ri, Csaba and Tan, Xiaoqi},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {6372--6405},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/tkachuk26a/tkachuk26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/tkachuk26a.html},
  abstract = 	 {We study fixed-horizon offline reinforcement learning (RL) with function approximation for both policy evaluation and policy optimization. Prior work established that statistically efficient learning is impossible for either of these problems when the only assumptions are that the data has good coverage (concentrability) and the state-action value function of every policy is linearly realizable ($q^\pi$-realizability) [Foster et al., 2022]. Recently, Tkachuk et al. [2024] gave a statistically efficient learner for policy optimization, if in addition the data is assumed to be given as trajectories. In this work we present a statistically efficient learner for policy evaluation under the same assumptions, with the additional requirement that the behavior policy is known. Further, we show that the sample complexity of the learner used by Tkachuk et al. [2024] for policy optimization can be improved by a tighter analysis.  }
}


% Title: the proper-noun compound "Franz–Parisi" is brace-protected, like MMSE, so sentence-casing styles keep its capitals.
@InProceedings{pmlr-v336-tsirkas26a,
  title = 	 {The Monotonicity of the {Franz–Parisi} Potential Is Equivalent to Low-Degree {MMSE} Lower Bounds: Extended Abstract},
  author =       {Tsirkas, Konstantinos and Wang, Leda and Zadik, Ilias},
  booktitle = 	 {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages = 	 {6406--6409},
  year = 	 {2026},
  editor = 	 {Hanneke, Steve and Lattimore, Tor},
  volume = 	 {336},
  series = 	 {Proceedings of Machine Learning Research},
  month = 	 {29 Jun--03 Jul},
  publisher =    {PMLR},
  pdf = 	 {https://raw.githubusercontent.com/mlresearch/v336/main/assets/tsirkas26a/tsirkas26a.pdf},
  url = 	 {https://proceedings.mlr.press/v336/tsirkas26a.html},
  abstract = 	 {Over the last decades, two distinct approaches have been instrumental to our understanding of the computational complexity of statistical estimation. The statistical physics literature predicts algorithmic hardness through local stability and monotonicity properties of the Franz–Parisi potential, while the rigorous average-case complexity literature characterizes hardness via the limitations of restricted algorithmic classes, most notably low-degree polynomial estimators. In this work, we show that for estimation problems the power of low-degree polynomials is governed by the monotonicity of the annealed Franz–Parisi potential for a broad family of Gaussian additive models. Subject to the low-degree conjecture for these Gaussian additive models, this identifies the polynomial-time estimation threshold with the monotonicity threshold of the annealed Franz–Parisi potential.}
}


@InProceedings{pmlr-v336-van-der-poel26a,
  title     = {Spectral Recovery of a Planted Triangle-Dense Subgraph},
  author    = {{van der Poel}, Sam and Mao, Cheng and McKenna, Benjamin},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {6410--6457},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/van-der-poel26a/van-der-poel26a.pdf},
  url       = {https://proceedings.mlr.press/v336/van-der-poel26a.html},
  abstract  = {Given a simple graph on $n$ vertices and a parameter $k$, the triangle-densest-$k$-subgraph problem is known to be computationally hard in the worst case. To circumvent the computational hardness, we study an average-case model where a triangle-dense subgraph on $k$ vertices is planted in an Erdős–Rényi random graph on $n$ vertices. For the recovery of the planted subgraph, we propose a simple spectral algorithm and a semidefinite program, both of which use a graph matrix whose entries are local signed triangle counts. Theoretical guarantees for these algorithms are established through spectral analysis of the graph matrix. Finally, we provide evidence showing a statistical-to-computational gap analogous to that for the planted clique problem. The computational threshold in terms of the subgraph size $k$ is at least $\sqrt{n}$ in the framework of low-degree polynomial algorithms, while the information-theoretic threshold is at most logarithmic in $n$.}
}


@InProceedings{pmlr-v336-vary26a,
  title     = {On-Average Stability of Multipass Preconditioned {SGD} and Effective Dimension},
  author    = {Vary, Simon and Farghly, Tyler and Kuzborskij, Ilja and Rebeschini, Patrick},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {6458--6495},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/vary26a/vary26a.pdf},
  url       = {https://proceedings.mlr.press/v336/vary26a.html},
  abstract  = {We study trade-offs between the population risk curvature, geometry of the noise, and preconditioning on the generalisation ability of the multipass Preconditioned Stochastic Gradient Descent (PSGD). Many practical optimisation heuristics implicitly navigate this trade-off in different ways — for instance, some aim to whiten gradient noise, while others aim to align updates with expected loss curvature. When the geometry of the population risk curvature and the geometry of the gradient noise do not match, an aggressive choice that improves one aspect can amplify instability along the other, leading to suboptimal statistical behavior. In this paper we employ \emph{on-average algorithmic stability} to connect generalisation of PSGD to the \emph{effective dimension} that depends on these sources of curvature. While existing techniques for on-average stability of SGD are limited to a single pass, as first contribution we develop a new on-average stability analysis for multipass SGD that handles the correlations induced by data reuse. This allows us to derive excess risk bounds that depend on the effective dimension. In particular, we show that an improperly chosen preconditioner can yield suboptimal effective dimension dependence in both optimisation and generalisation. Finally, we complement our upper bounds with matching, instance-dependent lower bounds.}
}


@InProceedings{pmlr-v336-vempala26a,
  title     = {The Geometry of Efficient Nonconvex Sampling},
  author    = {Vempala, Santosh S. and Wibisono, Andre},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {6496--6532},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/vempala26a/vempala26a.pdf},
  url       = {https://proceedings.mlr.press/v336/vempala26a.html},
  abstract  = {We present an efficient algorithm for uniformly sampling from an arbitrary compact body $\mathcal{X} \subset \mathbb{R}^n$ from a warm start under isoperimetry and a natural volume growth condition. Our result provides a substantial common generalization of known results for convex bodies and star-shaped bodies. The complexity of the algorithm is polynomial in the dimension, the Poincar{\'e} constant of the uniform distribution on $\mathcal{X}$ and the volume growth constant of the set $\mathcal{X}$.}
}


@InProceedings{pmlr-v336-voitovych26a,
  title     = {Learning with Simulators: No Regret in a Computationally Bounded World},
  author    = {Voitovych, Sasha and Shetty, Abhishek and Golowich, Noah and Rakhlin, Alexander},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {6533--6591},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/voitovych26a/voitovych26a.pdf},
  url       = {https://proceedings.mlr.press/v336/voitovych26a.html},
  abstract  = {Understanding the minimal assumptions necessary for generalization is the fundamental question in learning theory. Unfortunately, most results rely heavily on independence (or some proxy thereof) of the data-generating process, while results for strongly dependent data are far more limited. Towards addressing this gap, we introduce the framework of simulatable processes, where the learner has access to a simulator that approximates the distribution generating the data (which may be an arbitrarily complex and dependent process). Surprisingly, given access to such a simulator, we show that we can recover the same learning guarantees as in the classical setting with independent data, namely, error bounds that depend on the VC dimension. Further, we use this framework to study the power of conditional sampling and show strict statistical and computational advantages in this setting. As a highlight of our framework, we exhibit a single algorithm that simultaneously learns any given VC class under all processes samplable in bounded polynomial time, with regret controlled by the time-bounded Kolmogorov complexity of the process. This provides a significant conceptual broadening of the classical PAC model.}
}


@InProceedings{pmlr-v336-wainwright26a,
  title     = {Fast Score-Based Sampling via Log-Concave Reductions},
  author    = {Wainwright, Martin J.},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {6592--6621},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/wainwright26a/wainwright26a.pdf},
  url       = {https://proceedings.mlr.press/v336/wainwright26a.html},
  abstract  = {Sampling based on score diffusions has led to striking empirical results, and has attracted considerable attention from various research communities.  It depends on the availability of (approximate) Stein score functions for various levels of additive noise.  We show how, in some generality, the availability of scores allows the general problem to be “reduced” to sampling from an adaptively constructed sequence of $K$ strongly log-concave (SLC) sub-problems.  The reduction is simple, constructive and algorithm-independent, so that any SLC sampler can be used as a subroutine.  Various bounds on score-based sampling complexity follow directly: for instance, high-accuracy SLC samplers yield $\tilde{O}(\sqrt{d} \operatorname{polylog}(1/\varepsilon))$ guarantees for accuracy $\varepsilon$ in dimension $d$, whereas randomized midpoint SLC schemes yield $\tilde{O}( d^{1/3} \operatorname{poly}(1/\varepsilon))$ guarantees.  When the original distribution itself is SLC, we prove that $K \leq 1 + \log_2(\kappa)$, thereby obtaining the first efficient procedure with logarithmic dependence on the condition number $\kappa$; for general distributions, the quantity $K$ depends on the geometry of the score Hessian across the trajectory.  Our analysis is direct and simple, involving techniques and insights complementary to those in standard analyses of discretized diffusions.}
}


@InProceedings{pmlr-v336-wang26a,
  title     = {Almost sure null bankruptcy of testing-by-betting strategies},
  author    = {Wang, Hongjian and Agrawal, Shubhada and Ramdas, Aaditya},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {6622--6650},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/wang26a/wang26a.pdf},
  url       = {https://proceedings.mlr.press/v336/wang26a.html},
  abstract  = {The bounded mean betting procedure serves as a crucial interface between the domains of (1) sequential, anytime-valid statistical inference, and (2) online learning and portfolio selection algorithms. While recent work in both domains has established the exponential wealth growth of numerous betting strategies under any alternative distribution, the tightness of the inverted confidence sets, and the pathwise minimax regret bounds, little has been studied regarding the asymptotics of these strategies under the null hypothesis. Under the null, a strategy  induces a wealth martingale converging to some random variable that can be zero (bankrupt) or non-zero (non-bankrupt, e.g. when it eventually stops betting).  In this paper, we show the conceptually intuitive but technically nontrivial fact that these strategies (universal portfolio, Krichevsky-Trofimov, GRAPA, hedging, etc.) all go bankrupt with probability one, under any non-degenerate null distribution. Part of our analysis is based on the subtle almost sure divergence of various sums of $\sum_n O_p(n^{-1})$ type,  a result of independent interest. We also demonstrate the necessity of null bankruptcy by showing that non-bankrupt strategies are all improvable in some sense. Our results significantly deepen our understanding of these betting strategies as they qualify their behavior on “almost all paths”, whereas previous results are usually on “all paths” (e.g. regret bounds) or “most paths” (e.g. concentration inequalities and confidence sets).}
}


@InProceedings{pmlr-v336-wang26b,
  title     = {A simple, optimal and efficient algorithm for online exp-concave optimization},
  author    = {Wang, Yi-Han and Zhao, Peng and Zhou, Zhi-Hua},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {6651--6691},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/wang26b/wang26b.pdf},
  url       = {https://proceedings.mlr.press/v336/wang26b.html},
  abstract  = {Online eXp-concave Optimization (OXO) is a fundamental problem in online learning, where the goal is to minimize regret when loss functions are exponentially concave. The standard algorithm, Online Newton Step (ONS), guarantees an optimal $O(d \log T)$ regret, where $d$ is the dimension and $T$ is the time horizon. Despite its simplicity, ONS may face a computational bottleneck due to the \emph{Mahalanobis projection} at each round. This step costs $\Omega(d^\omega)$ arithmetic operations for bounded domains, even for simple domains such as the unit ball, where $\omega \in (2,3]$ is the matrix-multiplication exponent. As a result, the total runtime can reach $\tilde{O}(d^\omega T)$, particularly when iterates frequently oscillate near the domain boundary. This paper proposes a simple variant of ONS, called LightONS, which reduces the total runtime to $O(d^2 T + d^\omega \sqrt{T \log T})$ while preserving the optimal regret. Deploying LightONS with the online-to-batch conversion implies a method for stochastic exp-concave optimization with runtime $\tilde{O}(d^3/\varepsilon)$, thereby answering an open problem posed by Koren [2013]. The design leverages domain-conversion techniques from parameter-free online learning and defers expensive Mahalanobis projections until necessary, thereby preserving the elegant structure of ONS and enabling LightONS to act as an efficient plug-in replacement in broader scenarios, including gradient-norm adaptivity, parametric stochastic bandits, and memory-efficient OXO.}
}


@InProceedings{pmlr-v336-wang26c,
  title     = {Accelerated Convex Optimization via Hamiltonian Dynamics with Deterministic Integration Time},
  author    = {Wang, Xiuyuan and Srinivasan, Vishwak and Fu, Qiang and Mitra, Siddharth and Wibisono, Andre and Wilson, Ashia},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {6692--6742},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/wang26c/wang26c.pdf},
  url       = {https://proceedings.mlr.press/v336/wang26c.html},
  abstract  = {We develop Hamiltonian dynamics-based algorithms for smooth convex optimization that achieve accelerated rates of convergence.  By exploiting contraction of averaged Hamiltonian flow trajectories rather than requiring contraction at trajectory endpoints,  we show that Hamiltonian dynamics-based optimization methods admit deterministic and accelerated convergence guarantees,  extending prior work that is limited to quadratic objectives or holds only in expectation.  We analyze an idealized continuous-time algorithm and derive practical discrete-time implementations with optimal first-order complexity,  thereby establishing Hamiltonian dynamics as a useful algorithmic primitive for deterministic accelerated optimization.}
}


@InProceedings{pmlr-v336-wang26d,
  title     = {Diffusion-Network Alignment: An Efficient Algorithm and Explicit Probability Bounds},
  author    = {Wang, Ziao and Ying, Lei},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {6743--6810},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/wang26d/wang26d.pdf},
  url       = {https://proceedings.mlr.press/v336/wang26d.html},
  abstract  = {This paper studies a variation of the classic network alignment problem, named diffusion-network alignment. The goal is to align the vertices of a rooted diffusion tree to the vertices of a network, where the diffusion tree could be from a communication trace or contact tracing, and the network could be an online or offline social network. Different from the classic network alignment where both networks are fully observed, this model captures the information asymmetry of two networks. To solve this problem, this paper presents an efficient algorithm based on tree correlation tests to extract alignment information from local neighborhoods. We analyze the performance of the algorithm in the sparse graph regime and show that with high probability, all matched pairs are correct.  Furthermore, for each vertex on the diffusion tree, this paper establishes an explicit lower bound on the probability that the vertex is correctly matched.  These lower bounds are depth-dependent and increase as vertices get closer to the root.}
}


@InProceedings{pmlr-v336-nachum26a,
  title     = {Minimax Limits of {$k$}-Fold Cross-Validation via Majority},
  author    = {Nachum, Ido and Urbanke, Ruediger and Weinberger, Thomas},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {6811--6848},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/nachum26a/nachum26a.pdf},
  url       = {https://proceedings.mlr.press/v336/nachum26a.html},
  abstract  = {We study the mean-squared error of $k$-fold cross-validation as a risk estimator, with particular emphasis on how its accuracy depends on the number of folds $k$. Despite the widespread use of cross-validation, principled guidance for choosing $k$ is largely absent, mainly due to the complex dependence between fold-wise error estimates. To obtain sharp and interpretable results, we focus on the majority algorithm in binary classification, a minimal yet nontrivial empirical risk minimization procedure. We provide a fine-grained analysis of its cross-validation behavior, showing that even this simple algorithm exhibits subtle and delicate phenomena for which existing theory provides loose and even vacuous bounds. Leveraging this analysis, we introduce a minimax framework for cross-validation risk estimation and prove that no empirical risk minimization algorithm can achieve an $O(1/n)$ minimax mean-squared error when the number of folds grows with the number of samples $n$; instead, a lower bound of order $\Omega(\sqrt{k}/n)$ is unavoidable. Our results reveal fundamental limitations of cross-validation as a data-reuse strategy, clarify gaps and inaccuracies in prior theoretical work, and position the majority algorithm as a natural benchmark that any tight analysis of cross-validation should be able to explain.}
}


@InProceedings{pmlr-v336-wu26a,
  title     = {Risk Comparisons in Linear Regression: Implicit Regularization Dominates Explicit Regularization (Extended Abstract)},
  author    = {Wu, Jingfeng and Bartlett, Peter L. and Kakade, Sham M. and Lee, Jason D. and Yu, Bin},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {6849--6851},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/wu26a/wu26a.pdf},
  url       = {https://proceedings.mlr.press/v336/wu26a.html},
  abstract  = {Existing theory suggests that for linear regression problems categorized by capacity and source conditions, \emph{gradient descent} (GD) is always minimax optimal, while both \emph{ridge regression} and online \emph{stochastic gradient descent} (SGD) are polynomially suboptimal for certain categories of such problems. Moving beyond minimax theory, this work provides \emph{instance-wise} comparisons of the finite-sample risks for these algorithms on any well-specified linear regression problem. Our analysis yields three key findings. First, GD \emph{dominates} ridge regression: with comparable regularization, the excess risk of GD is \emph{always} within a constant factor of ridge, but ridge can be \emph{polynomially} worse even when tuned optimally. Second, GD is \emph{incomparable} with SGD. While it is known that for certain problems GD can be polynomially better than SGD, the reverse is also true: we construct problems, inspired by \emph{benign overfitting} theory, where optimally stopped GD is polynomially worse. Finally, GD dominates SGD for a significant subclass of problems—those with fast and continuously decaying covariance spectra—which includes all problems satisfying the standard capacity condition.}
}


% Note: the citation key and the pdf/url slugs keep the published "tianhao26a" form so
% existing citations and links stay valid; only the first author's family/given order
% is corrected in the author field (presumed swap -- confirm against the paper).
@InProceedings{pmlr-v336-tianhao26a,
  title     = {Lyapunov-Based Sample Complexity Analysis for Weakly-Coupled MDPs (Extended Abstract)},
  author    = {Wu, Tianhao and Zurek, Matthew and Wang, Weina and Xie, Qiaomin},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {6852--6857},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/tianhao26a/tianhao26a.pdf},
  url       = {https://proceedings.mlr.press/v336/tianhao26a.html},
  abstract  = {We study the sample complexity of learning in average-reward weakly-coupled Markov decision processes (WCMDPs) and Restless Bandits (RBs) under a generative model. Naive reduction to a tabular MDP leads to high complexity bounds as the state-action space is exponentially large in the number of arms $N$. By exploiting the weakly coupled structure, we show that near-optimal policies can be learned with sample and computational complexities that are polynomial in $N$. Specifically, we analyze the plug-in approach, which applies an efficient planning algorithm to an empirical model estimated from data. For fully heterogeneous WCMDPs, we establish the first finite-sample PAC guarantee with polynomial complexity and an $O(1/\sqrt{N})$ optimality gap. For homogeneous RBs, we further prove that a smaller optimality gap is achievable under mild structural assumptions. A primary technical contribution of our work is a novel Lyapunov-based analysis framework. Unlike classical approaches that rely on the difficult-to-control bias function, our framework uses an explicitly constructed Lyapunov function along with a drift transfer technique between the true and empirical models. A key step of independent interest in our framework  is a fine-grained perturbation analysis for the underlying linear programming (LP) relaxation, which provides a general tool for analyzing LP-based policies and weakly-coupled systems.}
}


@InProceedings{pmlr-v336-xie26a,
  title     = {Worst-case Error Bounds for Online Learning of Smooth Functions},
  author    = {Xie, Weian (Andrew)},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {6858--6884},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/xie26a/xie26a.pdf},
  url       = {https://proceedings.mlr.press/v336/xie26a.html},
  abstract  = {Online learning is a model of machine learning where the learner is trained on sequential feedback. We investigate worst-case error for the online learning of real functions that have certain smoothness constraints. Suppose that $\mathcal{F}_q$ is the class of all absolutely continuous functions $f: [0,1] \rightarrow \mathbb{R}$ such that $\|f'\|_q \le 1$, and $\mathrm{opt}_p(\mathcal{F}_q)$ is the best possible upper bound on the sum of the $p^{\mathrm{th}}$ powers of absolute prediction errors for any number of trials guaranteed by any learner. We show that for any $\delta, \epsilon \in (0,1)$, $\mathrm{opt}_{1+\delta}(\mathcal{F}_{1+\epsilon}) = O(\min(\delta,\epsilon)^{-1})$. Combined with the previous results of Kimber and Long (1995) and Geneson and Zhou (2023), we achieve a complete characterization of the values of $p, q \ge 1$ that result in $\mathrm{opt}_p(\mathcal{F}_q)$ being finite, a problem open for nearly 30 years. We study the learning scenarios of smooth functions that also belong to certain special families of functions, such as polynomials. We prove a conjecture by Geneson and Zhou (2023) that it is not any easier to learn a polynomial in $\mathcal{F}_q$ than it is to learn any general function in $\mathcal{F}_q$. We also define a noisy model for the online learning of smooth functions, where the learner may receive incorrect feedback up to $\eta \ge 1$ times, denoting the worst-case error bound as $\mathrm{opt}^{\mathrm{nf}}_{p,\eta}(\mathcal{F}_q)$. We prove that $\mathrm{opt}^{\mathrm{nf}}_{p,\eta}(\mathcal{F}_q)$ is finite if and only if $\mathrm{opt}_p(\mathcal{F}_q)$ is. Moreover, we prove for all $p, q \ge 2$ and $\eta \ge 1$ that $\mathrm{opt}^{\mathrm{nf}}_{p,\eta}(\mathcal{F}_q) = \Theta(\eta)$.}
}


@InProceedings{pmlr-v336-yan26a,
  title     = {Optimism Stabilizes Thompson Sampling for Adaptive Inference},
  author    = {Yan, Shunxing and Zhong, Han},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {6885--6886},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/yan26a/yan26a.pdf},
  url       = {https://proceedings.mlr.press/v336/yan26a.html},
  abstract  = {Thompson sampling (TS) is widely used for stochastic multi-armed bandits, yet its inferential behavior under adaptive data collection is subtle. Classical asymptotic theory for sample means can fail because arm-specific sample sizes are random, history-dependent, and coupled with the observed rewards through the action-selection rule. A useful sufficient condition for valid asymptotic inference is \emph{stability}, which requires each arm’s pull count to concentrate around a deterministic scale. While stability is now understood for several UCB-type algorithms, vanilla TS can be unstable, leading to nonstandard asymptotics and potentially invalid Wald-type confidence intervals. We identify optimism as a general mechanism for stabilizing Thompson sampling. In the $K$-armed Gaussian bandit with any fixed $K\ge2$, we study two optimistic TS variants. The first is TS with posterior variance inflation; the second keeps the posterior variance unchanged but adds an explicit optimism bonus to the posterior mean. For both variants, we prove stability: optimal arms asymptotically share the horizon uniformly, while each suboptimal arm is sampled on a sharp gap-dependent logarithmic scale. For variance-inflated TS, this resolves the open problem posed by Halder et al. (2025) by extending their two-armed stability theory to general $K$-armed bandits, including instances with multiple optimal arms. For the mean-bonus variant, our result shows that stability can also be achieved through a direct optimistic shift of the posterior center, without inflating the posterior variance. The main technical novelty lies in the treatment of variance-inflated TS with multiple optimal arms. In this regime, stability requires proving that the randomized competition among statistically indistinguishable optimal arms converges to a deterministic allocation. We isolate a limiting pure-noise competition and prove a negative-feedback property: over-sampled optimal arms become less likely to win future posterior draws, while under-sampled ones become more likely to be selected. This yields a contraction toward the uniform allocation over the optimal set. Concentration and rare-event estimates then control the perturbations caused by empirical-mean errors and occasional suboptimal selections. For the mean-bonus variant, we use a separate argument based on posterior-sampling concentration and UCB-type comparisons, since optimism enters through a deterministic shift of the posterior mean rather than through variance inflation. These stability results imply asymptotically valid adaptive inference. In particular, for either optimistic TS variant, the usual studentized sample mean is asymptotically standard normal, and standard Wald confidence intervals achieve the nominal coverage probability despite adaptive sampling. Thus, suitably implemented optimism stabilizes Thompson sampling and enables classical inference from adaptively collected bandit data, while incurring only a mild additional regret cost.}
}


@InProceedings{pmlr-v336-yang26a,
  title     = {Tight Sample Complexity of Transformers},
  author    = {Yang, Chenxiao and Srebro, Nathan and Li, Zhiyuan},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {6887--6923},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/yang26a/yang26a.pdf},
  url       = {https://proceedings.mlr.press/v336/yang26a.html},
  abstract  = {We tightly characterize the VC dimension of depth-$L$ Transformers with a total of $W$ parameters, mapping an input sequence of length $T$ to a single output, establishing an upper bound of $O(L W \log (T W))$ and a nearly matching lower bound of $\Omega(L W \log (T W / L))$. We further tightly characterize the sample complexity of chain-of-thought learning using such a Transformer, showing teacher forcing (i.e. selecting a predictor consistent with the entire chain-of-thought on training data) learns with sample complexity $O\left(L W \log \left(\left(T+T^{\prime}\right) W\right)\right)$ and that any learning rule that uses chain-of-thought data requires at least $\Omega\left(L W \log \left(\left(T+T^{\prime}\right) W / L\right)\right)$ examples, where $T$ is the input length and $T^{\prime}$ is the number of autoregressive steps.}
}


@InProceedings{pmlr-v336-ye26a,
  title     = {Learning Decision-Sufficient Representations for Linear Optimization},
  author    = {Ye, Yuhan and Amin, Saurabh and {\"O}zda{\u{g}}lar, Asuman},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {6924--6975},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/ye26a/ye26a.pdf},
  url       = {https://proceedings.mlr.press/v336/ye26a.html},
  abstract  = {We study how to construct compressed datasets that suffice to recover optimal decisions in linear programs with unknown cost vector $c$ lying in a prior set $\mathcal{C}$. Recent work by Bennouna et al. (2025a) provides an exact geometric characterization of sufficient decision datasets (SDDs) via an intrinsic decision-relevant dimension $d^\star$. However, their algorithm for constructing minimum-size SDDs requires solving mixed-integer programs. In this paper, we establish hardness results: computing $d^\star$ is NP-hard and deciding whether a dataset is globally sufficient is coNP-hard, thereby resolving the open problem posed by Bennouna et al. (2026). To circumvent worst-case intractability, we introduce pointwise sufficiency, a relaxation that requires sufficiency for an individual cost vector. We provide a polynomial-time cutting-plane algorithm to construct pointwise-sufficient decision datasets under nondegeneracy. In a data-driven regime with i.i.d. costs, we propose a cumulative algorithm that aggregates decision-relevant directions across samples, yielding a stable compression scheme of size at most $d^\star$. This leads to a distribution-free PAC guarantee: with high probability over the training sample, the pointwise sufficiency failure probability on a fresh draw is at most $\tilde{O}(d^\star/n)$, and this rate is tight up to logarithmic factors. Finally, we apply decision-sufficient representations to contextual linear optimization, obtaining compressed predictors with generalization bounds scaling as $\tilde{O}(\sqrt{d^\star/n})$ rather than $\tilde{O}(\sqrt{d/n})$, where $d$ is the ambient cost dimension.}
}


@InProceedings{pmlr-v336-yu26a,
  title     = {Distribution-Free Sequential Prediction with Abstentions},
  author    = {Yu, Jialin and Blanchard, Mo{\"i}se},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {6976--7011},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/yu26a/yu26a.pdf},
  url       = {https://proceedings.mlr.press/v336/yu26a.html},
  abstract  = {We study a sequential prediction problem in which an adversary is allowed to inject arbitrarily many adversarial instances in a stream of i.i.d. instances, but at each round, the learner may also \emph{abstain} from making a prediction without incurring any penalty if the instance was indeed corrupted. This semi-adversarial setting naturally sits between the classical stochastic case with i.i.d. instances for which function classes with finite VC dimension are learnable; and the adversarial case with arbitrary instances, known to be significantly more restrictive. For this problem, Goel et al. (2023) showed that, if the learner knows the distribution $\mu$  of clean samples in advance, learning can be achieved for all VC classes without restrictions on adversary corruptions. This is, however, a strong assumption in both theory and practice: a natural question is whether similar learning guarantees can be achieved without prior distributional knowledge, as is standard in classical learning frameworks (e.g., PAC learning or asymptotic consistency) and other non-i.i.d. models (e.g., smoothed online learning).  We therefore focus on the distribution-free setting where $\mu$ is \emph{unknown} and propose an algorithm \textsc{AbstainBoost} based on a boosting procedure of weak learners, which guarantees sublinear error for general VC classes in \emph{distribution-free} abstention learning for oblivious adversaries. These algorithms also enjoy similar guarantees for adaptive adversaries, for structured function classes including linear classifiers. These results are complemented with corresponding lower bounds, which reveal an interesting polynomial trade-off between misclassification error and number of erroneous abstentions.}
}


@InProceedings{pmlr-v336-yu26b,
  title     = {Stable algorithms Lower Bounds for Estimation from MMSE Discontinuities: Extended Abstract},
  author    = {Yu, Xifan and Zadik, Ilias},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {7012--7015},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/yu26b/yu26b.pdf},
  url       = {https://proceedings.mlr.press/v336/yu26b.html},
  abstract  = {Recent works in average-case complexity have identified stable (noise-stable) algorithms as a central class. Specifically, in average-case optimization, the class is conjectured to capture the power of polynomial-time computation for many problems. This perspective has been supported by establishing variants of the Overlap Gap Property (OGP) phase transitions just below conjectured polynomial-time thresholds. Yet, it was recently challenged by Schramm and Li (2025), who showed that Shortest Path in random graphs exhibits the OGP—and hence all stable algorithms fail—despite being solvable in polynomial time. This counterexample has also been particularly curious as it appeared rather distinct from other classical “noiseless” counterexamples, such as solving random linear systems. By contrast, the power of stable methods in statistical estimation has remained unclear. A central difficulty is the absence of an OGP-type phenomenon that can uniformly exclude all stable methods. Instead, existing lower bounds largely focus on the related class of low-degree polynomials and are confined to restricted models, such as Gaussian additive models, reflecting the high technical difficulty of controlling the minimum mean-squared error (MMSE) of low-degree estimators. In this work, we show that for all statistical estimation problems, a natural MMSE instability (discontinuity) condition implies the failure of stable algorithms, serving as a version of OGP for estimation tasks. Using this criterion, we establish separations between stable and polynomial-time algorithms for the following MMSE-unstable tasks (i) Planted Shortest Path, where Dijkstra’s algorithm succeeds, (ii) random Parity Codes, where Gaussian elimination succeeds, and (iii) Gaussian Subset Sum, where lattice-based methods succeed. For all three, we further show that all low-degree polynomials are stable, yielding separations against low-degree methods and a new method to bound the low-degree MMSE. In particular, our technique highlights that MMSE instability is a common feature for Shortest Path and the noiseless Parity Codes and Gaussian subset sum. Last, we highlight that our work places rigorous algorithmic footing on the long-standing physics belief that first-order phase transitions—which in this setting translates to MMSE instability—impose fundamental limits on classes of efficient algorithms.}
}


@InProceedings{pmlr-v336-zamir26a,
  title     = {Optimal Variance-Dependent Regret Bounds for Infinite-Horizon MDPs},
  author    = {Zamir, Guy and Zurek, Matthew and Chen, Yudong},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {7016--7061},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/zamir26a/zamir26a.pdf},
  url       = {https://proceedings.mlr.press/v336/zamir26a.html},
  abstract  = {Online reinforcement learning in infinite-horizon Markov decision processes (MDPs) remains less theoretically and algorithmically developed than its episodic counterpart, with many algorithms suffering from high “burn-in” costs and failing to adapt to benign instance-specific complexity. In this work, we address these shortcomings for two infinite-horizon objectives: the classical average-reward regret and the $\gamma$-regret. We develop a single tractable UCB-style algorithm applicable to both settings, which achieves the first optimal variance-dependent regret guarantees. Our regret bounds in both settings take the form $\widetilde{O}( \sqrt{SA\,\text{Var}} + \text{lower-order terms})$, where $S,A$ are the state and action space sizes, and $\text{Var}$ captures cumulative transition variance. This implies minimax-optimal average-reward and $\gamma$-regret bounds in the worst case but also adapts to easier problem instances, for example yielding nearly constant regret in deterministic MDPs. Furthermore, our algorithm enjoys significantly improved lower-order terms for the average-reward setting. With prior knowledge of the optimal bias span $\|h^\star\|_{\mathrm{sp}}$, our algorithm obtains lower-order terms scaling as $\|h^\star\|_{\mathrm{sp}}S^2 A$, which we prove is optimal in both $\|h^\star\|_{\mathrm{sp}}$ and $A$.  Without prior knowledge, we prove that no algorithm can have lower-order terms smaller than $\|h^\star\|_{\mathrm{sp}}^2SA$, and we provide a prior-free algorithm whose lower-order terms scale as $\|h^\star\|_{\mathrm{sp}}^2S^3A$, nearly matching this lower bound. Taken together, these results completely characterize the optimal dependence on $\|h^\star\|_{\mathrm{sp}}$ in both leading and lower-order terms, and reveal a fundamental gap in what is achievable with and without prior knowledge.}
}


@InProceedings{pmlr-v336-zhao26a,
  title     = {Gradient-Variation Regret Bounds for Unconstrained Online Learning},
  author    = {Zhao, Yuheng and Jacobsen, Andrew and Cesa-Bianchi, Nicol{\`o} and Zhao, Peng},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {7062--7104},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/zhao26a/zhao26a.pdf},
  url       = {https://proceedings.mlr.press/v336/zhao26a.html},
  abstract  = {We develop parameter-free algorithms for unconstrained online learning with regret guarantees that scale with the gradient variation $V_T(u) = \sum_{t=2}^T \|\nabla f_t(u)-\nabla f_{t-1}(u)\|^2$. For $L$-smooth convex losses, we provide fully-adaptive algorithms achieving regret of $\widetilde{O}(\|u\|\sqrt{V_T(u)} + L\|u\|^2+G^4)$ without requiring prior knowledge of comparator norm $\|u\|$, Lipschitz constant $G$, or smoothness $L$. The update in each round can be computed efficiently via a closed-form expression. Our results extend to dynamic regret and find immediate implications for the stochastically-extended adversarial (SEA) model, which significantly improves upon the previous best-known result (Wang et al., 2025).}
}


@InProceedings{pmlr-v336-arvanitakis26a,
  title     = {Open Problem: How much overparametrization is needed for ALS in tensor decomposition?},
  author    = {Arvanitakis, Dionysis and Srinivas, Vaidehi and Vijayaraghavan, Aravindan},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {7105--7110},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/arvanitakis26a/arvanitakis26a.pdf},
  url       = {https://proceedings.mlr.press/v336/arvanitakis26a.html},
  abstract  = {We ask how much overparameterization is needed for simple iterative methods such as alternating least squares (ALS) and gradient descent to decompose a third-order tensor.  This question can be viewed as a basic setting to study feature learning: when a rank-$r$ tensor in ambient dimension $n$ has $r\ll n$, the latent rank-one components are the features, and $k$ is the amount of overparameterization used by the algorithm.   For rank $r$ tensors, recent work shows that overparametrized rank $k=O(r^2)$ suffices for the popular ALS heuristic  (with random initialization) to converge to a global optima.  Is the quadratic dependence on $r$ an inherent barrier for ALS-like methods?  We pose the open problem of proving convergence to the global optimum for $k=o(r^2)$, or proving that a lower bound on the overparametrized rank of $k=\Omega(r^{1+c})$ for some absolute constant $c>0$ is necessary.}
}


@InProceedings{pmlr-v336-balcan26a,
  title     = {Invited Open Problem: Online Optimization of Piecewise-Lipschitz Functions with Applications to Data-Driven Algorithm Design},
  author    = {Balcan, Maria-Florina and Pegden, Wesley and Sharma, Dravyansh},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {7111--7116},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/balcan26a/balcan26a.pdf},
  url       = {https://proceedings.mlr.press/v336/balcan26a.html},
  abstract  = {Classical online optimization theory focuses on  regret guarantees for convex  Lipschitz functions. However, online optimization problems motivated by machine learning for algorithm design fall outside this regime, since typically an algorithm’s performance as a  function of its hyperparameters  is a highly volatile function. This has inspired recent work on online optimization of piecewise-Lipschitz functions with complex transition boundaries. We provide open questions in this direction.  Resolving these questions would  advance the  learning-theoretic foundation for adaptive algorithm design  by clarifying when  desirable sublinear regret guarantees are possible for learning  the algorithms from online problem instances.}
}


@InProceedings{pmlr-v336-feldman26a,
  title     = {Invited Open Problem: Is the Power of Deep Learning over Linear Models Inherently Distribution Dependent?},
  author    = {Feldman, Vitaly and Kamath, Pritish and Srebro, Nathan},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {7117--7122},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/feldman26a/feldman26a.pdf},
  url       = {https://proceedings.mlr.press/v336/feldman26a.html},
  abstract  = {We ask whether distribution-independent SQ learning implies low dimension complexity, and whether anything learnable with (S)GD on a (benign) neural network under any input distribution is also learnable with a linear model.}
}


@InProceedings{pmlr-v336-lau26a,
  title     = {Open Problem: Is Interaction Necessary for Order-Optimal 1-bit Mean Estimation?},
  author    = {Lau, Ivan and Scarlett, Jonathan},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {7123--7128},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/lau26a/lau26a.pdf},
  url       = {https://proceedings.mlr.press/v336/lau26a.html},
  abstract  = {We ask whether interaction is necessary for order-optimal 1-bit mean estimation over nonparametric finite-moment classes. Adaptive threshold-query protocols achieve the order-optimal 1-bit minimax rate, and the same rate is attainable with general 1-bit queries using only one adaptive transition (i.e., two stages of querying). In the non-adaptive setting, threshold and interval queries are known to be highly suboptimal, but the case of arbitrary non-adaptive quantizers remains unresolved. Can such quantizers match the adaptive rate, yielding an optimal one-shot protocol? Or is the known two-stage estimator stage-optimal, with a single adaptive transition being necessary and sufficient?}
}


@InProceedings{pmlr-v336-nissim26a,
  title     = {Invited Open Problem: Does Differential Privacy Make PAC Learning Much Harder?},
  author    = {Nissim, Kobbi and Stemmer, Uri and Tsfadia, Eliad},
  booktitle = {Proceedings of Thirty Ninth Conference on Learning Theory},
  pages     = {7129--7135},
  year      = {2026},
  editor    = {Hanneke, Steve and Lattimore, Tor},
  volume    = {336},
  series    = {Proceedings of Machine Learning Research},
  month     = {29 Jun--03 Jul},
  publisher = {PMLR},
  pdf       = {https://raw.githubusercontent.com/mlresearch/v336/main/assets/nissim26a/nissim26a.pdf},
  url       = {https://proceedings.mlr.press/v336/nissim26a.html},
  abstract  = {What is the optimal sample complexity of differentially private (DP) PAC learning? Recent results establish that a concept class $C$ is learnable under approximate DP if and only if it is online learnable. However, in any realistic computational model, $C$ is finite, and it is well known that a sample complexity of $O(\log |C|)$ suffices for both online and DP learning. In contrast, non-private learning is characterized by the VC dimension of $C$, which can be significantly lower than $\log |C|$. While the gap between $\log |C|$ and $\text{VC}(C)$ can be unavoidable for online learning (e.g., when learning thresholds over a finite domain), we currently lack evidence that the same holds true for DP learning. This leads to our central question: Is differentially private PAC learning much harder than non-private learning?}
}