The property that is sufficient and necessary for a unique decoding is the separability of the tests, but unfortunately no linear time algorithm is known for such tests. In order to achieve linear-time decodable tests, the algorithms in the literature use the disjunction property that gives an almost optimal number of tests.

We define a new property for the tests which we call semi-disjunction property. We show that there is a linear time decoding for such tests and for $d\to \infty$ the number of tests converges to the number of tests with the separability property. Our analysis shows that, in the RID model, the number of tests in our algorithm is better than the one with the disjunction property even for small $d$.} } @InProceedings{hölzl17a, title = {Automatic Learning from Repetitive Texts}, author = {Rupert Hölzl and Sanjay Jain and Philipp Schlicht and Karen Seidel and Frank Stephan}, booktitle = {Proceedings of the 28th International Conference on Algorithmic Learning Theory}, pages = {129--150}, year = {2017}, editor = {Steve Hanneke and Lev Reyzin}, volume = {76}, series = {Proceedings of Machine Learning Research}, address = {Kyoto University, Kyoto, Japan}, month = {15--17 Oct}, publisher = {PMLR}, pdf = {http://proceedings.mlr.press/v76/hölzl17a/hölzl17a.pdf}, url = {http://proceedings.mlr.press/v76/h%C3%B6lzl17a.html}, abstract = {We study the connections between the learnability of automatic families of languages and the types of text used to present them to a learner. More precisely, we study how restrictions on the number of times that a correct datum appears in a text influence what classes of languages are automatically learnable. We show that an automatic family of languages is automatically learnable from fat text iff it is automatically learnable from thick text iff it is verifiable from balanced text iff it satisfies Angluin's tell-tale condition. Furthermore, many automatic families are automatically learnable from exponential text. 
We also study the relationship between automatic learnability and verifiability and show that all automatic families are automatically partially verifiable from exponential text and automatically learnable from thick text.} } @InProceedings{maillard17a, title = {Boundary Crossing for General Exponential Families}, author = {Odalric-Ambrym Maillard}, booktitle = {Proceedings of the 28th International Conference on Algorithmic Learning Theory}, pages = {151--184}, year = {2017}, editor = {Steve Hanneke and Lev Reyzin}, volume = {76}, series = {Proceedings of Machine Learning Research}, address = {Kyoto University, Kyoto, Japan}, month = {15--17 Oct}, publisher = {PMLR}, pdf = {http://proceedings.mlr.press/v76/maillard17a/maillard17a.pdf}, url = {http://proceedings.mlr.press/v76/maillard17a.html}, abstract = {We consider parametric exponential families of dimension $K$ on the real line. We study a variant of

In this paper, we investigate the difficulty of discovering anything about these implicit invariants. Unfortunately, our main results are negative: we show that a variety of questions around investigating invariant representations are NP-hard, even in approximate settings. Moreover, these results do not depend on the kind of architecture used: in fact, our results follow as soon as the network architecture is powerful enough to be universal. The key idea behind our results is that if we can find the symmetries of a problem then we can solve it.} } @InProceedings{mizumoto17a, title = {An efficient query learning algorithm for zero-suppressed binary decision diagrams}, author = {Hayato Mizumoto and Shota Todoroki and Diptarama and Ryo Yoshinaka and Ayumi Shinohara}, booktitle = {Proceedings of the 28th International Conference on Algorithmic Learning Theory}, pages = {360--371}, year = {2017}, editor = {Steve Hanneke and Lev Reyzin}, volume = {76}, series = {Proceedings of Machine Learning Research}, address = {Kyoto University, Kyoto, Japan}, month = {15--17 Oct}, publisher = {PMLR}, pdf = {http://proceedings.mlr.press/v76/mizumoto17a/mizumoto17a.pdf}, url = {http://proceedings.mlr.press/v76/mizumoto17a.html}, abstract = {A ZDD is a directed acyclic graph that represents a family of sets over a fixed universe set. In this paper, we propose an algorithm that learns zero-suppressed binary decision diagrams (ZDDs) using membership and equivalence queries. 
If the target ZDD has $n$ nodes and the cardinality of the universe is $m$, our algorithm uses $n$ equivalence queries and at most $n(\lfloor \log m \rfloor + 4n)$ membership queries to learn the target ZDD.} } @InProceedings{orseau17a, title = {Soft-Bayes: Prod for Mixtures of Experts with Log-Loss}, author = {Laurent Orseau and Tor Lattimore and Shane Legg}, booktitle = {Proceedings of the 28th International Conference on Algorithmic Learning Theory}, pages = {372--399}, year = {2017}, editor = {Steve Hanneke and Lev Reyzin}, volume = {76}, series = {Proceedings of Machine Learning Research}, address = {Kyoto University, Kyoto, Japan}, month = {15--17 Oct}, publisher = {PMLR}, pdf = {http://proceedings.mlr.press/v76/orseau17a/orseau17a.pdf}, url = {http://proceedings.mlr.press/v76/orseau17a.html}, abstract = {We consider prediction with expert advice under the log-loss with the goal of deriving efficient and robust algorithms. We argue that existing algorithms such as exponentiated gradient, online gradient descent and online Newton step do not adequately satisfy both requirements. Our main contribution is an analysis of the Prod algorithm that is robust to any data sequence and runs in linear time relative to the number of experts in each round. Despite the unbounded nature of the log-loss, we derive a bound that is independent of the largest loss and of the largest gradient, and depends only on the number of experts and the time horizon. 
Furthermore we give a Bayesian interpretation of Prod and adapt the algorithm to derive a tracking regret.} } @InProceedings{ryabko17b, title = {Hypotheses testing on infinite random graphs}, author = {Daniil Ryabko}, booktitle = {Proceedings of the 28th International Conference on Algorithmic Learning Theory}, pages = {400--411}, year = {2017}, editor = {Steve Hanneke and Lev Reyzin}, volume = {76}, series = {Proceedings of Machine Learning Research}, address = {Kyoto University, Kyoto, Japan}, month = {15--17 Oct}, publisher = {PMLR}, pdf = {http://proceedings.mlr.press/v76/ryabko17b/ryabko17b.pdf}, url = {http://proceedings.mlr.press/v76/ryabko17b.html}, abstract = {Drawing on some recent results that provide the formalism necessary to define stationarity for infinite random graphs, this paper initiates the study of statistical and learning questions pertaining to these objects. Specifically, a criterion for the existence of a consistent test for complex hypotheses is presented, generalizing the corresponding results on time series. As an application, it is shown how one can test that a tree has the Markov property, or, more generally, to estimate its memory.} } @InProceedings{kotłowski17a, title = {Scale-Invariant Unconstrained Online Learning}, author = {Wojciech Kotłowski}, booktitle = {Proceedings of the 28th International Conference on Algorithmic Learning Theory}, pages = {412--433}, year = {2017}, editor = {Steve Hanneke and Lev Reyzin}, volume = {76}, series = {Proceedings of Machine Learning Research}, address = {Kyoto University, Kyoto, Japan}, month = {15--17 Oct}, publisher = {PMLR}, pdf = {http://proceedings.mlr.press/v76/kotłowski17a/kotłowski17a.pdf}, url = {http://proceedings.mlr.press/v76/kot%C5%82owski17a.html}, abstract = {We consider a variant of online convex optimization in which both the instances (input vectors) and the comparator (weight vector) are unconstrained. 
We exploit a natural scale invariance symmetry in our unconstrained setting: the predictions of the optimal comparator are invariant under any linear transformation of the instances. Our goal is to design online algorithms which also enjoy this property, i.e. are scale-invariant. We start with the case of coordinate-wise invariance, in which the individual coordinates (features) can be arbitrarily rescaled. We give an algorithm, which achieves essentially optimal regret bound in this setup, expressed by means of a coordinate-wise scale-invariant norm of the comparator. We then study general invariance with respect to arbitrary linear transformations. We first give a negative result, showing that no algorithm can achieve a meaningful bound in terms of scale-invariant norm of the comparator in the worst case. Next, we complement this result with a positive one, providing an algorithm which "almost" achieves the desired bound, incurring only a logarithmic overhead in terms of the norm of the instances.} } @InProceedings{grohe17a, title = {Learning MSO-definable hypotheses on strings}, author = {Martin Grohe and Christof Löding and Martin Ritzert}, booktitle = {Proceedings of the 28th International Conference on Algorithmic Learning Theory}, pages = {434--451}, year = {2017}, editor = {Steve Hanneke and Lev Reyzin}, volume = {76}, series = {Proceedings of Machine Learning Research}, address = {Kyoto University, Kyoto, Japan}, month = {15--17 Oct}, publisher = {PMLR}, pdf = {http://proceedings.mlr.press/v76/grohe17a/grohe17a.pdf}, url = {http://proceedings.mlr.press/v76/grohe17a.html}, abstract = {We study the classification problems over string data for hypotheses specified by formulas of monadic second-order logic MSO. The goal is to design learning algorithms that run in time polynomial in the size of the training set, independently of or at least sublinear in the size of the whole data set. We prove negative as well as positive results. 
If the data set is an unprocessed string to which our algorithms have local access, then learning in sublinear time is impossible even for hypotheses definable in a small fragment of first-order logic. If we allow for a linear time pre-processing of the string data to build an index data structure, then learning of MSO-definable hypotheses is possible in time polynomial in the size of the training set, independently of the size of the whole data set.} } @InProceedings{angluin17a, title = {The Power of Random Counterexamples}, author = {Dana Angluin and Tyler Dohrn}, booktitle = {Proceedings of the 28th International Conference on Algorithmic Learning Theory}, pages = {452--465}, year = {2017}, editor = {Steve Hanneke and Lev Reyzin}, volume = {76}, series = {Proceedings of Machine Learning Research}, address = {Kyoto University, Kyoto, Japan}, month = {15--17 Oct}, publisher = {PMLR}, pdf = {http://proceedings.mlr.press/v76/angluin17a/angluin17a.pdf}, url = {http://proceedings.mlr.press/v76/angluin17a.html}, abstract = {Learning a target concept from a finite $n \times m$ concept space requires $\Omega{(n)}$ proper equivalence queries in the worst case. We propose a variation of the usual equivalence query in which the teacher is constrained to choose counterexamples randomly from a known probability distribution on examples. 
We present and analyze the Max-Min learning algorithm, which identifies an arbitrary target concept in an arbitrary finite $n \times m$ concept space using at most an expected $\log_2{n}$ proper equivalence queries with random counterexamples.} } @InProceedings{thiemann17a, title = {A Strongly Quasiconvex PAC-Bayesian Bound}, author = {Niklas Thiemann and Christian Igel and Olivier Wintenberger and Yevgeny Seldin}, booktitle = {Proceedings of the 28th International Conference on Algorithmic Learning Theory}, pages = {466--492}, year = {2017}, editor = {Steve Hanneke and Lev Reyzin}, volume = {76}, series = {Proceedings of Machine Learning Research}, address = {Kyoto University, Kyoto, Japan}, month = {15--17 Oct}, publisher = {PMLR}, pdf = {http://proceedings.mlr.press/v76/thiemann17a/thiemann17a.pdf}, url = {http://proceedings.mlr.press/v76/thiemann17a.html}, abstract = {We propose a new PAC-Bayesian bound and a way of constructing a hypothesis space, so that the bound is convex in the posterior distribution and also convex in a trade-off parameter between empirical performance of the posterior distribution and its complexity. The complexity is measured by the Kullback-Leibler divergence to a prior. We derive an alternating procedure for minimizing the bound. We show that the bound can be rewritten as a one-dimensional function of the trade-off parameter and provide sufficient conditions under which the function has a single global minimum. When the conditions are satisfied the alternating minimization is guaranteed to converge to the global minimum of the bound. We provide experimental results demonstrating that rigorous minimization of the bound is competitive with cross-validation in tuning the trade-off between complexity and empirical performance. 
In all our experiments the trade-off turned out to be quasiconvex even when the sufficient conditions were violated.} } @InProceedings{kötzing17a, title = {Normal Forms in Semantic Language Identification}, author = {Timo Kötzing and Martin Schirneck and Karen Seidel}, booktitle = {Proceedings of the 28th International Conference on Algorithmic Learning Theory}, pages = {493--516}, year = {2017}, editor = {Steve Hanneke and Lev Reyzin}, volume = {76}, series = {Proceedings of Machine Learning Research}, address = {Kyoto University, Kyoto, Japan}, month = {15--17 Oct}, publisher = {PMLR}, pdf = {http://proceedings.mlr.press/v76/kötzing17a/kötzing17a.pdf}, url = {http://proceedings.mlr.press/v76/k%C3%B6tzing17a.html}, abstract = {We consider language learning in the limit from text where all learning restrictions are semantic, that is, where any conjecture may be replaced by a semantically equivalent conjecture. For different such learning criteria, starting with the well-known $\mathbf{Txt}\mathbf{G}\mathbf{Bc}$-learning, we consider three different normal forms: strongly locking learning, consistent learning and (partially) set-driven learning. These normal forms support and simplify proofs and give insight into what behaviors are necessary for successful learning (for example when consistency in conservative learning implies cautiousness and strong decisiveness).

We show that strongly locking learning can be assumed for partially set-driven learners, even when learning restrictions apply. We give a very general proof relying only on a natural property of the learning restriction, namely, allowing for simulation on equivalent text. Furthermore, when no restrictions apply, also the converse is true: every strongly locking learner can be made partially set-driven. For several semantic learning criteria we show that learning can be done consistently. Finally, we deduce for which learning restrictions partial set-drivenness and set-drivenness coincide, including a general statement about classes of infinite languages. The latter again relies on a simulation argument.} } @InProceedings{mourtada17a, title = {Efficient tracking of a growing number of experts}, author = {Jaouad Mourtada and Odalric-Ambrym Maillard}, booktitle = {Proceedings of the 28th International Conference on Algorithmic Learning Theory}, pages = {517--539}, year = {2017}, editor = {Steve Hanneke and Lev Reyzin}, volume = {76}, series = {Proceedings of Machine Learning Research}, address = {Kyoto University, Kyoto, Japan}, month = {15--17 Oct}, publisher = {PMLR}, pdf = {http://proceedings.mlr.press/v76/mourtada17a/mourtada17a.pdf}, url = {http://proceedings.mlr.press/v76/mourtada17a.html}, abstract = {We consider a variation on the problem of prediction with expert advice, where new forecasters that were unknown until then may appear at each round. As often in prediction with expert advice, designing an algorithm that achieves near-optimal regret guarantees is straightforward, using aggregation of experts. However, when the comparison class is sufficiently rich, for instance when the best expert and the set of experts itself changes over time, such strategies naively require to maintain a prohibitive number of weights (typically exponential with the time horizon). 
By contrast, designing strategies that both achieve a near-optimal regret and maintain a reasonable number of weights is highly non-trivial. We consider three increasingly challenging objectives (simple regret, shifting regret and sparse shifting regret) that extend existing notions defined for a fixed expert ensemble; in each case, we design strategies that achieve tight regret bounds, adaptive to the parameters of the comparison class, while being computationally inexpensive. Moreover, our algorithms are anytime, agnostic to the number of incoming experts and completely parameter-free. Such remarkable results are made possible thanks to two simple but highly effective recipes: first the "abstention trick" that comes from the

As corollaries, we obtain algorithms for high dimensional mean estimation and stochastic convex optimization in these models that work in more general settings than previously known solutions.} } @InProceedings{wang17a, title = {Learning from Networked Examples}, author = {Yuyi Wang and Zheng-Chu Guo and Jan Ramon}, booktitle = {Proceedings of the 28th International Conference on Algorithmic Learning Theory}, pages = {641--666}, year = {2017}, editor = {Steve Hanneke and Lev Reyzin}, volume = {76}, series = {Proceedings of Machine Learning Research}, address = {Kyoto University, Kyoto, Japan}, month = {15--17 Oct}, publisher = {PMLR}, pdf = {http://proceedings.mlr.press/v76/wang17a/wang17a.pdf}, url = {http://proceedings.mlr.press/v76/wang17a.html}, abstract = {Many machine learning algorithms are based on the assumption that training examples are drawn independently. However, this assumption does not hold anymore when learning from a networked sample because two or more training examples may share some common objects, and hence share the features of these shared objects. We show that the classic approach of ignoring this problem potentially can have a harmful effect on the accuracy of statistics, and then consider alternatives. One of these is to only use independent examples, discarding other information. However, this is clearly suboptimal. We analyze sample error bounds in this networked setting, providing significantly improved results. 
An important component of our approach is formed by efficient sample weighting schemes, which leads to novel concentration inequalities.} } @InProceedings{ding17a, title = {PAC Learning Depth-3 $\textrm{AC}^0$ Circuits of Bounded Top Fanin}, author = {Ning Ding and Yanli Ren and Dawu Gu}, booktitle = {Proceedings of the 28th International Conference on Algorithmic Learning Theory}, pages = {667--680}, year = {2017}, editor = {Steve Hanneke and Lev Reyzin}, volume = {76}, series = {Proceedings of Machine Learning Research}, address = {Kyoto University, Kyoto, Japan}, month = {15--17 Oct}, publisher = {PMLR}, pdf = {http://proceedings.mlr.press/v76/ding17a/ding17a.pdf}, url = {http://proceedings.mlr.press/v76/ding17a.html}, abstract = {An important and long-standing question in computational learning theory is how to learn $\textrm{AC}^0$ circuits with respect to any distribution (i.e. PAC learning). All previous results either require that the underlying distribution is uniform Linial et al. (1993) (or simple variants of the uniform distribution) or restrict the depths of circuits being learned to 1 Valiant (1984) and 2 Klivans and Servedio (2004). As for the circuits of depth 3 or more, it is currently unknown how to PAC learn them. \newline In this paper we present an algorithm to PAC learn depth-3 $\textrm{AC}^0$ circuits of bounded top fanin over $(x_1,\cdots,x_n,\overline{x}_1,\cdots,\overline{x}_n)$. Our result is that every depth-3 $\textrm{AC}^0$ circuit of top fanin $K$ can be computed by a polynomial threshold function (PTF) of degree $\widetilde{O}(K\cdot n^{\frac{1}{2}})$, which means that it can be PAC learned in time $2^{\widetilde{O}(K\cdot n^{\frac{1}{2}})}$. In particular, when $K=O(n^{\epsilon_0})$ for any $\epsilon_0<\frac{1}{2}$, the time for learning is sub-exponential. 
We note that instead of employing some known tools we use some specific approximation in expressing such circuits in PTFs which can thus save a factor of $\textrm{polylog}(n)$ in degrees of the PTFs.} } @InProceedings{joulani17a, title = {A Modular Analysis of Adaptive (Non-)Convex Optimization: Optimism, Composite Objectives, and Variational Bounds}, author = {Pooria Joulani and András György and Csaba Szepesvári}, booktitle = {Proceedings of the 28th International Conference on Algorithmic Learning Theory}, pages = {681--720}, year = {2017}, editor = {Steve Hanneke and Lev Reyzin}, volume = {76}, series = {Proceedings of Machine Learning Research}, address = {Kyoto University, Kyoto, Japan}, month = {15--17 Oct}, publisher = {PMLR}, pdf = {http://proceedings.mlr.press/v76/joulani17a/joulani17a.pdf}, url = {http://proceedings.mlr.press/v76/joulani17a.html}, abstract = {Recently, much work has been done on extending the scope of online learning and incremental stochastic optimization algorithms. In this paper we contribute to this effort in two ways: First, based on a new regret decomposition and a generalization of Bregman divergences, we provide a self-contained, modular analysis of the two workhorses of online learning: (general) adaptive versions of Mirror Descent (MD) and the Follow-the-Regularized-Leader (FTRL) algorithms. The analysis is done with extra care so as not to introduce assumptions not needed in the proofs and allows to combine, in a straightforward way, different algorithmic ideas (e.g., adaptivity, optimism, implicit updates) and learning settings (e.g., strongly convex or composite objectives). This way we are able to reprove, extend and refine a large body of the literature, while keeping the proofs concise. 
The second contribution is a byproduct of this careful analysis: We present algorithms with improved variational bounds for smooth, composite objectives, including a new family of optimistic MD algorithms with only one projection step per round. Furthermore, we provide a simple extension of adaptive regret bounds to practically relevant non-convex problem settings with essentially no extra effort.} }