@Proceedings{ACML2014,
title = {Proceedings of Machine Learning Research},
booktitle = {Proceedings of Machine Learning Research},
editor = {Dinh Phung and Hang Li},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
volume = 39
}
@InProceedings{preface,
title = {Preface},
author = {Dinh Phung and Hang Li},
booktitle = {Proceedings of the Sixth Asian Conference on Machine Learning},
pages = {i--xiv},
year = {2015},
editor = {Dinh Phung and Hang Li},
volume = {39},
series = {Proceedings of Machine Learning Research},
address = {Nha Trang City, Vietnam},
month = {26--28 Nov},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v39/preface.pdf},
url = {http://proceedings.mlr.press/v39/preface.html},
abstract = {Preface to ACML 2014}
}
@InProceedings{tanaka14,
title = {Theoretical Analyses on Ensemble and Multiple Kernel Regressors},
author = {Akira Tanaka and Ichigaku Takigawa and Hideyuki Imai and Mineichi Kudo},
booktitle = {Proceedings of the Sixth Asian Conference on Machine Learning},
pages = {1--15},
year = {2015},
editor = {Dinh Phung and Hang Li},
volume = {39},
series = {Proceedings of Machine Learning Research},
address = {Nha Trang City, Vietnam},
month = {26--28 Nov},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v39/tanaka14.pdf},
url = {http://proceedings.mlr.press/v39/tanaka14.html},
abstract = {For the last few decades, a combination of different learning machines so-called ensemble learning, including learning with multiple kernels, has attracted much attention in the field of machine learning. Although its efficacy was revealed numerically in many works, its theoretical grounds are not investigated sufficiently. In this paper, we discuss regression problems with a class of kernels whose corresponding reproducing kernel Hilbert spaces have a common subspace with an invariant metric and prove that the ensemble kernel regressor (the mean of kernel regressors with those kernels) gives a better learning result than the multiple kernel regressor (the kernel regressor with the sum of those kernels) in terms of the generalization ability of a model space.}
}
@InProceedings{sun14,
title = {Sparsity on Statistical Simplexes and Diversity in Social Ranking},
author = {Ke Sun and Hisham Mohamed and Stephane Marchand-Maillet},
booktitle = {Proceedings of the Sixth Asian Conference on Machine Learning},
pages = {16--31},
year = {2015},
editor = {Dinh Phung and Hang Li},
volume = {39},
series = {Proceedings of Machine Learning Research},
address = {Nha Trang City, Vietnam},
month = {26--28 Nov},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v39/sun14.pdf},
url = {http://proceedings.mlr.press/v39/sun14.html},
abstract = {Sparsity in \Re^m has been widely explored in machine learning. We study sparsity on a statistical simplex consisting of all categorical distributions. This is different from the case in \Re^m because such a simplex is a Riemannian manifold, a curved space. A learner with sparse constraints should be likely to fall to its low-dimensional boundaries. We present a novel analysis on the statistical simplex as a manifold with boundary. The main contribution is an explicit view of the learning dynamics in between high-dimensional models in the interior of the simplex and low-dimensional models on its boundaries. We prove the differentiability of the cost function, the natural gradient with respect to the Riemannian structure, and convexity around the singular regions. We uncover an interesting relationship with L_1 regularization. We apply the proposed technique to social network analysis. Given a directed graph, the task is to rank a subset of influencer nodes. Here, sparsity means that the top-ranked nodes should present diversity in the sense of minimizing influence overlap. We present a ranking algorithm based on the natural gradient. It can scale up to graph datasets with millions of nodes. On real large networks, the top-ranked nodes are the most informative among several commonly-used techniques.}
}
@InProceedings{alabdulmohsin14,
title = {Support vector machines with indefinite kernels},
author = {Ibrahim Alabdulmohsin and Xin Gao and Xiangliang Zhang Zhang},
booktitle = {Proceedings of the Sixth Asian Conference on Machine Learning},
pages = {32--47},
year = {2015},
editor = {Dinh Phung and Hang Li},
volume = {39},
series = {Proceedings of Machine Learning Research},
address = {Nha Trang City, Vietnam},
month = {26--28 Nov},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v39/alabdulmohsin14.pdf},
url = {http://proceedings.mlr.press/v39/alabdulmohsin14.html},
abstract = {Training support vector machines (SVM) with indefinite kernels has recently attracted attention in the machine learning community. This is partly due to the fact that many similarity functions that arise in practice are not symmetric positive semidefinite, i.e. the Mercer condition is not satisfied, or the Mercer condition is difficult to verify. Previous work on training SVM with indefinite kernels has generally fallen into three categories: (1) positive semidefinite kernel approximation, (2) non-convex optimization, and (3) learning in Krein spaces. All approaches are not fully satisfactory. They have either introduced sources of inconsistency in handling training and test examples using kernel approximation, settled for approximate local minimum solutions using non-convex optimization, or produced non-sparse solutions. In this paper, we establish both theoretically and experimentally that the 1-norm SVM, proposed more than 10 years ago for embedded feature selection, is a better solution for extending SVM to indefinite kernels. More specifically, 1-norm SVM can be interpreted as a structural risk minimization method that seeks a decision boundary with large similarity margin in the original space. It uses a linear programming formulation that remains convex even if the kernel matrix is indefinite, and hence can always be solved quite efficiently. Also, it uses the indefinite similarity function (or distance) directly without any transformation, and, hence, it always treats both training and test examples consistently. Finally, it achieves the highest accuracy among all methods that train SVM with indefinite kernels with a statistically significant evidence while also retaining sparsity of the support vector set.}
}
@InProceedings{canevet14a,
title = {Efficient Sample Mining for Object Detection},
author = {Olivier Canevet and Francois Fleuret},
booktitle = {Proceedings of the Sixth Asian Conference on Machine Learning},
pages = {48--63},
year = {2015},
editor = {Dinh Phung and Hang Li},
volume = {39},
series = {Proceedings of Machine Learning Research},
address = {Nha Trang City, Vietnam},
month = {26--28 Nov},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v39/canevet14a.pdf},
url = {http://proceedings.mlr.press/v39/canevet14a.html},
abstract = {Object detectors based on the sliding window technique are usually trained in two successive steps: first, an initial classifier is trained on a population of positive samples (i.e. images of the object to detect) and negative samples randomly extracted from scenes which do not contain the object to detect. Then, the scenes are scanned with that initial classifier to enrich the initial set with negative samples incorrectly classified as positive. This bootstrapping process provides the learning algorithm with "hard" samples, which help to improve the decision boundary. Little work has been done on how to efficiently enrich the training set. While the standard bootstrapping approach densely visits the scenes, we propose to evaluate which regions of scenes can be discarded without any further computation to concentrate the search on promising areas. We apply our method to two standard object detection settings, pedestrian and face detection, and show that it provides a multi-fold speed up.}
}
@InProceedings{canevet14b,
title = {Sample Distillation for Object Detection and Image Classification},
author = {Olivier Canevet and Leonidas Lefakis and Francois Fleuret},
booktitle = {Proceedings of the Sixth Asian Conference on Machine Learning},
pages = {64--79},
year = {2015},
editor = {Dinh Phung and Hang Li},
volume = {39},
series = {Proceedings of Machine Learning Research},
address = {Nha Trang City, Vietnam},
month = {26--28 Nov},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v39/canevet14b.pdf},
url = {http://proceedings.mlr.press/v39/canevet14b.html},
abstract = {We propose a novel approach to efficiently select informative samples for large-scale learning. Instead of directly feeding a learning algorithm with a very large amount of samples, as it is usually done to reach state-of-the-art performance, we have developed a "distillation" procedure to recursively reduce the size of an initial training set using a criterion that ensures the maximization of the information content of the selected sub-set. We demonstrate the performance of this procedure for two different computer vision problems. First, we show that distillation can be used to improve the traditional bootstrapping approach to object detection. Second, we apply distillation to a classification problem with artificial distortions. We show that in both cases, using the result of a distillation process instead of a random sub-set taken uniformly in the original sample set improves performance significantly.}
}
@InProceedings{than14,
title = {Dual online inference for latent {D}irichlet allocation},
author = {Khoat Than and Tung Doan},
booktitle = {Proceedings of the Sixth Asian Conference on Machine Learning},
pages = {80--95},
year = {2015},
editor = {Dinh Phung and Hang Li},
volume = {39},
series = {Proceedings of Machine Learning Research},
address = {Nha Trang City, Vietnam},
month = {26--28 Nov},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v39/than14.pdf},
url = {http://proceedings.mlr.press/v39/than14.html},
abstract = {Latent Dirichlet allocation (LDA) provides an efficient tool to analyze very large text collections. In this paper, we discuss three novel contributions: (1) a proof for the tractability of the MAP estimation of topic mixtures under certain conditions that might fit well with practices, even though the problem is known to be intractable in the worst case; (2) a provably fast algorithm (OFW) for inferring topic mixtures; (3) a dual online algorithm (DOLDA) for learning LDA at a large scale. We show that OFW converges to some local optima, but under certain conditions it can converge to global optima. The discussion of OFW is very general and hence can be readily employed to accelerate the MAP estimation in a wide class of probabilistic models. From extensive experiments we find that DOLDA can achieve significantly better predictive performance and more interpretable topics, with lower runtime, than stochastic variational inference. Further, DOLDA enables us to easily analyze text streams or millions of documents.}
}
@InProceedings{tagawa14,
title = {Structured Denoising Autoencoder for Fault Detection and Analysis},
author = {Takaaki Tagawa and Yukihiro Tadokoro and Takehisa Yairi},
booktitle = {Proceedings of the Sixth Asian Conference on Machine Learning},
pages = {96--111},
year = {2015},
editor = {Dinh Phung and Hang Li},
volume = {39},
series = {Proceedings of Machine Learning Research},
address = {Nha Trang City, Vietnam},
month = {26--28 Nov},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v39/tagawa14.pdf},
url = {http://proceedings.mlr.press/v39/tagawa14.html},
abstract = {This paper proposes a new fault detection and analysis approach which can leverage incomplete prior information. Conventional data-driven approaches suffer from the problem of overfitting and result in high rates of false positives, and model-driven approaches suffer from a lack of specific information about complex systems. We overcome these problems by modifying the denoising autoencoder (DA), a data-driven method, to form a new approach, called the structured denoising autoencoder (SDA), which can utilize incomplete prior information. The SDA does not require specific information and can perform well without overfitting. In particular, an empirical analysis with synthetic data revealed that the SDA performs better than the DA even when there is partially incorrect or abstract information. An evaluation using real data from moving cars also showed that the SDA with incomplete knowledge outperformed conventional methods. Surprisingly, the SDA results were better even though the parameters of the conventional methods were tuned using faulty data, which are normally unknown. In addition, the SDA fault analysis was able to extract the true causes of the changes within the faulty data; the other methods were unable to do this. Thus, only our proposed method can explain why the faults occurred.}
}
@InProceedings{klami14,
title = {{P}olya-gamma augmentations for factor models},
author = {Arto Klami},
booktitle = {Proceedings of the Sixth Asian Conference on Machine Learning},
pages = {112--128},
year = {2015},
editor = {Dinh Phung and Hang Li},
volume = {39},
series = {Proceedings of Machine Learning Research},
address = {Nha Trang City, Vietnam},
month = {26--28 Nov},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v39/klami14.pdf},
url = {http://proceedings.mlr.press/v39/klami14.html},
abstract = {Bayesian inference for latent factor models, such as principal component and canonical correlation analysis, is easy for Gaussian likelihoods. In particular, full conjugacy makes both Gibbs samplers and mean-field variational approximations straightforward. For other likelihood potentials one needs to either resort to more complex sampling schemes or to specifying dedicated forms of variational lower bounds. Recently, however, it was shown that for specific likelihoods related to the logistic function it is possible to augment the joint density with auxiliary variables following a Polya-Gamma distribution, leading to closed-form updates for binary and over-dispersed count models. In this paper we describe how Gibbs sampling and mean-field variational approximation for various latent factor models can be implemented for these cases, presenting easy-to-implement and efficient inference schemas.}
}
@InProceedings{kimura14,
title = {A Fast Hierarchical Alternating Least Squares Algorithm for Orthogonal Nonnegative Matrix Factorization},
author = {Keigo Kimura and Yuzuru Tanaka and Mineichi Kudo},
booktitle = {Proceedings of the Sixth Asian Conference on Machine Learning},
pages = {129--141},
year = {2015},
editor = {Dinh Phung and Hang Li},
volume = {39},
series = {Proceedings of Machine Learning Research},
address = {Nha Trang City, Vietnam},
month = {26--28 Nov},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v39/kimura14.pdf},
url = {http://proceedings.mlr.press/v39/kimura14.html},
abstract = {Nonnegative Matrix Factorization (NMF) is a popular technique in a variety of fields due to its component-based representation with physical interpretablity. NMF finds a nonnegative hidden structures as oblique bases and coefficients. Recently, Orthogonal NMF (ONMF), imposing an orthogonal constraint into NMF, has been gathering a great deal of attention. ONMF is more appropriate for the clustering task because the resultant constrained matrix consisting of the coefficients can be considered as an indicator matrix. All traditional ONMF algorithms are based on multiplicative update rules or project gradient descent method. However, these algorithms are slow in convergence compared with the state-of-the-art algorithms used for regular NMF. This is because they update a matrix in each iteration step. In this paper, therefore, we propose to update the current matrix column-wisely using Hierarchical Alternating Least Squares algorithm (HALS) that is typically used for NMF. The orthogonality and nonnegativity constraints are both utilized efficiently in the column-wise update procedure. Through theoretical analysis and experiments on six real-life datasets, it was shown that the proposed algorithm converges faster than the other conventional ONMF algorithms due to a smaller number of iterations, although the theoretical complexity is the same. It was also shown that the orthogonality is also attained in an earlier stage.}
}
@InProceedings{lim14,
title = {Bibliographic Analysis with the Citation Network Topic Model},
author = {Kar Wai Lim and Wray Buntine},
booktitle = {Proceedings of the Sixth Asian Conference on Machine Learning},
pages = {142--158},
year = {2015},
editor = {Dinh Phung and Hang Li},
volume = {39},
series = {Proceedings of Machine Learning Research},
address = {Nha Trang City, Vietnam},
month = {26--28 Nov},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v39/lim14.pdf},
url = {http://proceedings.mlr.press/v39/lim14.html},
abstract = {Bibliographic analysis considers author’s research areas, the citation network and paper content among other things. In this paper, we combine these three in a topic model that produces a bibliographic model of authors, topics and documents using a non-parametric extension of a combination of the Poisson mixed-topic link model and the author-topic model. We propose a novel and efficient inference algorithm for the model to explore subsets of research publications from CiteSeerX. Our model demonstrates improved performance in both model fitting and clustering task comparing to several baselines.}
}
@InProceedings{givchi14,
title = {Quasi Newton Temporal Difference Learning},
author = {Arash Givchi and Maziar Palhang},
booktitle = {Proceedings of the Sixth Asian Conference on Machine Learning},
pages = {159--172},
year = {2015},
editor = {Dinh Phung and Hang Li},
volume = {39},
series = {Proceedings of Machine Learning Research},
address = {Nha Trang City, Vietnam},
month = {26--28 Nov},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v39/givchi14.pdf},
url = {http://proceedings.mlr.press/v39/givchi14.html},
abstract = {Fast convergent and computationally inexpensive policy evaluation is an essential part of reinforcement learning algorithms based on policy iteration. Algorithms such as LSTD, LSPE, FPKF and NTD, have faster convergence rate but they are computationally slow. On the other hand, there are algorithms that are computationally fast but with slower convergence rate, among them are TD, RG, GTD2 and TDC. This paper presents a regularized Quasi Newton Temporal Difference learning algorithm which uses the second-order information while maintaining a fast convergence rate. In simple language, we combine the idea of TD learning with Quasi Newton algorithm SGD-QN. We explore the development of QNTD algorithm and discuss its convergence properties. We support our ideas with empirical results on 4 standard benchmarks in reinforcement learning literature with 2 small problems, Random Walk and Boyan chain and 2 bigger problems, cart-pole and linked-pole balancing. Empirical studies show that QNTD speeds up convergence and provides better accuracy in comparison to the conventional TD.}
}
@InProceedings{auger14,
title = {Sparse binary zero-sum games},
author = {David Auger and Jianlin Liu and Sylkvie Ruette and David Saint-Pierre and Oliver Teytaud},
booktitle = {Proceedings of the Sixth Asian Conference on Machine Learning},
pages = {173--188},
year = {2015},
editor = {Dinh Phung and Hang Li},
volume = {39},
series = {Proceedings of Machine Learning Research},
address = {Nha Trang City, Vietnam},
month = {26--28 Nov},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v39/auger14.pdf},
url = {http://proceedings.mlr.press/v39/auger14.html},
abstract = {Solving zero-sum matrix games is polynomial, because it boils down to linear programming. The approximate solving is sublinear by randomized algorithms on machines with random access memory. Algorithms working separately and independently on columns and rows have been proposed, with the same performance; these versions are compliant with matrix games with stochastic reward. [1] has proposed a new version, empirically performing better on sparse problems, i.e. cases in which the Nash equilibrium has small support. In this paper, we propose a variant, similar to their work, also dedicated to sparse problems, with provably better bounds than existing methods. We then experiment the method on a card game.}
}
@InProceedings{antoniuk14,
title = {Interval Insensitive Loss for Ordinal Classification},
author = {Kostiantyn Antoniuk and Vojtech Franc and Vaclav Hlavac},
booktitle = {Proceedings of the Sixth Asian Conference on Machine Learning},
pages = {189--204},
year = {2015},
editor = {Dinh Phung and Hang Li},
volume = {39},
series = {Proceedings of Machine Learning Research},
address = {Nha Trang City, Vietnam},
month = {26--28 Nov},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v39/antoniuk14.pdf},
url = {http://proceedings.mlr.press/v39/antoniuk14.html},
abstract = {We address a problem of learning ordinal classifier from partially annotated examples. We introduce an interval-insensitive loss function to measure discrepancy between predictions of an ordinal classifier and a partial annotation provided in the form of intervals of admissible labels. The proposed interval-insensitive loss is an instance of loss functions previously used for learning of different classification models from partially annotated examples. We propose several convex surrogates of the interval-insensitive loss which can be efficiently optimized by existing solvers. Experiments on standard benchmarks and a real-life application show that learning ordinal classifiers from partially annotated examples is competitive to the so-far used methods learning from the complete annotation.}
}
@InProceedings{xiong14,
title = {Towards Maximum Likelihood: Learning Undirected Graphical Models using Persistent Sequential Monte Carlo},
author = {Hanchen Xiong and Sandor Szedmak and Justus Piater},
booktitle = {Proceedings of the Sixth Asian Conference on Machine Learning},
pages = {205--220},
year = {2015},
editor = {Dinh Phung and Hang Li},
volume = {39},
series = {Proceedings of Machine Learning Research},
address = {Nha Trang City, Vietnam},
month = {26--28 Nov},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v39/xiong14.pdf},
url = {http://proceedings.mlr.press/v39/xiong14.html},
abstract = {Along with the emergence of algorithms such as persistent contrastive divergence (PCD), tempered transition and parallel tempering, the past decade has witnessed a revival of learning undirected graphical models (UGMs) with sampling-based approximations. In this paper, based upon the analogy between Robbins-Monro’s stochastic approximation procedure and sequential Monte Carlo (SMC), we analyze the strengths and limitations of state-of-the-art learning algorithms from an SMC point of view. Moreover, we apply the rationale further in sampling at each iteration, and propose to learn UGMs using persistent sequential Monte Carlo (PSMC). The whole learning procedure is based on the samples from a long, persistent sequence of distributions which are actively constructed. Compared to the above-mentioned algorithms, one critical strength of PSMC- based learning is that it can explore the sampling space more effectively. In particular, it is robust when learning rates are large or model distributions are high-dimensional and thus multi-modal, which often causes other algorithms to deteriorate. We tested PSMC learning, also with other related methods, on carefully-designed experiments with both synthetic and real-word data, and our empirical results demonstrate that PSMC compares favorably with the state of the art.}
}
@InProceedings{zhang14,
title = {Nonlinear Dimensionality Reduction of Data by Deep Distributed Random Samplings},
author = {Xiao-Lei Zhang},
booktitle = {Proceedings of the Sixth Asian Conference on Machine Learning},
pages = {221--233},
year = {2015},
editor = {Dinh Phung and Hang Li},
volume = {39},
series = {Proceedings of Machine Learning Research},
address = {Nha Trang City, Vietnam},
month = {26--28 Nov},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v39/zhang14.pdf},
url = {http://proceedings.mlr.press/v39/zhang14.html},
abstract = {Dimensionality reduction is a fundamental problem of machine learning, and has been intensively studied, where classification and clustering are two special cases of dimensionality reduction that reduce high-dimensional data to discrete points. Here we describe a simple multilayer network for dimensionality reduction that each layer of the network is a group of mutually independent k-centers clusterings. We find that the network can be trained successfully layer-by-layer by simply assigning the centers of each clustering by randomly sampled data points from the input. Our results show that the described simple method outperformed 7 well-known dimensionality reduction methods on both very small-scale biomedical data and large-scale image and document data, with less training time than multilayer neural networks on large-scale data.}
}
@InProceedings{zhu14,
title = {Learning with Augmented Multi-Instance View},
author = {Yue Zhu and Jianxin Wu and Yuan Jiang and Zhi-Hua Zhou},
booktitle = {Proceedings of the Sixth Asian Conference on Machine Learning},
pages = {234--249},
year = {2015},
editor = {Dinh Phung and Hang Li},
volume = {39},
series = {Proceedings of Machine Learning Research},
address = {Nha Trang City, Vietnam},
month = {26--28 Nov},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v39/zhu14.pdf},
url = {http://proceedings.mlr.press/v39/zhu14.html},
abstract = {In this paper, we propose the Augmented Multi-Instance View (AMIV) framework to construct a better model by exploiting augmented information. For example, abstract screening tasks may be difficult because only abstract information is available, whereas the performance can be improved when the abstracts of references listed in the document can be exploited as augmented information. If each abstract is represented as an instance (i.e., a feature vector) x, then with the augmented information, it can be represented as an instance-bag pair (x;B), where B is a bag of instances (i.e., the abstracts of references). Note that if x has a label y, then we assume that there must exist at least one instance in the bag B having the label y. We regard x and B as two views, i.e., a single-instance view augmented with a multi-instance view, and propose the AMIV-lss approach by establishing a latent semantic subspace between the two views. The AMIV framework can be applied when the augmented information is presented as multi-instance bags and to the best of our knowledge, such a learning with augmented multi-instance view problem has not been touched before. Experimental results on twelve TechPaper datasets, five PubMed data sets and a WebPage data set validate the effectiveness of our AMIV-lss approach.}
}
@InProceedings{moridomi14,
title = {Online matrix prediction for sparse loss matrices},
author = {Ken-ichiro Moridomi and Kohei Hatano and Eiji Takimoto and Koji Tsuda},
booktitle = {Proceedings of the Sixth Asian Conference on Machine Learning},
pages = {250--265},
year = {2015},
editor = {Dinh Phung and Hang Li},
volume = {39},
series = {Proceedings of Machine Learning Research},
address = {Nha Trang City, Vietnam},
month = {26--28 Nov},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v39/moridomi14.pdf},
url = {http://proceedings.mlr.press/v39/moridomi14.html},
abstract = {We consider an online matrix prediction problem. The FTRL is a famous method to deal with online prediction task, which makes prediction by minimizing cumulative loss function and regularizer function. There are three popular regularizer functions for matrices, Frobenius norm, quantum relative entropy and log-determinant. We propose a FTRL based algorithm with log-determinant as regularizer and show regret bound of algorithm. Our main contribution is to show that log-determinant regularization is efficient when sparse loss function setting. We also show the optimal performance algorithm for online collaborative filtering problem with log-determinant regularization.}
}
@InProceedings{lu14,
title = {Online Passive Aggressive Active Learning and Its Applications},
author = {Jing Lu and Peilin Zhao and Steven Hoi},
booktitle = {Proceedings of the Sixth Asian Conference on Machine Learning},
pages = {266--282},
year = {2015},
editor = {Dinh Phung and Hang Li},
volume = {39},
series = {Proceedings of Machine Learning Research},
address = {Nha Trang City, Vietnam},
month = {26--28 Nov},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v39/lu14.pdf},
url = {http://proceedings.mlr.press/v39/lu14.html},
abstract = {We investigate online active learning techniques for classification tasks in data stream mining applications. Unlike traditional learning approaches (either batch or online learning) that often require to request class label of each incoming instance, online active learning queries only a subset of informative incoming instances to update the classification model, which aims to maximize the classification performance using the minimal human labeling effort during the entire online stream data mining tasks. In this paper, we present a new family of algorithms for online active learning called Passive-Aggressive Active (PAA) learning algorithms by adapting the popular Passive-Aggressive algorithms in an online active learning setting. Unlike the conventional Perceptron-based approach that employs only the misclassified instances for updating the model, the proposed PAA learning algorithms not only use the misclassified instances to update the classifier, but also exploit those correctly classified examples yet with low prediction confidence. We theoretically analyze the mistakes bounds of the proposed algorithms and conduct extensive experiments to examine their empirical performance, in which the encouraging results show clear advantages of our algorithms over the baselines.}
}
@InProceedings{liu14,
title = {Ordinal Random Fields for Recommender Systems},
author = {Shaowu Liu and Truyen Tran and Gang Li},
booktitle = {Proceedings of the Sixth Asian Conference on Machine Learning},
pages = {283--298},
year = {2015},
editor = {Dinh Phung and Hang Li},
volume = {39},
series = {Proceedings of Machine Learning Research},
address = {Nha Trang City, Vietnam},
month = {26--28 Nov},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v39/liu14.pdf},
url = {http://proceedings.mlr.press/v39/liu14.html},
abstract = {Recommender Systems heavily rely on numerical preferences, whereas the importance of ordinal preferences has only been recognised in recent works of Ordinal Matrix Factorisation (OMF). Although the OMF can effectively exploit ordinal properties, it captures only the higher-order interactions among users and items, without considering the localised interactions properly. This paper employs Markov Random Fields (MRF) to investigate the localised interactions, and proposes a unified model called Ordinal Random Fields (ORF) to take advantages of both the representational power of the MRF and the ease of modelling ordinal preferences by the OMF. Experimental result on public datasets demonstrates that the proposed ORF model can capture both types of interactions, resulting in improved recommendation accuracy.}
}
@InProceedings{daswani14,
title = {Reinforcement learning with value advice},
author = {Mayank Daswani and Peter Sunehag and Marcus Hutter},
booktitle = {Proceedings of the Sixth Asian Conference on Machine Learning},
pages = {299--314},
year = {2015},
editor = {Dinh Phung and Hang Li},
volume = {39},
series = {Proceedings of Machine Learning Research},
address = {Nha Trang City, Vietnam},
month = {26--28 Nov},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v39/daswani14.pdf},
url = {http://proceedings.mlr.press/v39/daswani14.html},
abstract = {The problem we consider in this paper is reinforcement learning with value advice. In this setting, the agent is given limited access to an oracle that can tell it the expected return (value) of any state-action pair with respect to the optimal policy. The agent must use this value to learn an explicit policy that performs well in the environment. We provide an algorithm called RLAdvice, based on the imitation learning algorithm DAgger. We illustrate the effectiveness of this method in the Arcade Learning Environment on three different games, using value estimates from UCT as advice.}
}
@InProceedings{nakamura14,
title = {A {UCB}-Like Strategy of Collaborative Filtering},
author = {Atsuyoshi Nakamura},
booktitle = {Proceedings of the Sixth Asian Conference on Machine Learning},
pages = {315--329},
year = {2015},
editor = {Dinh Phung and Hang Li},
volume = {39},
series = {Proceedings of Machine Learning Research},
address = {Nha Trang City, Vietnam},
month = {26--28 Nov},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v39/nakamura14.pdf},
url = {http://proceedings.mlr.press/v39/nakamura14.html},
abstract = {We consider a direct mail problem in which a system repeats the following process every day during some period: select a set of user-item pairs (u,i), send a recommendation mail of item i to user u for each selected pair (u,i), and receive a response from each user. We assume that each response can be obtained before the next process and through the response, the system can know the user’s evaluation of the recommended item directly or indirectly. Each pair (u,i) can be selected at most once during the period. If the total number of selections is very small compared to the number of entries in the whole user-item matrix, what selection strategy should be used to maximize the total sum of users’ evaluations during the period? We consider a UCB-like strategy for this problem, and show two methods using the strategy. The effectiveness of our methods are demonstrated by experiments using synthetic and real datasets.}
}
@InProceedings{ko14,
title = {Variational Gaussian Inference for Bilinear Models of Count Data},
author = {Young-Jun Ko and Mohammad Khan},
booktitle = {Proceedings of the Sixth Asian Conference on Machine Learning},
pages = {330--343},
year = {2015},
editor = {Dinh Phung and Hang Li},
volume = {39},
series = {Proceedings of Machine Learning Research},
address = {Nha Trang City, Vietnam},
month = {26--28 Nov},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v39/ko14.pdf},
url = {http://proceedings.mlr.press/v39/ko14.html},
abstract = {Bilinear models of count data with Poisson distribution are popular in applications such as matrix factorization for recommendation systems, modeling of receptive fields of sensory neurons, and modeling of neural-spike trains. Bayesian inference in such models remains challenging due to the product term of two Gaussian random vectors. In this paper, we propose new algorithms for such models based on variational Gaussian (VG) inference. We make two contributions. First, we show that the VG lower bound for these models, previously known to be intractable, is available in closed form under certain non-trivial constraints on the form of the posterior. Second, we show that the lower bound is bi- concave and can be efficiently optimized for mean-field approximations. We also show that bi-concavity generalizes to the larger family of log-concave likelihoods that subsume the Poisson distribution. We present new inference algorithms based on these results and demonstrate better performance on real-world problems at the cost of a modest increase in computation. Our contributions in this paper, therefore, provide more choices for Bayesian inference in terms of a speed-vs-accuracy tradeoff.}
}
@InProceedings{chou14,
title = {Pseudo-reward Algorithms for Contextual Bandits with Linear Payoff Functions},
author = {Ku-Chun Chou and Hsuan-Tien Lin and Chao-Kai Chiang and Chi-Jen Lu},
booktitle = {Proceedings of the Sixth Asian Conference on Machine Learning},
pages = {344--359},
year = {2015},
editor = {Dinh Phung and Hang Li},
volume = {39},
series = {Proceedings of Machine Learning Research},
address = {Nha Trang City, Vietnam},
month = {26--28 Nov},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v39/chou14.pdf},
url = {http://proceedings.mlr.press/v39/chou14.html},
abstract = {We study the contextual bandit problem with linear payoff functions, which is a generalization of the traditional multi-armed bandit problem. In the contextual bandit problem, the learner needs to iteratively select an action based on an observed context, and receives a linear score on only the selected action as the reward feedback. Motivated by the observation that better performance is achievable if the other rewards on the non-selected actions can also be revealed to the learner, we propose a new framework that feeds the learner with pseudo-rewards, which are estimates of the rewards on the non-selected actions. We argue that the pseudo-rewards should better contain over-estimates of the true rewards, and propose a forgetting mechanism to decrease the negative influence of the over-estimation in the long run. Then, we couple the two key ideas above with the linear upper confidence bound (LinUCB) algorithm to design a novel algorithm called linear pseudo-reward upper confidence bound (LinPRUCB). We prove that LinPRUCB shares the same order of regret bound to LinUCB, while enjoying the practical observation of faster reward-gathering in the earlier iterations. Experiments on artificial and real-world data sets justify that LinPRUCB is competitive to and sometimes even better than LinUCB. Furthermore, we couple LinPRUCB with a special parameter to formalize a new algorithm that yields faster computation in updating the internal models while keeping the promising practical performance. The two properties match the real-world needs of the contextual bandit problem and make the new algorithm a favorable choice in practice.}
}
@InProceedings{oliveira14,
title = {Ensembles for Time Series Forecasting},
author = {Mariana Oliveira and Luis Torgo},
booktitle = {Proceedings of the Sixth Asian Conference on Machine Learning},
pages = {360--370},
year = {2015},
editor = {Dinh Phung and Hang Li},
volume = {39},
series = {Proceedings of Machine Learning Research},
address = {Nha Trang City, Vietnam},
month = {26--28 Nov},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v39/oliveira14.pdf},
url = {http://proceedings.mlr.press/v39/oliveira14.html},
abstract = {This paper describes a new type of ensembles that aims at improving the predictive performance of these approaches in time series forecasting. Ensembles are recognised as one of the most successful approaches to prediction tasks. Previous theoretical studies of ensembles have shown that one of the key reasons for this performance is diversity among ensemble members. Several methods exist to generate diversity. The key idea of the work we are presenting here is to propose a new form of diversity generation that explores some specific properties of time series prediction tasks. Our hypothesis is that the resulting ensemble members will be better at addressing different dynamic regimes of time series data. Our large set of experiments confirms that the methods we have explored for generating diversity are able to improve the performance of the equivalent ensembles with standard diversity generation procedures.}
}
@InProceedings{lin14,
title = {Reduction from Cost-Sensitive Multiclass Classification to One-versus-One Binary Classification},
author = {Hsuan-Tien Lin},
booktitle = {Proceedings of the Sixth Asian Conference on Machine Learning},
pages = {371--386},
year = {2015},
editor = {Dinh Phung and Hang Li},
volume = {39},
series = {Proceedings of Machine Learning Research},
address = {Nha Trang City, Vietnam},
month = {26--28 Nov},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v39/lin14.pdf},
url = {http://proceedings.mlr.press/v39/lin14.html},
abstract = {Many real-world applications require varying costs for different types of mis-classification errors. Such a cost-sensitive classification setup can be very different from the regular classification one, especially in the multiclass case. Thus, traditional meta-algorithms for regular multiclass classification, such as the popular one-versus-one approach, may not always work well under the cost-sensitive classification setup. In this paper, we extend the one-versus-one approach to the field of cost-sensitive classification. The extension is derived using a rigorous mathematical tool called the cost-transformation technique, and takes the original one-versus-one as a special case. Experimental results demonstrate that the proposed approach can achieve better performance in many cost-sensitive classification scenarios when compared with the original one-versus-one as well as existing cost-sensitive classification algorithms.}
}