% Parent record for PMLR volume 47 (Workshop on Statistically Sound Data Mining).
% Both title and booktitle are kept so classic-BibTeX crossref inheritance works.
% NOTE(review): year is taken from the papers of this volume, which all carry 2015 -- confirm.
@Proceedings{SSDM2014,
  title     = {Proceedings of the Workshop on Statistically Sound Data Mining at ECML/PKDD},
  booktitle = {Proceedings of the Workshop on Statistically Sound Data Mining at ECML/PKDD},
  editor    = {Hämäläinen, Wilhelmiina and Petitjean, François and Webb, Geoffrey I.},
  publisher = {PMLR},
  series    = {Proceedings of Machine Learning Research},
  volume    = {47},
  year      = {2015},
}
% Preface of PMLR volume 47, written by the three volume editors (pp. 1--2).
% The day of the month is stored in the separate day field so that month can be the plain macro.
% NOTE(review): the abstract dates the workshop to 15 September 2014 while year is 2015 -- confirm
% which year the month/day pair is meant to accompany.
@InProceedings{pmlr-v47-preface,
  title     = {Preface to the 1st {ECML/PKDD} workshop on {Statistically Sound Data Mining}},
  author    = {Hämäläinen, Wilhelmiina and Petitjean, François and Webb, Geoffrey I.},
  booktitle = {Proceedings of the Workshop on Statistically Sound Data Mining at ECML/PKDD},
  pages     = {1--2},
  year      = {2015},
  editor    = {Hämäläinen, Wilhelmiina and Petitjean, François and Webb, Geoffrey I.},
  volume    = {47},
  series    = {Proceedings of Machine Learning Research},
  address   = {Nancy, France},
  month     = sep,
  day       = {15},
  publisher = {PMLR},
  pdf       = {https://proceedings.mlr.press/v47/preface.pdf},
  url       = {https://proceedings.mlr.press/v47/preface.html},
  abstract  = {These proceedings contain the papers accepted to the 1st ECML/PKDD Workshop on Statistically Sound Data Mining, which took place at the French Institute for Computer Science (INRIA) in Nancy, at the opening of the European Conference on Machine Learning and Principles and Practice of Knowledge Discovery in Databases (ECML/PKDD) on the 15th of September 2014.},
}
% Paper in PMLR volume 47, pp. 3--20.
% The day of the month is stored in the separate day field so that month can be the plain macro.
@InProceedings{pmlr-v47-vanwinckelen14a,
  title     = {Look before you leap: Some insights into learner evaluation with cross-validation},
  author    = {Vanwinckelen, Gitte and Blockeel, Hendrik},
  booktitle = {Proceedings of the Workshop on Statistically Sound Data Mining at ECML/PKDD},
  pages     = {3--20},
  year      = {2015},
  editor    = {Hämäläinen, Wilhelmiina and Petitjean, François and Webb, Geoffrey I.},
  volume    = {47},
  series    = {Proceedings of Machine Learning Research},
  address   = {Nancy, France},
  month     = sep,
  day       = {15},
  publisher = {PMLR},
  pdf       = {https://proceedings.mlr.press/v47/vanwinckelen14a.pdf},
  url       = {https://proceedings.mlr.press/v47/vanwinckelen14a.html},
  abstract  = {Machine learning is largely an experimental science, of which the evaluation of predictive models is an important aspect. These days, cross-validation is the most widely used method for this task. There are, however, a number of important points that should be taken into account when using this methodology. First, one should clearly state what they are trying to estimate. Namely, a distinction should be made between the evaluation of a model learned on a single dataset, and that of a learner trained on a random sample from a given data population. Each of these two questions requires a different statistical approach and should not be confused with each other. While this has been noted before, the literature on this topic is generally not very accessible. This paper tries to give an understandable overview of the statistical aspects of these two evaluation tasks. We also pose that because of the often limited availability of data, and the difficulty of selecting an appropriate statistical test, it is in some cases perhaps better to abstain from statistical testing, and instead focus on an interpretation of the immediate results.},
}
% Paper in PMLR volume 47, pp. 21--27.
% The day of the month is stored in the separate day field so that month can be the plain macro.
@InProceedings{pmlr-v47-lemmerich14a,
  title     = {A Critical View on Automatic Significance-Filtering in Pattern Mining},
  author    = {Lemmerich, Florian and Puppe, Frank},
  booktitle = {Proceedings of the Workshop on Statistically Sound Data Mining at ECML/PKDD},
  pages     = {21--27},
  year      = {2015},
  editor    = {Hämäläinen, Wilhelmiina and Petitjean, François and Webb, Geoffrey I.},
  volume    = {47},
  series    = {Proceedings of Machine Learning Research},
  address   = {Nancy, France},
  month     = sep,
  day       = {15},
  publisher = {PMLR},
  pdf       = {https://proceedings.mlr.press/v47/lemmerich14a.pdf},
  url       = {https://proceedings.mlr.press/v47/lemmerich14a.html},
  abstract  = {Statistically sound validation of results plays an important role in modern data mining. In this context, it has been advocated to disregard patterns that cannot be automatically confirmed as statistically valid by the available data. In this short position paper, we argue against a mandatory automatic significance filtering of results.},
}
% Paper in PMLR volume 47, pp. 29--36.
% The day of the month is stored in the separate day field so that month can be the plain macro.
@InProceedings{pmlr-v47-sese14a,
  title     = {Statistically significant subgraphs for genome-wide association study},
  author    = {Sese, Jun and Terada, Aika and Saito, Yuki and Tsuda, Koji},
  booktitle = {Proceedings of the Workshop on Statistically Sound Data Mining at ECML/PKDD},
  pages     = {29--36},
  year      = {2015},
  editor    = {Hämäläinen, Wilhelmiina and Petitjean, François and Webb, Geoffrey I.},
  volume    = {47},
  series    = {Proceedings of Machine Learning Research},
  address   = {Nancy, France},
  month     = sep,
  day       = {15},
  publisher = {PMLR},
  pdf       = {https://proceedings.mlr.press/v47/sese14a.pdf},
  url       = {https://proceedings.mlr.press/v47/sese14a.html},
  abstract  = {Genome-wide association studies (GWAS) have been widely used for understanding the associations of single-nucleotide polymorphisms (SNPs) with a disease. GWAS data are often combined with known biological networks, and they have been analyzed using graph-mining techniques toward a systems understanding of the biological changes caused by the SNPs. To determine which subgraphs are associated with the disease, a statistical test on each subgraph needs to be conducted. However, no statistically significant results were found because multiple testing correction causes an extremely small corrected significance level. We introduce a method called gLAMP to enumerate subgraphs having statistically significant associations with a diagnosis. gLAMP integrates the Limitless Arity Multiple-testing Procedure (LAMP) with a graph-mining algorithm called COmmon Itemset Network mining (COIN). LAMP gives us the smallest possible Bonferroni factor, and COIN provides us with efficient enumeration of testable subgraphs. Theoretical results of their combination show the potential to enumerate subgraphs statistically significantly associated with a disease.},
}
% Paper in PMLR volume 47, pp. 37--48.
% The day of the month is stored in the separate day field so that month can be the plain macro.
@InProceedings{pmlr-v47-wang14a,
  title     = {U-statistics on network-structured data with kernels of degree larger than one},
  author    = {Wang, Yuyi and Pelekis, Christos and Ramon, Jan},
  booktitle = {Proceedings of the Workshop on Statistically Sound Data Mining at ECML/PKDD},
  pages     = {37--48},
  year      = {2015},
  editor    = {Hämäläinen, Wilhelmiina and Petitjean, François and Webb, Geoffrey I.},
  volume    = {47},
  series    = {Proceedings of Machine Learning Research},
  address   = {Nancy, France},
  month     = sep,
  day       = {15},
  publisher = {PMLR},
  pdf       = {https://proceedings.mlr.press/v47/wang14a.pdf},
  url       = {https://proceedings.mlr.press/v47/wang14a.html},
  abstract  = {Most analysis of U-statistics assumes that data points are independent or stationary. However, when we analyze network data, these two assumptions do not hold any more. We first define the problem of weighted U-statistics on networked data by extending previous work. We analyze their variance using Hoeffding’s decomposition and also give exponential concentration inequalities. Two efficiently solvable linear programs are proposed to find estimators with minimum worst-case variance or with tighter concentration inequalities.},
}