BibTeX
@article{dror-etal-2017-replicability,
title = "Replicability Analysis for Natural Language Processing: Testing Significance with Multiple Datasets",
author = "Dror, Rotem and
Baumer, Gili and
Bogomolov, Marina and
Reichart, Roi",
editor = "Lee, Lillian and
Johnson, Mark and
Toutanova, Kristina",
journal = "Transactions of the Association for Computational Linguistics",
volume = "5",
year = "2017",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/Q17-1033",
doi = "10.1162/tacl_a_00074",
pages = "471--486",
abstract = "With the ever growing amount of textual data from a large variety of languages, domains, and genres, it has become standard to evaluate NLP algorithms on multiple datasets in order to ensure a consistent performance across heterogeneous setups. However, such multiple comparisons pose significant challenges to traditional statistical analysis methods in NLP and can lead to erroneous conclusions. In this paper we propose a Replicability Analysis framework for a statistically sound analysis of multiple comparisons between algorithms for NLP tasks. We discuss the theoretical advantages of this framework over the current, statistically unjustified, practice in the NLP literature, and demonstrate its empirical value across four applications: multi-domain dependency parsing, multilingual POS tagging, cross-domain sentiment classification and word similarity prediction.",
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="dror-etal-2017-replicability">
<titleInfo>
<title>Replicability Analysis for Natural Language Processing: Testing Significance with Multiple Datasets</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rotem</namePart>
<namePart type="family">Dror</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gili</namePart>
<namePart type="family">Baumer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marina</namePart>
<namePart type="family">Bogomolov</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roi</namePart>
<namePart type="family">Reichart</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2017</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre authority="bibutilsgt">journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre authority="bibutilsgt">academic journal</genre>
</relatedItem>
<abstract>With the ever growing amount of textual data from a large variety of languages, domains, and genres, it has become standard to evaluate NLP algorithms on multiple datasets in order to ensure a consistent performance across heterogeneous setups. However, such multiple comparisons pose significant challenges to traditional statistical analysis methods in NLP and can lead to erroneous conclusions. In this paper we propose a Replicability Analysis framework for a statistically sound analysis of multiple comparisons between algorithms for NLP tasks. We discuss the theoretical advantages of this framework over the current, statistically unjustified, practice in the NLP literature, and demonstrate its empirical value across four applications: multi-domain dependency parsing, multilingual POS tagging, cross-domain sentiment classification and word similarity prediction.</abstract>
<identifier type="citekey">dror-etal-2017-replicability</identifier>
<identifier type="doi">10.1162/tacl_a_00074</identifier>
<location>
<url>https://aclanthology.org/Q17-1033</url>
</location>
<part>
<date>2017</date>
<detail type="volume"><number>5</number></detail>
<extent unit="page">
<start>471</start>
<end>486</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Journal Article
%T Replicability Analysis for Natural Language Processing: Testing Significance with Multiple Datasets
%A Dror, Rotem
%A Baumer, Gili
%A Bogomolov, Marina
%A Reichart, Roi
%J Transactions of the Association for Computational Linguistics
%D 2017
%V 5
%I MIT Press
%C Cambridge, MA
%F dror-etal-2017-replicability
%X With the ever growing amount of textual data from a large variety of languages, domains, and genres, it has become standard to evaluate NLP algorithms on multiple datasets in order to ensure a consistent performance across heterogeneous setups. However, such multiple comparisons pose significant challenges to traditional statistical analysis methods in NLP and can lead to erroneous conclusions. In this paper we propose a Replicability Analysis framework for a statistically sound analysis of multiple comparisons between algorithms for NLP tasks. We discuss the theoretical advantages of this framework over the current, statistically unjustified, practice in the NLP literature, and demonstrate its empirical value across four applications: multi-domain dependency parsing, multilingual POS tagging, cross-domain sentiment classification and word similarity prediction.
%R 10.1162/tacl_a_00074
%U https://aclanthology.org/Q17-1033
%U https://doi.org/10.1162/tacl_a_00074
%P 471-486
Markdown (Informal)
[Replicability Analysis for Natural Language Processing: Testing Significance with Multiple Datasets](https://aclanthology.org/Q17-1033) (Dror et al., TACL 2017)
ACL
Rotem Dror, Gili Baumer, Marina Bogomolov, and Roi Reichart. 2017. Replicability Analysis for Natural Language Processing: Testing Significance with Multiple Datasets. Transactions of the Association for Computational Linguistics, 5:471–486.
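The cited paper's central concern is comparing NLP algorithms across many datasets without inflating the chance of spurious findings. As a point of reference only, and not the paper's own partial-conjunction-based framework, the sketch below applies a standard Holm–Bonferroni correction to hypothetical per-dataset p-values to illustrate the multiple-comparisons issue the abstract describes.

```python
# Illustrative sketch only: a standard Holm-Bonferroni correction applied to
# per-dataset p-values. This is NOT the paper's replicability-analysis
# procedure; it merely shows how naive per-dataset significance claims
# must be adjusted when many datasets are tested at once.

def holm_bonferroni(p_values, alpha=0.05):
    """Return a list of booleans: True where the null is rejected at level alpha."""
    m = len(p_values)
    # Sort p-values while remembering their original dataset indices.
    order = sorted(range(m), key=lambda i: p_values[i])
    reject = [False] * m
    for rank, idx in enumerate(order):
        # Holm's step-down threshold for the (rank+1)-th smallest p-value.
        if p_values[idx] <= alpha / (m - rank):
            reject[idx] = True
        else:
            break  # once one test fails, all larger p-values are retained
    return reject

# Hypothetical p-values from comparing two systems on five datasets.
p = [0.002, 0.049, 0.011, 0.20, 0.031]
print(holm_bonferroni(p))  # [True, False, True, False, False]
```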