@inproceedings{bevendorff-etal-2020-crawling,
title = "Crawling and Preprocessing Mailing Lists At Scale for Dialog Analysis",
author = "Bevendorff, Janek and
Al Khatib, Khalid and
Potthast, Martin and
Stein, Benno",
editor = "Jurafsky, Dan and
Chai, Joyce and
Schluter, Natalie and
Tetreault, Joel",
booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
month = jul,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.acl-main.108/",
doi = "10.18653/v1/2020.acl-main.108",
pages = "1151--1158",
abstract = "This paper introduces the Webis Gmane Email Corpus 2019, the largest publicly available and fully preprocessed email corpus to date. We crawled more than 153 million emails from 14,699 mailing lists and segmented them into semantically consistent components using a new neural segmentation model. With 96{\%} accuracy on 15 classes of email segments, our model achieves state-of-the-art performance while being more efficient to train than previous ones. All data, code, and trained models are made freely available alongside the paper."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="bevendorff-etal-2020-crawling">
<titleInfo>
<title>Crawling and Preprocessing Mailing Lists At Scale for Dialog Analysis</title>
</titleInfo>
<name type="personal">
<namePart type="given">Janek</namePart>
<namePart type="family">Bevendorff</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Al Khatib</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Martin</namePart>
<namePart type="family">Potthast</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Benno</namePart>
<namePart type="family">Stein</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Dan</namePart>
<namePart type="family">Jurafsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joyce</namePart>
<namePart type="family">Chai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Natalie</namePart>
<namePart type="family">Schluter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joel</namePart>
<namePart type="family">Tetreault</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper introduces the Webis Gmane Email Corpus 2019, the largest publicly available and fully preprocessed email corpus to date. We crawled more than 153 million emails from 14,699 mailing lists and segmented them into semantically consistent components using a new neural segmentation model. With 96% accuracy on 15 classes of email segments, our model achieves state-of-the-art performance while being more efficient to train than previous ones. All data, code, and trained models are made freely available alongside the paper.</abstract>
<identifier type="citekey">bevendorff-etal-2020-crawling</identifier>
<identifier type="doi">10.18653/v1/2020.acl-main.108</identifier>
<location>
<url>https://aclanthology.org/2020.acl-main.108/</url>
</location>
<part>
<date>2020-07</date>
<extent unit="page">
<start>1151</start>
<end>1158</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Crawling and Preprocessing Mailing Lists At Scale for Dialog Analysis
%A Bevendorff, Janek
%A Al Khatib, Khalid
%A Potthast, Martin
%A Stein, Benno
%Y Jurafsky, Dan
%Y Chai, Joyce
%Y Schluter, Natalie
%Y Tetreault, Joel
%S Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics
%D 2020
%8 July
%I Association for Computational Linguistics
%C Online
%F bevendorff-etal-2020-crawling
%X This paper introduces the Webis Gmane Email Corpus 2019, the largest publicly available and fully preprocessed email corpus to date. We crawled more than 153 million emails from 14,699 mailing lists and segmented them into semantically consistent components using a new neural segmentation model. With 96% accuracy on 15 classes of email segments, our model achieves state-of-the-art performance while being more efficient to train than previous ones. All data, code, and trained models are made freely available alongside the paper.
%R 10.18653/v1/2020.acl-main.108
%U https://aclanthology.org/2020.acl-main.108/
%U https://doi.org/10.18653/v1/2020.acl-main.108
%P 1151-1158
Markdown (Informal)
[Crawling and Preprocessing Mailing Lists At Scale for Dialog Analysis](https://aclanthology.org/2020.acl-main.108/) (Bevendorff et al., ACL 2020)
- Crawling and Preprocessing Mailing Lists At Scale for Dialog Analysis (Bevendorff et al., ACL 2020)
ACL
- Janek Bevendorff, Khalid Al Khatib, Martin Potthast, and Benno Stein. 2020. Crawling and Preprocessing Mailing Lists At Scale for Dialog Analysis. In Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pages 1151–1158, Online. Association for Computational Linguistics.