@inproceedings{varga-etal-2012-unsupervised,
title = "Unsupervised document zone identification using probabilistic graphical models",
author = "Varga, Andrea and
Preo{\c{t}}iuc-Pietro, Daniel and
Ciravegna, Fabio",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Declerck, Thierry and
Do{\u{g}}an, Mehmet U{\u{g}}ur and
Maegaard, Bente and
Mariani, Joseph and
Moreno, Asuncion and
Odijk, Jan and
Piperidis, Stelios",
booktitle = "Proceedings of the Eighth International Conference on Language Resources and Evaluation ({LREC}'12)",
month = may,
year = "2012",
address = "Istanbul, Turkey",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2012/pdf/881_Paper.pdf",
pages = "1610--1617",
abstract = "Document zone identification aims to automatically classify sequences of text-spans (e.g. sentences) within a document into predefined zone categories. Current approaches to document zone identification mostly rely on supervised machine learning methods, which require a large amount of annotated data, which is often difficult and expensive to obtain. In order to overcome this bottleneck, we propose graphical models based on the popular Latent Dirichlet Allocation (LDA) model. The first model, which we call zoneLDA aims to cluster the sentences into zone classes using only unlabelled data. We also study an extension of zoneLDA called zoneLDAb, which makes distinction between common words and non-common words within the different zone types. We present results on two different domains: the scientific domain and the technical domain. For the latter one we propose a new document zone classification schema, which has been annotated over a collection of 689 documents, achieving a Kappa score of 85{\%}. Overall our experiments show promising results for both of the domains, outperforming the baseline model. Furthermore, on the technical domain the performance of the models are comparable to the supervised approach using the same feature sets. We thus believe that graphical models are a promising avenue of research for automatic document zoning.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="varga-etal-2012-unsupervised">
<titleInfo>
<title>Unsupervised document zone identification using probabilistic graphical models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Andrea</namePart>
<namePart type="family">Varga</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Daniel</namePart>
<namePart type="family">Preoţiuc-Pietro</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fabio</namePart>
<namePart type="family">Ciravegna</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2012-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC’12)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Khalid</namePart>
<namePart type="family">Choukri</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Thierry</namePart>
<namePart type="family">Declerck</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mehmet</namePart>
<namePart type="given">Uğur</namePart>
<namePart type="family">Doğan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bente</namePart>
<namePart type="family">Maegaard</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Joseph</namePart>
<namePart type="family">Mariani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Asuncion</namePart>
<namePart type="family">Moreno</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jan</namePart>
<namePart type="family">Odijk</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Stelios</namePart>
<namePart type="family">Piperidis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Istanbul, Turkey</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Document zone identification aims to automatically classify sequences of text-spans (e.g. sentences) within a document into predefined zone categories. Current approaches to document zone identification mostly rely on supervised machine learning methods, which require a large amount of annotated data, which is often difficult and expensive to obtain. In order to overcome this bottleneck, we propose graphical models based on the popular Latent Dirichlet Allocation (LDA) model. The first model, which we call zoneLDA aims to cluster the sentences into zone classes using only unlabelled data. We also study an extension of zoneLDA called zoneLDAb, which makes distinction between common words and non-common words within the different zone types. We present results on two different domains: the scientific domain and the technical domain. For the latter one we propose a new document zone classification schema, which has been annotated over a collection of 689 documents, achieving a Kappa score of 85%. Overall our experiments show promising results for both of the domains, outperforming the baseline model. Furthermore, on the technical domain the performance of the models are comparable to the supervised approach using the same feature sets. We thus believe that graphical models are a promising avenue of research for automatic document zoning.</abstract>
<identifier type="citekey">varga-etal-2012-unsupervised</identifier>
<location>
<url>http://www.lrec-conf.org/proceedings/lrec2012/pdf/881_Paper.pdf</url>
</location>
<part>
<date>2012-05</date>
<extent unit="page">
<start>1610</start>
<end>1617</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Unsupervised document zone identification using probabilistic graphical models
%A Varga, Andrea
%A Preoţiuc-Pietro, Daniel
%A Ciravegna, Fabio
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Declerck, Thierry
%Y Doğan, Mehmet Uğur
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Moreno, Asuncion
%Y Odijk, Jan
%Y Piperidis, Stelios
%S Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC’12)
%D 2012
%8 May
%I European Language Resources Association (ELRA)
%C Istanbul, Turkey
%F varga-etal-2012-unsupervised
%X Document zone identification aims to automatically classify sequences of text-spans (e.g. sentences) within a document into predefined zone categories. Current approaches to document zone identification mostly rely on supervised machine learning methods, which require a large amount of annotated data, which is often difficult and expensive to obtain. In order to overcome this bottleneck, we propose graphical models based on the popular Latent Dirichlet Allocation (LDA) model. The first model, which we call zoneLDA aims to cluster the sentences into zone classes using only unlabelled data. We also study an extension of zoneLDA called zoneLDAb, which makes distinction between common words and non-common words within the different zone types. We present results on two different domains: the scientific domain and the technical domain. For the latter one we propose a new document zone classification schema, which has been annotated over a collection of 689 documents, achieving a Kappa score of 85%. Overall our experiments show promising results for both of the domains, outperforming the baseline model. Furthermore, on the technical domain the performance of the models are comparable to the supervised approach using the same feature sets. We thus believe that graphical models are a promising avenue of research for automatic document zoning.
%U http://www.lrec-conf.org/proceedings/lrec2012/pdf/881_Paper.pdf
%P 1610-1617
Markdown (Informal)
[Unsupervised document zone identification using probabilistic graphical models](http://www.lrec-conf.org/proceedings/lrec2012/pdf/881_Paper.pdf) (Varga et al., LREC 2012)
ACL