% Conference paper record (LREC 2006). Whole words are braced in the title so
% that styles applying sentence casing keep "PoS" and "Dutch" capitalised.
@inproceedings{van-den-bosch-etal-2006-transferring,
    title     = {Transferring {PoS}-tagging and lemmatization tools from spoken to written {Dutch} corpus development},
    author    = {van den Bosch, Antal and
                 Schuurman, Ineke and
                 Vandeghinste, Vincent},
    editor    = {Calzolari, Nicoletta and
                 Choukri, Khalid and
                 Gangemi, Aldo and
                 Maegaard, Bente and
                 Mariani, Joseph and
                 Odijk, Jan and
                 Tapias, Daniel},
    booktitle = {Proceedings of the Fifth International Conference on Language Resources and Evaluation ({LREC}{'}06)},
    month     = may,
    year      = {2006},
    address   = {Genoa, Italy},
    publisher = {European Language Resources Association (ELRA)},
    url       = {http://www.lrec-conf.org/proceedings/lrec2006/pdf/167_pdf.pdf},
    abstract  = {We describe a case study in the reuse and transfer of tools in language resource development, from a corpus of spoken Dutch to a corpus of written Dutch. Once tools for a particular language have been developed, it is logical, but not trivial to reuse them for other types or registers of the language than the tools were originally designed for. This paper reviews the decisions and adaptations necessary to make this particular transfer from spoken to written language, focusing on a part-of-speech tagger and a lemmatizer. While the lemmatizer can be transferred fairly straightforwardly, the tagger needs to be adapted considerably. We show how it can be adapted without starting from scratch. We describe how the part-of-speech tagset was adapted and how the tagger was retrained to deal with written-text phenomena it had not been trained on earlier.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS export of a single conference paper: the paper's own metadata sits
     directly under <mods>; the proceedings volume it appeared in is described
     by the <relatedItem type="host"> element. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="van-den-bosch-etal-2006-transferring">
    <titleInfo>
      <title>Transferring PoS-tagging and lemmatization tools from spoken to written Dutch corpus development</title>
    </titleInfo>
    <!-- Paper authors, in the order listed -->
    <name type="personal">
      <namePart type="given">Antal</namePart>
      <namePart type="family">van den Bosch</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ineke</namePart>
      <namePart type="family">Schuurman</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Vincent</namePart>
      <namePart type="family">Vandeghinste</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2006-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume, its editors, publisher and place -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Fifth International Conference on Language Resources and Evaluation (LREC’06)</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Nicoletta</namePart>
        <namePart type="family">Calzolari</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Khalid</namePart>
        <namePart type="family">Choukri</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Aldo</namePart>
        <namePart type="family">Gangemi</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Bente</namePart>
        <namePart type="family">Maegaard</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Joseph</namePart>
        <namePart type="family">Mariani</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Jan</namePart>
        <namePart type="family">Odijk</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Daniel</namePart>
        <namePart type="family">Tapias</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>European Language Resources Association (ELRA)</publisher>
        <place>
          <placeTerm type="text">Genoa, Italy</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>We describe a case study in the reuse and transfer of tools in language resource development, from a corpus of spoken Dutch to a corpus of written Dutch. Once tools for a particular language have been developed, it is logical, but not trivial to reuse them for other types or registers of the language than the tools were originally designed for. This paper reviews the decisions and adaptations necessary to make this particular transfer from spoken to written language, focusing on a part-of-speech tagger and a lemmatizer. While the lemmatizer can be transferred fairly straightforwardly, the tagger needs to be adapted considerably. We show how it can be adapted without starting from scratch. We describe how the part-of-speech tagset was adapted and how the tagger was retrained to deal with written-text phenomena it had not been trained on earlier.</abstract>
    <!-- Same value as the mods ID attribute and the citation key used in the other export formats -->
    <identifier type="citekey">van-den-bosch-etal-2006-transferring</identifier>
    <location>
      <url>http://www.lrec-conf.org/proceedings/lrec2006/pdf/167_pdf.pdf</url>
    </location>
    <part>
      <date>2006-05</date>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Transferring PoS-tagging and lemmatization tools from spoken to written Dutch corpus development
%A van den Bosch, Antal
%A Schuurman, Ineke
%A Vandeghinste, Vincent
%Y Calzolari, Nicoletta
%Y Choukri, Khalid
%Y Gangemi, Aldo
%Y Maegaard, Bente
%Y Mariani, Joseph
%Y Odijk, Jan
%Y Tapias, Daniel
%S Proceedings of the Fifth International Conference on Language Resources and Evaluation (LREC’06)
%D 2006
%8 May
%I European Language Resources Association (ELRA)
%C Genoa, Italy
%F van-den-bosch-etal-2006-transferring
%X We describe a case study in the reuse and transfer of tools in language resource development, from a corpus of spoken Dutch to a corpus of written Dutch. Once tools for a particular language have been developed, it is logical, but not trivial to reuse them for other types or registers of the language than the tools were originally designed for. This paper reviews the decisions and adaptations necessary to make this particular transfer from spoken to written language, focusing on a part-of-speech tagger and a lemmatizer. While the lemmatizer can be transferred fairly straightforwardly, the tagger needs to be adapted considerably. We show how it can be adapted without starting from scratch. We describe how the part-of-speech tagset was adapted and how the tagger was retrained to deal with written-text phenomena it had not been trained on earlier.
%U http://www.lrec-conf.org/proceedings/lrec2006/pdf/167_pdf.pdf
Markdown (Informal)
[Transferring PoS-tagging and lemmatization tools from spoken to written Dutch corpus development](http://www.lrec-conf.org/proceedings/lrec2006/pdf/167_pdf.pdf) (van den Bosch et al., LREC 2006)
ACL