@inproceedings{ionescu-popescu-2017-string,
title = "Can string kernels pass the test of time in Native Language Identification?",
author = "Ionescu, Radu Tudor and
Popescu, Marius",
editor = "Tetreault, Joel and
Burstein, Jill and
Leacock, Claudia and
Yannakoudakis, Helen",
booktitle = "Proceedings of the 12th Workshop on Innovative Use of {NLP} for Building Educational Applications",
month = sep,
year = "2017",
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/W17-5024",
doi = "10.18653/v1/W17-5024",
pages = "224--234",
abstract = "We describe a machine learning approach for the 2017 shared task on Native Language Identification (NLI). The proposed approach combines several kernels using multiple kernel learning. While most of our kernels are based on character p-grams (also known as n-grams) extracted from essays or speech transcripts, we also use a kernel based on i-vectors, a low-dimensional representation of audio recordings, provided by the shared task organizers. For the learning stage, we choose Kernel Discriminant Analysis (KDA) over Kernel Ridge Regression (KRR), because the former classifier obtains better results than the latter one on the development set. In our previous work, we have used a similar machine learning approach to achieve state-of-the-art NLI results. The goal of this paper is to demonstrate that our shallow and simple approach based on string kernels (with minor improvements) can pass the test of time and reach state-of-the-art performance in the 2017 NLI shared task, despite the recent advances in natural language processing. We participated in all three tracks, in which the competitors were allowed to use only the essays (essay track), only the speech transcripts (speech track), or both (fusion track). Using only the data provided by the organizers for training our models, we have reached a macro F1 score of 86.95{\%} in the closed essay track, a macro F1 score of 87.55{\%} in the closed speech track, and a macro F1 score of 93.19{\%} in the closed fusion track. With these scores, our team (UnibucKernel) ranked in the first group of teams in all three tracks, while attaining the best scores in the speech and the fusion tracks.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ionescu-popescu-2017-string">
<titleInfo>
<title>Can string kernels pass the test of time in Native Language Identification?</title>
</titleInfo>
<name type="personal">
<namePart type="given">Radu</namePart>
<namePart type="given">Tudor</namePart>
<namePart type="family">Ionescu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Marius</namePart>
<namePart type="family">Popescu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2017-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications</title>
</titleInfo>
<name type="personal">
<namePart type="given">Joel</namePart>
<namePart type="family">Tetreault</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jill</namePart>
<namePart type="family">Burstein</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Claudia</namePart>
<namePart type="family">Leacock</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Helen</namePart>
<namePart type="family">Yannakoudakis</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Copenhagen, Denmark</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>We describe a machine learning approach for the 2017 shared task on Native Language Identification (NLI). The proposed approach combines several kernels using multiple kernel learning. While most of our kernels are based on character p-grams (also known as n-grams) extracted from essays or speech transcripts, we also use a kernel based on i-vectors, a low-dimensional representation of audio recordings, provided by the shared task organizers. For the learning stage, we choose Kernel Discriminant Analysis (KDA) over Kernel Ridge Regression (KRR), because the former classifier obtains better results than the latter one on the development set. In our previous work, we have used a similar machine learning approach to achieve state-of-the-art NLI results. The goal of this paper is to demonstrate that our shallow and simple approach based on string kernels (with minor improvements) can pass the test of time and reach state-of-the-art performance in the 2017 NLI shared task, despite the recent advances in natural language processing. We participated in all three tracks, in which the competitors were allowed to use only the essays (essay track), only the speech transcripts (speech track), or both (fusion track). Using only the data provided by the organizers for training our models, we have reached a macro F1 score of 86.95% in the closed essay track, a macro F1 score of 87.55% in the closed speech track, and a macro F1 score of 93.19% in the closed fusion track. With these scores, our team (UnibucKernel) ranked in the first group of teams in all three tracks, while attaining the best scores in the speech and the fusion tracks.</abstract>
<identifier type="citekey">ionescu-popescu-2017-string</identifier>
<identifier type="doi">10.18653/v1/W17-5024</identifier>
<location>
<url>https://aclanthology.org/W17-5024</url>
</location>
<part>
<date>2017-09</date>
<extent unit="page">
<start>224</start>
<end>234</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Can string kernels pass the test of time in Native Language Identification?
%A Ionescu, Radu Tudor
%A Popescu, Marius
%Y Tetreault, Joel
%Y Burstein, Jill
%Y Leacock, Claudia
%Y Yannakoudakis, Helen
%S Proceedings of the 12th Workshop on Innovative Use of NLP for Building Educational Applications
%D 2017
%8 September
%I Association for Computational Linguistics
%C Copenhagen, Denmark
%F ionescu-popescu-2017-string
%X We describe a machine learning approach for the 2017 shared task on Native Language Identification (NLI). The proposed approach combines several kernels using multiple kernel learning. While most of our kernels are based on character p-grams (also known as n-grams) extracted from essays or speech transcripts, we also use a kernel based on i-vectors, a low-dimensional representation of audio recordings, provided by the shared task organizers. For the learning stage, we choose Kernel Discriminant Analysis (KDA) over Kernel Ridge Regression (KRR), because the former classifier obtains better results than the latter one on the development set. In our previous work, we have used a similar machine learning approach to achieve state-of-the-art NLI results. The goal of this paper is to demonstrate that our shallow and simple approach based on string kernels (with minor improvements) can pass the test of time and reach state-of-the-art performance in the 2017 NLI shared task, despite the recent advances in natural language processing. We participated in all three tracks, in which the competitors were allowed to use only the essays (essay track), only the speech transcripts (speech track), or both (fusion track). Using only the data provided by the organizers for training our models, we have reached a macro F1 score of 86.95% in the closed essay track, a macro F1 score of 87.55% in the closed speech track, and a macro F1 score of 93.19% in the closed fusion track. With these scores, our team (UnibucKernel) ranked in the first group of teams in all three tracks, while attaining the best scores in the speech and the fusion tracks.
%R 10.18653/v1/W17-5024
%U https://aclanthology.org/W17-5024
%U https://doi.org/10.18653/v1/W17-5024
%P 224-234
Markdown (Informal)
[Can string kernels pass the test of time in Native Language Identification?](https://aclanthology.org/W17-5024) (Ionescu & Popescu, BEA 2017)
ACL