@inproceedings{funayama-etal-2020-preventing,
title = "Preventing Critical Scoring Errors in Short Answer Scoring with Confidence Estimation",
author = "Funayama, Hiroaki and
Sasaki, Shota and
Matsubayashi, Yuichiroh and
Mizumoto, Tomoya and
Suzuki, Jun and
Mita, Masato and
Inui, Kentaro",
editor = "Rijhwani, Shruti and
Liu, Jiangming and
Wang, Yizhong and
Dror, Rotem",
booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics: Student Research Workshop",
month = jul,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.acl-srw.32",
doi = "10.18653/v1/2020.acl-srw.32",
pages = "237--243",
abstract = "Many recent Short Answer Scoring (SAS) systems have employed Quadratic Weighted Kappa (QWK) as the evaluation measure of their systems. However, we hypothesize that QWK is unsatisfactory for the evaluation of the SAS systems when we consider measuring their effectiveness in actual usage. We introduce a new task formulation of SAS that matches the actual usage. In our formulation, the SAS systems should extract as many scoring predictions that are not critical scoring errors (CSEs). We conduct the experiments in our new task formulation and demonstrate that a typical SAS system can predict scores with zero CSE for approximately 50{\%} of test data at maximum by filtering out low-reliability predictions on the basis of a certain confidence estimation. This result directly indicates the possibility of reducing half the scoring cost of human raters, which is more preferable for the evaluation of SAS systems.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="funayama-etal-2020-preventing">
<titleInfo>
<title>Preventing Critical Scoring Errors in Short Answer Scoring with Confidence Estimation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hiroaki</namePart>
<namePart type="family">Funayama</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Shota</namePart>
<namePart type="family">Sasaki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yuichiroh</namePart>
<namePart type="family">Matsubayashi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tomoya</namePart>
<namePart type="family">Mizumoto</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jun</namePart>
<namePart type="family">Suzuki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Masato</namePart>
<namePart type="family">Mita</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kentaro</namePart>
<namePart type="family">Inui</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-07</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics: Student Research Workshop</title>
</titleInfo>
<name type="personal">
<namePart type="given">Shruti</namePart>
<namePart type="family">Rijhwani</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiangming</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yizhong</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rotem</namePart>
<namePart type="family">Dror</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Many recent Short Answer Scoring (SAS) systems have employed Quadratic Weighted Kappa (QWK) as the evaluation measure of their systems. However, we hypothesize that QWK is unsatisfactory for the evaluation of the SAS systems when we consider measuring their effectiveness in actual usage. We introduce a new task formulation of SAS that matches the actual usage. In our formulation, the SAS systems should extract as many scoring predictions that are not critical scoring errors (CSEs). We conduct the experiments in our new task formulation and demonstrate that a typical SAS system can predict scores with zero CSE for approximately 50% of test data at maximum by filtering out low-reliability predictions on the basis of a certain confidence estimation. This result directly indicates the possibility of reducing half the scoring cost of human raters, which is more preferable for the evaluation of SAS systems.</abstract>
<identifier type="citekey">funayama-etal-2020-preventing</identifier>
<identifier type="doi">10.18653/v1/2020.acl-srw.32</identifier>
<location>
<url>https://aclanthology.org/2020.acl-srw.32</url>
</location>
<part>
<date>2020-07</date>
<extent unit="page">
<start>237</start>
<end>243</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Preventing Critical Scoring Errors in Short Answer Scoring with Confidence Estimation
%A Funayama, Hiroaki
%A Sasaki, Shota
%A Matsubayashi, Yuichiroh
%A Mizumoto, Tomoya
%A Suzuki, Jun
%A Mita, Masato
%A Inui, Kentaro
%Y Rijhwani, Shruti
%Y Liu, Jiangming
%Y Wang, Yizhong
%Y Dror, Rotem
%S Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics: Student Research Workshop
%D 2020
%8 July
%I Association for Computational Linguistics
%C Online
%F funayama-etal-2020-preventing
%X Many recent Short Answer Scoring (SAS) systems have employed Quadratic Weighted Kappa (QWK) as the evaluation measure of their systems. However, we hypothesize that QWK is unsatisfactory for the evaluation of the SAS systems when we consider measuring their effectiveness in actual usage. We introduce a new task formulation of SAS that matches the actual usage. In our formulation, the SAS systems should extract as many scoring predictions that are not critical scoring errors (CSEs). We conduct the experiments in our new task formulation and demonstrate that a typical SAS system can predict scores with zero CSE for approximately 50% of test data at maximum by filtering out low-reliability predictions on the basis of a certain confidence estimation. This result directly indicates the possibility of reducing half the scoring cost of human raters, which is more preferable for the evaluation of SAS systems.
%R 10.18653/v1/2020.acl-srw.32
%U https://aclanthology.org/2020.acl-srw.32
%U https://doi.org/10.18653/v1/2020.acl-srw.32
%P 237-243
Markdown (Informal)
[Preventing Critical Scoring Errors in Short Answer Scoring with Confidence Estimation](https://aclanthology.org/2020.acl-srw.32) (Funayama et al., ACL 2020)
ACL