@inproceedings{zhang-etal-2021-bstc,
title = "{BSTC}: A Large-Scale {C}hinese-{E}nglish Speech Translation Dataset",
author = "Zhang, Ruiqing and
Wang, Xiyang and
Zhang, Chuanqiang and
He, Zhongjun and
Wu, Hua and
Li, Zhi and
Wang, Haifeng and
Chen, Ying and
Li, Qinfei",
editor = "Wu, Hua and
Cherry, Colin and
Huang, Liang and
He, Zhongjun and
Liu, Qun and
Elbayad, Maha and
Liberman, Mark and
Wang, Haifeng and
Ma, Mingbo and
Zhang, Ruiqing",
booktitle = "Proceedings of the Second Workshop on Automatic Simultaneous Translation",
month = jun,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.autosimtrans-1.5",
doi = "10.18653/v1/2021.autosimtrans-1.5",
pages = "28--35",
abstract = "This paper presents BSTC (Baidu Speech Translation Corpus), a large-scale Chinese-English speech translation dataset. This dataset is constructed based on a collection of licensed videos of talks or lectures, including about 68 hours of Mandarin data, their manual transcripts and translations into English, as well as automated transcripts by an automatic speech recognition (ASR) model. We have further asked three experienced interpreters to simultaneously interpret the testing talks in a mock conference setting. This corpus is expected to promote the research of automatic simultaneous translation as well as the development of practical systems. We have organized simultaneous translation tasks and used this corpus to evaluate automatic simultaneous translation systems.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="zhang-etal-2021-bstc">
<titleInfo>
<title>BSTC: A Large-Scale Chinese-English Speech Translation Dataset</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ruiqing</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Xiyang</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chuanqiang</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhongjun</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hua</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhi</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haifeng</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ying</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qinfei</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop on Automatic Simultaneous Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Hua</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Colin</namePart>
<namePart type="family">Cherry</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Liang</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhongjun</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Qun</namePart>
<namePart type="family">Liu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Maha</namePart>
<namePart type="family">Elbayad</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mark</namePart>
<namePart type="family">Liberman</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Haifeng</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mingbo</namePart>
<namePart type="family">Ma</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ruiqing</namePart>
<namePart type="family">Zhang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>This paper presents BSTC (Baidu Speech Translation Corpus), a large-scale Chinese-English speech translation dataset. This dataset is constructed based on a collection of licensed videos of talks or lectures, including about 68 hours of Mandarin data, their manual transcripts and translations into English, as well as automated transcripts by an automatic speech recognition (ASR) model. We have further asked three experienced interpreters to simultaneously interpret the testing talks in a mock conference setting. This corpus is expected to promote the research of automatic simultaneous translation as well as the development of practical systems. We have organized simultaneous translation tasks and used this corpus to evaluate automatic simultaneous translation systems.</abstract>
<identifier type="citekey">zhang-etal-2021-bstc</identifier>
<identifier type="doi">10.18653/v1/2021.autosimtrans-1.5</identifier>
<location>
<url>https://aclanthology.org/2021.autosimtrans-1.5</url>
</location>
<part>
<date>2021-06</date>
<extent unit="page">
<start>28</start>
<end>35</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T BSTC: A Large-Scale Chinese-English Speech Translation Dataset
%A Zhang, Ruiqing
%A Wang, Xiyang
%A Zhang, Chuanqiang
%A He, Zhongjun
%A Wu, Hua
%A Li, Zhi
%A Wang, Haifeng
%A Chen, Ying
%A Li, Qinfei
%Y Wu, Hua
%Y Cherry, Colin
%Y Huang, Liang
%Y He, Zhongjun
%Y Liu, Qun
%Y Elbayad, Maha
%Y Liberman, Mark
%Y Wang, Haifeng
%Y Ma, Mingbo
%Y Zhang, Ruiqing
%S Proceedings of the Second Workshop on Automatic Simultaneous Translation
%D 2021
%8 June
%I Association for Computational Linguistics
%C Online
%F zhang-etal-2021-bstc
%X This paper presents BSTC (Baidu Speech Translation Corpus), a large-scale Chinese-English speech translation dataset. This dataset is constructed based on a collection of licensed videos of talks or lectures, including about 68 hours of Mandarin data, their manual transcripts and translations into English, as well as automated transcripts by an automatic speech recognition (ASR) model. We have further asked three experienced interpreters to simultaneously interpret the testing talks in a mock conference setting. This corpus is expected to promote the research of automatic simultaneous translation as well as the development of practical systems. We have organized simultaneous translation tasks and used this corpus to evaluate automatic simultaneous translation systems.
%R 10.18653/v1/2021.autosimtrans-1.5
%U https://aclanthology.org/2021.autosimtrans-1.5
%U https://doi.org/10.18653/v1/2021.autosimtrans-1.5
%P 28-35
Markdown (Informal)
[BSTC: A Large-Scale Chinese-English Speech Translation Dataset](https://aclanthology.org/2021.autosimtrans-1.5) (Zhang et al., AutoSimTrans 2021)
ACL
- Ruiqing Zhang, Xiyang Wang, Chuanqiang Zhang, Zhongjun He, Hua Wu, Zhi Li, Haifeng Wang, Ying Chen, and Qinfei Li. 2021. BSTC: A Large-Scale Chinese-English Speech Translation Dataset. In Proceedings of the Second Workshop on Automatic Simultaneous Translation, pages 28–35, Online. Association for Computational Linguistics.