@inproceedings{iglesias-flores-etal-2021-topgunn,
title = "{T}op{G}u{NN}: Fast {NLP} Training Data Augmentation using Large Corpora",
author = "Iglesias-Flores, Rebecca and
Mishra, Megha and
Patel, Ajay and
Malhotra, Akanksha and
Kriz, Reno and
Palmer, Martha and
Callison-Burch, Chris",
editor = "Dragut, Eduard and
Li, Yunyao and
Popa, Lucian and
Vucetic, Slobodan",
booktitle = "Proceedings of the Second Workshop on Data Science with Human in the Loop: Language Advances",
month = jun,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.dash-1.14",
doi = "10.18653/v1/2021.dash-1.14",
pages = "86--101",
abstract = "Acquiring training data for natural language processing systems can be expensive and time-consuming. Given a few training examples crafted by experts, large corpora can be mined for thousands of semantically similar examples that provide useful variability to improve model generalization. We present TopGuNN, a fast contextualized k-NN retrieval system that can efficiently index and search over contextual embeddings generated from large corpora. TopGuNN is demonstrated for a training data augmentation use case over the Gigaword corpus. Using approximate k-NN and an efficient architecture, TopGuNN performs queries over an embedding space of 4.63TB (approximately 1.5B embeddings) in less than a day.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="iglesias-flores-etal-2021-topgunn">
<titleInfo>
<title>TopGuNN: Fast NLP Training Data Augmentation using Large Corpora</title>
</titleInfo>
<name type="personal">
<namePart type="given">Rebecca</namePart>
<namePart type="family">Iglesias-Flores</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Megha</namePart>
<namePart type="family">Mishra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ajay</namePart>
<namePart type="family">Patel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Akanksha</namePart>
<namePart type="family">Malhotra</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Reno</namePart>
<namePart type="family">Kriz</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Martha</namePart>
<namePart type="family">Palmer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chris</namePart>
<namePart type="family">Callison-Burch</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-06</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Second Workshop on Data Science with Human in the Loop: Language Advances</title>
</titleInfo>
<name type="personal">
<namePart type="given">Eduard</namePart>
<namePart type="family">Dragut</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yunyao</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucian</namePart>
<namePart type="family">Popa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Slobodan</namePart>
<namePart type="family">Vucetic</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Acquiring training data for natural language processing systems can be expensive and time-consuming. Given a few training examples crafted by experts, large corpora can be mined for thousands of semantically similar examples that provide useful variability to improve model generalization. We present TopGuNN, a fast contextualized k-NN retrieval system that can efficiently index and search over contextual embeddings generated from large corpora. TopGuNN is demonstrated for a training data augmentation use case over the Gigaword corpus. Using approximate k-NN and an efficient architecture, TopGuNN performs queries over an embedding space of 4.63TB (approximately 1.5B embeddings) in less than a day.</abstract>
<identifier type="citekey">iglesias-flores-etal-2021-topgunn</identifier>
<identifier type="doi">10.18653/v1/2021.dash-1.14</identifier>
<location>
<url>https://aclanthology.org/2021.dash-1.14</url>
</location>
<part>
<date>2021-06</date>
<extent unit="page">
<start>86</start>
<end>101</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T TopGuNN: Fast NLP Training Data Augmentation using Large Corpora
%A Iglesias-Flores, Rebecca
%A Mishra, Megha
%A Patel, Ajay
%A Malhotra, Akanksha
%A Kriz, Reno
%A Palmer, Martha
%A Callison-Burch, Chris
%Y Dragut, Eduard
%Y Li, Yunyao
%Y Popa, Lucian
%Y Vucetic, Slobodan
%S Proceedings of the Second Workshop on Data Science with Human in the Loop: Language Advances
%D 2021
%8 June
%I Association for Computational Linguistics
%C Online
%F iglesias-flores-etal-2021-topgunn
%X Acquiring training data for natural language processing systems can be expensive and time-consuming. Given a few training examples crafted by experts, large corpora can be mined for thousands of semantically similar examples that provide useful variability to improve model generalization. We present TopGuNN, a fast contextualized k-NN retrieval system that can efficiently index and search over contextual embeddings generated from large corpora. TopGuNN is demonstrated for a training data augmentation use case over the Gigaword corpus. Using approximate k-NN and an efficient architecture, TopGuNN performs queries over an embedding space of 4.63TB (approximately 1.5B embeddings) in less than a day.
%R 10.18653/v1/2021.dash-1.14
%U https://aclanthology.org/2021.dash-1.14
%U https://doi.org/10.18653/v1/2021.dash-1.14
%P 86-101
Markdown (Informal)
[TopGuNN: Fast NLP Training Data Augmentation using Large Corpora](https://aclanthology.org/2021.dash-1.14) (Iglesias-Flores et al., DaSH 2021)
ACL
- Rebecca Iglesias-Flores, Megha Mishra, Ajay Patel, Akanksha Malhotra, Reno Kriz, Martha Palmer, and Chris Callison-Burch. 2021. TopGuNN: Fast NLP Training Data Augmentation using Large Corpora. In Proceedings of the Second Workshop on Data Science with Human in the Loop: Language Advances, pages 86–101, Online. Association for Computational Linguistics.