@inproceedings{hoory-etal-2021-learning-evaluating,
title = "Learning and Evaluating a Differentially Private Pre-trained Language Model",
author = "Hoory, Shlomo and
Feder, Amir and
Tendler, Avichai and
Erell, Sofia and
Peled-Cohen, Alon and
Laish, Itay and
Nakhost, Hootan and
Stemmer, Uri and
Benjamini, Ayelet and
Hassidim, Avinatan and
Matias, Yossi",
editor = "Moens, Marie-Francine and
Huang, Xuanjing and
Specia, Lucia and
Yih, Scott Wen-tau",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2021",
month = nov,
year = "2021",
address = "Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.findings-emnlp.102",
doi = "10.18653/v1/2021.findings-emnlp.102",
pages = "1178--1189",
abstract = "Contextual language models have led to significantly better results, especially when pre-trained on the same data as the downstream task. While this additional pre-training usually improves performance, it can lead to information leakage and therefore risks the privacy of individuals mentioned in the training data. One method to guarantee the privacy of such individuals is to train a differentially-private language model, but this usually comes at the expense of model performance. Also, in the absence of a differentially private vocabulary training, it is not possible to modify the vocabulary to fit the new data, which might further degrade results. In this work we bridge these gaps, and provide guidance to future researchers and practitioners on how to improve privacy while maintaining good model performance. We introduce a novel differentially private word-piece algorithm, which allows training a tailored domain-specific vocabulary while maintaining privacy. We then experiment with entity extraction tasks from clinical notes, and demonstrate how to train a differentially private pre-trained language model (i.e., BERT) with a privacy guarantee of $\epsilon=1.1$ and with only a small degradation in performance. Finally, as it is hard to tell given a privacy parameter $\epsilon$ what was the effect on the trained representation, we present experiments showing that the trained model does not memorize private information.",
}
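The abstract above introduces a differentially private word-piece algorithm for building a domain-specific vocabulary. The paper's actual algorithm is not reproduced in this record, so the following is only a minimal sketch of the underlying mechanism it builds on, a noise-perturbed unigram histogram with a selection threshold; the function name, the parameters, and the one-occurrence-per-document cap are all assumptions made for the example.

```python
# Hypothetical sketch of differentially private vocabulary selection via a
# noisy unigram histogram. This is NOT the paper's word-piece algorithm; it
# only illustrates the core DP mechanism: cap each document's contribution,
# add calibrated Laplace noise to the counts, then keep tokens whose noisy
# counts clear a threshold.
from collections import Counter

import numpy as np


def dp_vocabulary(corpus, epsilon=1.1, threshold=50.0, seed=0):
    """Return tokens whose noise-perturbed counts exceed `threshold`.

    Each document contributes at most one occurrence per token, so adding
    or removing one document changes any single bin by at most 1.
    """
    counts = Counter()
    for doc in corpus:
        counts.update(set(doc.lower().split()))  # dedupe within a document
    rng = np.random.default_rng(seed)
    scale = 1.0 / epsilon  # Laplace scale for a sensitivity-1 count
    return sorted(
        token
        for token, count in counts.items()
        if count + rng.laplace(0.0, scale) >= threshold
    )


docs = ["Patient reports mild headache", "Patient denies fever"] * 100
print(dp_vocabulary(docs, epsilon=1.1, threshold=50.0))
```

Note that one document can still touch many histogram bins at once, so a rigorous end-to-end privacy guarantee would require accounting across bins (or a cap on distinct words per document); the sketch deliberately omits that bookkeeping.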
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="hoory-etal-2021-learning-evaluating">
    <titleInfo>
      <title>Learning and Evaluating a Differentially Private Pre-trained Language Model</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Shlomo</namePart>
      <namePart type="family">Hoory</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Amir</namePart>
      <namePart type="family">Feder</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Avichai</namePart>
      <namePart type="family">Tendler</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Sofia</namePart>
      <namePart type="family">Erell</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Alon</namePart>
      <namePart type="family">Peled-Cohen</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Itay</namePart>
      <namePart type="family">Laish</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hootan</namePart>
      <namePart type="family">Nakhost</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Uri</namePart>
      <namePart type="family">Stemmer</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ayelet</namePart>
      <namePart type="family">Benjamini</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Avinatan</namePart>
      <namePart type="family">Hassidim</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yossi</namePart>
      <namePart type="family">Matias</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2021-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Findings of the Association for Computational Linguistics: EMNLP 2021</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Marie-Francine</namePart>
        <namePart type="family">Moens</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Xuanjing</namePart>
        <namePart type="family">Huang</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Lucia</namePart>
        <namePart type="family">Specia</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Scott</namePart>
        <namePart type="given">Wen-tau</namePart>
        <namePart type="family">Yih</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Punta Cana, Dominican Republic</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Contextual language models have led to significantly better results, especially when pre-trained on the same data as the downstream task. While this additional pre-training usually improves performance, it can lead to information leakage and therefore risks the privacy of individuals mentioned in the training data. One method to guarantee the privacy of such individuals is to train a differentially-private language model, but this usually comes at the expense of model performance. Also, in the absence of a differentially private vocabulary training, it is not possible to modify the vocabulary to fit the new data, which might further degrade results. In this work we bridge these gaps, and provide guidance to future researchers and practitioners on how to improve privacy while maintaining good model performance. We introduce a novel differentially private word-piece algorithm, which allows training a tailored domain-specific vocabulary while maintaining privacy. We then experiment with entity extraction tasks from clinical notes, and demonstrate how to train a differentially private pre-trained language model (i.e., BERT) with a privacy guarantee of ε=1.1 and with only a small degradation in performance. Finally, as it is hard to tell given a privacy parameter ε what was the effect on the trained representation, we present experiments showing that the trained model does not memorize private information.</abstract>
    <identifier type="citekey">hoory-etal-2021-learning-evaluating</identifier>
    <identifier type="doi">10.18653/v1/2021.findings-emnlp.102</identifier>
    <location>
      <url>https://aclanthology.org/2021.findings-emnlp.102</url>
    </location>
    <part>
      <date>2021-11</date>
      <extent unit="page">
        <start>1178</start>
        <end>1189</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Learning and Evaluating a Differentially Private Pre-trained Language Model
%A Hoory, Shlomo
%A Feder, Amir
%A Tendler, Avichai
%A Erell, Sofia
%A Peled-Cohen, Alon
%A Laish, Itay
%A Nakhost, Hootan
%A Stemmer, Uri
%A Benjamini, Ayelet
%A Hassidim, Avinatan
%A Matias, Yossi
%Y Moens, Marie-Francine
%Y Huang, Xuanjing
%Y Specia, Lucia
%Y Yih, Scott Wen-tau
%S Findings of the Association for Computational Linguistics: EMNLP 2021
%D 2021
%8 November
%I Association for Computational Linguistics
%C Punta Cana, Dominican Republic
%F hoory-etal-2021-learning-evaluating
%X Contextual language models have led to significantly better results, especially when pre-trained on the same data as the downstream task. While this additional pre-training usually improves performance, it can lead to information leakage and therefore risks the privacy of individuals mentioned in the training data. One method to guarantee the privacy of such individuals is to train a differentially-private language model, but this usually comes at the expense of model performance. Also, in the absence of a differentially private vocabulary training, it is not possible to modify the vocabulary to fit the new data, which might further degrade results. In this work we bridge these gaps, and provide guidance to future researchers and practitioners on how to improve privacy while maintaining good model performance. We introduce a novel differentially private word-piece algorithm, which allows training a tailored domain-specific vocabulary while maintaining privacy. We then experiment with entity extraction tasks from clinical notes, and demonstrate how to train a differentially private pre-trained language model (i.e., BERT) with a privacy guarantee of ε=1.1 and with only a small degradation in performance. Finally, as it is hard to tell given a privacy parameter ε what was the effect on the trained representation, we present experiments showing that the trained model does not memorize private information.
%R 10.18653/v1/2021.findings-emnlp.102
%U https://aclanthology.org/2021.findings-emnlp.102
%U https://doi.org/10.18653/v1/2021.findings-emnlp.102
%P 1178-1189
Markdown (Informal)
[Learning and Evaluating a Differentially Private Pre-trained Language Model](https://aclanthology.org/2021.findings-emnlp.102) (Hoory et al., Findings 2021)
ACL
Shlomo Hoory, Amir Feder, Avichai Tendler, Sofia Erell, Alon Peled-Cohen, Itay Laish, Hootan Nakhost, Uri Stemmer, Ayelet Benjamini, Avinatan Hassidim, and Yossi Matias. 2021. Learning and Evaluating a Differentially Private Pre-trained Language Model. In Findings of the Association for Computational Linguistics: EMNLP 2021, pages 1178–1189, Punta Cana, Dominican Republic. Association for Computational Linguistics.
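For a concrete picture of the ε = 1.1 pre-training result the abstract reports, here is a minimal DP-SGD training loop. The toolchain (PyTorch with Opacus) and the toy linear model are assumptions for illustration; the record does not specify the authors' implementation, which pre-trains BERT, nor their δ. The sketch only shows the shape of such a setup: per-sample gradient clipping plus Gaussian noise, with the privacy accountant calibrating the noise to a target ε.

```python
# Hedged sketch of DP-SGD training with Opacus (an assumed toolchain, not
# the authors' stack). Opacus clips each per-sample gradient, adds Gaussian
# noise, and its accountant can calibrate the noise multiplier to hit a
# target epsilon such as the paper's 1.1.
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

from opacus import PrivacyEngine

model = nn.Linear(16, 2)  # stand-in for a real language model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
dataset = TensorDataset(torch.randn(256, 16), torch.randint(0, 2, (256,)))
loader = DataLoader(dataset, batch_size=32)

engine = PrivacyEngine()
model, optimizer, loader = engine.make_private_with_epsilon(
    module=model,
    optimizer=optimizer,
    data_loader=loader,
    epochs=3,
    target_epsilon=1.1,  # the paper's reported privacy budget
    target_delta=1e-5,   # assumed delta; the record does not state one
    max_grad_norm=1.0,   # per-sample gradient clipping bound
)

loss_fn = nn.CrossEntropyLoss()
for _ in range(3):
    for x, y in loader:
        optimizer.zero_grad()
        loss_fn(model(x), y).backward()
        optimizer.step()

print(f"epsilon spent: {engine.get_epsilon(delta=1e-5):.2f}")
```

Swapping the linear model for a masked-language-modeling objective over the DP-selected vocabulary would mirror the pipeline the abstract describes, at much greater compute cost.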