@inproceedings{gella-etal-2017-image,
title = "Image Pivoting for Learning Multilingual Multimodal Representations",
author = "Gella, Spandana and
Sennrich, Rico and
Keller, Frank and
Lapata, Mirella",
editor = "Palmer, Martha and
Hwa, Rebecca and
Riedel, Sebastian",
booktitle = "Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing",
month = sep,
year = "2017",
address = "Copenhagen, Denmark",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/D17-1303/",
doi = "10.18653/v1/D17-1303",
pages = "2839--2845",
abstract = "In this paper we propose a model to learn multimodal multilingual representations for matching images and sentences in different languages, with the aim of advancing multilingual versions of image search and image understanding. Our model learns a common representation for images and their descriptions in two different languages (which need not be parallel) by considering the image as a pivot between two languages. We introduce a new pairwise ranking loss function which can handle both symmetric and asymmetric similarity between the two modalities. We evaluate our models on image-description ranking for German and English, and on semantic textual similarity of image descriptions in English. In both cases we achieve state-of-the-art performance."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="gella-etal-2017-image">
<titleInfo>
<title>Image Pivoting for Learning Multilingual Multimodal Representations</title>
</titleInfo>
<name type="personal">
<namePart type="given">Spandana</namePart>
<namePart type="family">Gella</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rico</namePart>
<namePart type="family">Sennrich</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Frank</namePart>
<namePart type="family">Keller</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mirella</namePart>
<namePart type="family">Lapata</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2017-09</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Martha</namePart>
<namePart type="family">Palmer</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Rebecca</namePart>
<namePart type="family">Hwa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sebastian</namePart>
<namePart type="family">Riedel</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Copenhagen, Denmark</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>In this paper we propose a model to learn multimodal multilingual representations for matching images and sentences in different languages, with the aim of advancing multilingual versions of image search and image understanding. Our model learns a common representation for images and their descriptions in two different languages (which need not be parallel) by considering the image as a pivot between two languages. We introduce a new pairwise ranking loss function which can handle both symmetric and asymmetric similarity between the two modalities. We evaluate our models on image-description ranking for German and English, and on semantic textual similarity of image descriptions in English. In both cases we achieve state-of-the-art performance.</abstract>
<identifier type="citekey">gella-etal-2017-image</identifier>
<identifier type="doi">10.18653/v1/D17-1303</identifier>
<location>
<url>https://aclanthology.org/D17-1303/</url>
</location>
<part>
<date>2017-09</date>
<extent unit="page">
<start>2839</start>
<end>2845</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Image Pivoting for Learning Multilingual Multimodal Representations
%A Gella, Spandana
%A Sennrich, Rico
%A Keller, Frank
%A Lapata, Mirella
%Y Palmer, Martha
%Y Hwa, Rebecca
%Y Riedel, Sebastian
%S Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing
%D 2017
%8 September
%I Association for Computational Linguistics
%C Copenhagen, Denmark
%F gella-etal-2017-image
%X In this paper we propose a model to learn multimodal multilingual representations for matching images and sentences in different languages, with the aim of advancing multilingual versions of image search and image understanding. Our model learns a common representation for images and their descriptions in two different languages (which need not be parallel) by considering the image as a pivot between two languages. We introduce a new pairwise ranking loss function which can handle both symmetric and asymmetric similarity between the two modalities. We evaluate our models on image-description ranking for German and English, and on semantic textual similarity of image descriptions in English. In both cases we achieve state-of-the-art performance.
%R 10.18653/v1/D17-1303
%U https://aclanthology.org/D17-1303/
%U https://doi.org/10.18653/v1/D17-1303
%P 2839-2845
Markdown (Informal)
[Image Pivoting for Learning Multilingual Multimodal Representations](https://aclanthology.org/D17-1303/) (Gella et al., EMNLP 2017)
- Image Pivoting for Learning Multilingual Multimodal Representations (Gella et al., EMNLP 2017)
ACL
- Spandana Gella, Rico Sennrich, Frank Keller, and Mirella Lapata. 2017. Image Pivoting for Learning Multilingual Multimodal Representations. In Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing, pages 2839–2845, Copenhagen, Denmark. Association for Computational Linguistics.