@inproceedings{samy-etal-2020-legal,
title = "Legal-{ES}: A Set of Large Scale Resources for {S}panish Legal Text Processing",
author = "Samy, Doaa and
Arenas-Garc{\'\i}a, Jer{\'o}nimo and
P{\'e}rez-Fern{\'a}ndez, David",
editor = "Samy, Doaa and
P{\'e}rez-Fern{\'a}ndez, David and
Arenas-Garc{\'\i}a, Jer{\'o}nimo",
booktitle = "Proceedings of the 1st Workshop on Language Technologies for Government and Public Administration (LT4Gov)",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2020.lt4gov-1.6",
pages = "32--36",
abstract = "Legal-ES is an open source resource kit for legal Spanish. It consists of a large scale Spanish corpus of open legal texts and different kinds of language models including word embeddings and topic models. The corpus includes over 1000 million words covering a collection of legislative and administrative open access documents in Spanish from different sources representing international, national and regional entities. The corpus is pre-processed and tokenized using Spacy. For the word embeddings, gensim was used on the collection of tokens, producing a representation space that is especially suited to reflect the inherent characteristics of the legal domain. We calculate also topic models to obtain a convenient tool to understand the main topics in the corpus and to navigate through the documents exploiting the semantic similarity among documents. We will analyse the time structure of a dynamic topic model to infer changes in the legal production of Spanish jurisdiction that have occurred over the analysed time framework.",
language = "English",
ISBN = "979-10-95546-62-7",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="samy-etal-2020-legal">
<titleInfo>
<title>Legal-ES: A Set of Large Scale Resources for Spanish Legal Text Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Doaa</namePart>
<namePart type="family">Samy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jerónimo</namePart>
<namePart type="family">Arenas-García</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Pérez-Fernández</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-05</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">English</languageTerm>
<languageTerm type="code" authority="iso639-2b">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 1st Workshop on Language Technologies for Government and Public Administration (LT4Gov)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Doaa</namePart>
<namePart type="family">Samy</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Pérez-Fernández</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jerónimo</namePart>
<namePart type="family">Arenas-García</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-62-7</identifier>
</relatedItem>
<abstract>Legal-ES is an open source resource kit for legal Spanish. It consists of a large scale Spanish corpus of open legal texts and different kinds of language models including word embeddings and topic models. The corpus includes over 1000 million words covering a collection of legislative and administrative open access documents in Spanish from different sources representing international, national and regional entities. The corpus is pre-processed and tokenized using Spacy. For the word embeddings, gensim was used on the collection of tokens, producing a representation space that is especially suited to reflect the inherent characteristics of the legal domain. We calculate also topic models to obtain a convenient tool to understand the main topics in the corpus and to navigate through the documents exploiting the semantic similarity among documents. We will analyse the time structure of a dynamic topic model to infer changes in the legal production of Spanish jurisdiction that have occurred over the analysed time framework.</abstract>
<identifier type="citekey">samy-etal-2020-legal</identifier>
<location>
<url>https://aclanthology.org/2020.lt4gov-1.6</url>
</location>
<part>
<date>2020-05</date>
<extent unit="page">
<start>32</start>
<end>36</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Legal-ES: A Set of Large Scale Resources for Spanish Legal Text Processing
%A Samy, Doaa
%A Arenas-García, Jerónimo
%A Pérez-Fernández, David
%Y Samy, Doaa
%Y Pérez-Fernández, David
%Y Arenas-García, Jerónimo
%S Proceedings of the 1st Workshop on Language Technologies for Government and Public Administration (LT4Gov)
%D 2020
%8 May
%I European Language Resources Association
%C Marseille, France
%@ 979-10-95546-62-7
%G English
%F samy-etal-2020-legal
%X Legal-ES is an open source resource kit for legal Spanish. It consists of a large scale Spanish corpus of open legal texts and different kinds of language models including word embeddings and topic models. The corpus includes over 1000 million words covering a collection of legislative and administrative open access documents in Spanish from different sources representing international, national and regional entities. The corpus is pre-processed and tokenized using Spacy. For the word embeddings, gensim was used on the collection of tokens, producing a representation space that is especially suited to reflect the inherent characteristics of the legal domain. We calculate also topic models to obtain a convenient tool to understand the main topics in the corpus and to navigate through the documents exploiting the semantic similarity among documents. We will analyse the time structure of a dynamic topic model to infer changes in the legal production of Spanish jurisdiction that have occurred over the analysed time framework.
%U https://aclanthology.org/2020.lt4gov-1.6
%P 32-36
Markdown (Informal)
[Legal-ES: A Set of Large Scale Resources for Spanish Legal Text Processing](https://aclanthology.org/2020.lt4gov-1.6) (Samy et al., LT4Gov 2020)
ACL