@inproceedings{luo-etal-2022-doc,
title = "Doc-{GCN}: Heterogeneous Graph Convolutional Networks for Document Layout Analysis",
author = "Luo, Siwen and
Ding, Yihao and
Long, Siqu and
Poon, Josiah and
Han, Soyeon Caren",
editor = "Calzolari, Nicoletta and
Huang, Chu-Ren and
Kim, Hansaem and
Pustejovsky, James and
Wanner, Leo and
Choi, Key-Sun and
Ryu, Pum-Mo and
Chen, Hsin-Hsi and
Donatelli, Lucia and
Ji, Heng and
Kurohashi, Sadao and
Paggio, Patrizia and
Xue, Nianwen and
Kim, Seokhwan and
Hahm, Younggyun and
He, Zhong and
Lee, Tony Kyungil and
Santus, Enrico and
Bond, Francis and
Na, Seung-Hoon",
booktitle = "Proceedings of the 29th International Conference on Computational Linguistics",
month = oct,
year = "2022",
address = "Gyeongju, Republic of Korea",
publisher = "International Committee on Computational Linguistics",
url = "https://aclanthology.org/2022.coling-1.256",
pages = "2906--2916",
abstract = "Recognizing the layout of unstructured digital documents is crucial when parsing the documents into the structured, machine-readable format for downstream applications. Recent studies in Document Layout Analysis usually rely on visual cues to understand documents while ignoring other information, such as contextual information or the relationships between document layout components, which are vital to boost better layout analysis performance. Our Doc-GCN presents an effective way to harmonize and integrate heterogeneous aspects for Document Layout Analysis. We construct different graphs to capture the four main features aspects of document layout components, including syntactic, semantic, density, and appearance features. Then, we apply graph convolutional networks to enhance each aspect of features and apply the node-level pooling for integration. Finally, we concatenate features of all aspects and feed them into the 2-layer MLPs for document layout component classification. Our Doc-GCN achieves state-of-the-art results on three widely used DLA datasets: PubLayNet, FUNSD, and DocBank. The code will be released at \url{https://github.com/adlnlp/doc_gcn}",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="luo-etal-2022-doc">
<titleInfo>
<title>Doc-GCN: Heterogeneous Graph Convolutional Networks for Document Layout Analysis</title>
</titleInfo>
<name type="personal">
<namePart type="given">Siwen</namePart>
<namePart type="family">Luo</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yihao</namePart>
<namePart type="family">Ding</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Siqu</namePart>
<namePart type="family">Long</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Josiah</namePart>
<namePart type="family">Poon</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Soyeon</namePart>
<namePart type="given">Caren</namePart>
<namePart type="family">Han</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2022-10</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 29th International Conference on Computational Linguistics</title>
</titleInfo>
<name type="personal">
<namePart type="given">Nicoletta</namePart>
<namePart type="family">Calzolari</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chu-Ren</namePart>
<namePart type="family">Huang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hansaem</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">James</namePart>
<namePart type="family">Pustejovsky</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Leo</namePart>
<namePart type="family">Wanner</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Key-Sun</namePart>
<namePart type="family">Choi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Pum-Mo</namePart>
<namePart type="family">Ryu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hsin-Hsi</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Lucia</namePart>
<namePart type="family">Donatelli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Heng</namePart>
<namePart type="family">Ji</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Sadao</namePart>
<namePart type="family">Kurohashi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Patrizia</namePart>
<namePart type="family">Paggio</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Nianwen</namePart>
<namePart type="family">Xue</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seokhwan</namePart>
<namePart type="family">Kim</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Younggyun</namePart>
<namePart type="family">Hahm</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Zhong</namePart>
<namePart type="family">He</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tony</namePart>
<namePart type="given">Kyungil</namePart>
<namePart type="family">Lee</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Enrico</namePart>
<namePart type="family">Santus</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Francis</namePart>
<namePart type="family">Bond</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Seung-Hoon</namePart>
<namePart type="family">Na</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>International Committee on Computational Linguistics</publisher>
<place>
<placeTerm type="text">Gyeongju, Republic of Korea</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Recognizing the layout of unstructured digital documents is crucial when parsing the documents into the structured, machine-readable format for downstream applications. Recent studies in Document Layout Analysis usually rely on visual cues to understand documents while ignoring other information, such as contextual information or the relationships between document layout components, which are vital to boost better layout analysis performance. Our Doc-GCN presents an effective way to harmonize and integrate heterogeneous aspects for Document Layout Analysis. We construct different graphs to capture the four main features aspects of document layout components, including syntactic, semantic, density, and appearance features. Then, we apply graph convolutional networks to enhance each aspect of features and apply the node-level pooling for integration. Finally, we concatenate features of all aspects and feed them into the 2-layer MLPs for document layout component classification. Our Doc-GCN achieves state-of-the-art results on three widely used DLA datasets: PubLayNet, FUNSD, and DocBank. The code will be released at https://github.com/adlnlp/doc_gcn</abstract>
<identifier type="citekey">luo-etal-2022-doc</identifier>
<location>
<url>https://aclanthology.org/2022.coling-1.256</url>
</location>
<part>
<date>2022-10</date>
<extent unit="page">
<start>2906</start>
<end>2916</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Doc-GCN: Heterogeneous Graph Convolutional Networks for Document Layout Analysis
%A Luo, Siwen
%A Ding, Yihao
%A Long, Siqu
%A Poon, Josiah
%A Han, Soyeon Caren
%Y Calzolari, Nicoletta
%Y Huang, Chu-Ren
%Y Kim, Hansaem
%Y Pustejovsky, James
%Y Wanner, Leo
%Y Choi, Key-Sun
%Y Ryu, Pum-Mo
%Y Chen, Hsin-Hsi
%Y Donatelli, Lucia
%Y Ji, Heng
%Y Kurohashi, Sadao
%Y Paggio, Patrizia
%Y Xue, Nianwen
%Y Kim, Seokhwan
%Y Hahm, Younggyun
%Y He, Zhong
%Y Lee, Tony Kyungil
%Y Santus, Enrico
%Y Bond, Francis
%Y Na, Seung-Hoon
%S Proceedings of the 29th International Conference on Computational Linguistics
%D 2022
%8 October
%I International Committee on Computational Linguistics
%C Gyeongju, Republic of Korea
%F luo-etal-2022-doc
%X Recognizing the layout of unstructured digital documents is crucial when parsing the documents into the structured, machine-readable format for downstream applications. Recent studies in Document Layout Analysis usually rely on visual cues to understand documents while ignoring other information, such as contextual information or the relationships between document layout components, which are vital to boost better layout analysis performance. Our Doc-GCN presents an effective way to harmonize and integrate heterogeneous aspects for Document Layout Analysis. We construct different graphs to capture the four main features aspects of document layout components, including syntactic, semantic, density, and appearance features. Then, we apply graph convolutional networks to enhance each aspect of features and apply the node-level pooling for integration. Finally, we concatenate features of all aspects and feed them into the 2-layer MLPs for document layout component classification. Our Doc-GCN achieves state-of-the-art results on three widely used DLA datasets: PubLayNet, FUNSD, and DocBank. The code will be released at https://github.com/adlnlp/doc_gcn
%U https://aclanthology.org/2022.coling-1.256
%P 2906-2916
Markdown (Informal)
[Doc-GCN: Heterogeneous Graph Convolutional Networks for Document Layout Analysis](https://aclanthology.org/2022.coling-1.256) (Luo et al., COLING 2022)
ACL