@inproceedings{samdarshi-etal-2024-connecting,
title = "Connecting the Dots: Evaluating Abstract Reasoning Capabilities of {LLM}s Using the {N}ew {Y}ork {T}imes Connections Word Game",
author = "Samdarshi, Prisha and
Mustafa, Mariam and
Kulkarni, Anushka and
Rothkopf, Raven and
Chakrabarty, Tuhin and
Muresan, Smaranda",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2024.emnlp-main.1182/",
doi = "10.18653/v1/2024.emnlp-main.1182",
pages = "21219--21236",
abstract = "The New York Times Connections game has emerged as a popular and challenging pursuit for word puzzle enthusiasts. We collect438 Connections games to evaluate the performance of state-of-the-art large language models (LLMs) against expert and novice humanplayers. Our results show that even the best-performing LLM, Claude 3.5 Sonnet, which has otherwise shown impressive reasoning abilities on a wide variety of benchmarks, can only fully solve 18{\%} of the games. Novice and expert players perform better than Claude 3.5 Sonnet, with expert human players significantly outperforming it. We create a taxonomy of the knowledge types required to successfully cluster and categorize words in the Connections game. We find that while LLMs are decent at categorizing words based on semantic relations they struggle with other types of knowledge such as Encyclopedic Knowledge, Multiword Expressions or knowledge that combines both Word Form and Meaning. Our results establish the New York Times Connections game as a challenging benchmark for evaluating abstract reasoning capabilities in humans and AI systems."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="samdarshi-etal-2024-connecting">
<titleInfo>
<title>Connecting the Dots: Evaluating Abstract Reasoning Capabilities of LLMs Using the New York Times Connections Word Game</title>
</titleInfo>
<name type="personal">
<namePart type="given">Prisha</namePart>
<namePart type="family">Samdarshi</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mariam</namePart>
<namePart type="family">Mustafa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anushka</namePart>
<namePart type="family">Kulkarni</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Raven</namePart>
<namePart type="family">Rothkopf</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tuhin</namePart>
<namePart type="family">Chakrabarty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Smaranda</namePart>
<namePart type="family">Muresan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2024-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Yaser</namePart>
<namePart type="family">Al-Onaizan</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Mohit</namePart>
<namePart type="family">Bansal</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yun-Nung</namePart>
<namePart type="family">Chen</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Miami, Florida, USA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>The New York Times Connections game has emerged as a popular and challenging pursuit for word puzzle enthusiasts. We collect438 Connections games to evaluate the performance of state-of-the-art large language models (LLMs) against expert and novice humanplayers. Our results show that even the best-performing LLM, Claude 3.5 Sonnet, which has otherwise shown impressive reasoning abilities on a wide variety of benchmarks, can only fully solve 18% of the games. Novice and expert players perform better than Claude 3.5 Sonnet, with expert human players significantly outperforming it. We create a taxonomy of the knowledge types required to successfully cluster and categorize words in the Connections game. We find that while LLMs are decent at categorizing words based on semantic relations they struggle with other types of knowledge such as Encyclopedic Knowledge, Multiword Expressions or knowledge that combines both Word Form and Meaning. Our results establish the New York Times Connections game as a challenging benchmark for evaluating abstract reasoning capabilities in humans and AI systems.</abstract>
<identifier type="citekey">samdarshi-etal-2024-connecting</identifier>
<identifier type="doi">10.18653/v1/2024.emnlp-main.1182</identifier>
<location>
<url>https://aclanthology.org/2024.emnlp-main.1182/</url>
</location>
<part>
<date>2024-11</date>
<extent unit="page">
<start>21219</start>
<end>21236</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Connecting the Dots: Evaluating Abstract Reasoning Capabilities of LLMs Using the New York Times Connections Word Game
%A Samdarshi, Prisha
%A Mustafa, Mariam
%A Kulkarni, Anushka
%A Rothkopf, Raven
%A Chakrabarty, Tuhin
%A Muresan, Smaranda
%Y Al-Onaizan, Yaser
%Y Bansal, Mohit
%Y Chen, Yun-Nung
%S Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing
%D 2024
%8 November
%I Association for Computational Linguistics
%C Miami, Florida, USA
%F samdarshi-etal-2024-connecting
%X The New York Times Connections game has emerged as a popular and challenging pursuit for word puzzle enthusiasts. We collect438 Connections games to evaluate the performance of state-of-the-art large language models (LLMs) against expert and novice humanplayers. Our results show that even the best-performing LLM, Claude 3.5 Sonnet, which has otherwise shown impressive reasoning abilities on a wide variety of benchmarks, can only fully solve 18% of the games. Novice and expert players perform better than Claude 3.5 Sonnet, with expert human players significantly outperforming it. We create a taxonomy of the knowledge types required to successfully cluster and categorize words in the Connections game. We find that while LLMs are decent at categorizing words based on semantic relations they struggle with other types of knowledge such as Encyclopedic Knowledge, Multiword Expressions or knowledge that combines both Word Form and Meaning. Our results establish the New York Times Connections game as a challenging benchmark for evaluating abstract reasoning capabilities in humans and AI systems.
%R 10.18653/v1/2024.emnlp-main.1182
%U https://aclanthology.org/2024.emnlp-main.1182/
%U https://doi.org/10.18653/v1/2024.emnlp-main.1182
%P 21219-21236
Markdown (Informal)
[Connecting the Dots: Evaluating Abstract Reasoning Capabilities of LLMs Using the New York Times Connections Word Game](https://aclanthology.org/2024.emnlp-main.1182/) (Samdarshi et al., EMNLP 2024)
- Connecting the Dots: Evaluating Abstract Reasoning Capabilities of LLMs Using the New York Times Connections Word Game (Samdarshi et al., EMNLP 2024)
ACL
- Prisha Samdarshi, Mariam Mustafa, Anushka Kulkarni, Raven Rothkopf, Tuhin Chakrabarty, and Smaranda Muresan. 2024. Connecting the Dots: Evaluating Abstract Reasoning Capabilities of LLMs Using the New York Times Connections Word Game. In Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 21219–21236, Miami, Florida, USA. Association for Computational Linguistics.