0% found this document useful (0 votes)

24 views

Working With Text Data in R

This document provides examples of functions for working with text data in R, including formatting strings, detecting matches, splitting strings, and extracting matches. Some key functions demonstrated are sprintf() for formatting numbers, str_view_all() to highlight string matches, str_detect() to detect regex patterns, str_extract() to extract matches, and str_split() to split strings. Packages like stringr, snakecase, and glue provide many useful functions for text manipulation and regular expressions.

Uploaded by

Clóvis Nóbrega

Available Formats

Download as PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

24 views

Working With Text Data in R

Uploaded by

Clóvis Nóbrega

Available Formats

Download as PDF, TXT or read online on Scribd

You are on page 1/1

> Formatting strings > Detecting matches

Working with text data in R # Format numbers with sprintf()

sprintf("%.3e", pi) # "3.142e+00"

# Highlight string matches in HTML widget with str_view_all()

str_view_all(suits, "[ae]")

Clubs 
# Substitute value in a string with an expression
Diamonds 
Learn R online at www.DataCamp.com glue('The answer is {ans}', ans = 30 + 10) # The answer is 40

Hearts 
Spades
# Substitute values from data frame columns into a string with glue_data()

cards <- data.frame(value = c("8", "Queen", "Ace"),

# Detect if a regex pattern is present in strings with str_detect()

suit = c("Diamonds", "Hearts", "Spades"))

str_detect(suits, "[ae]") # FALSE TRUE TRUE TRUE

> Packages to install for this cheat sheet cards %>% glue_data("{value} of {suit}") 

# Find the index of strings that match a regex with str_which()

str_which(suits, "[ae]") # 2 3 4

# 8 of Diamonds

# Queen of Hearts

Some functionality from this cheat sheet comes with base-R, but the following packages are also used # Count the number of matches with str_count()

throughout this cheat sheet. # Ace of Spades

str_count(suits, "[ae]") # 0 1 2 2

library(stringr)
# Wrap strings across multiple lines
# Locate the position of matches within strings with str_locate()

library(snakecase)
str_wrap('The answer to the universe is 42', width = 25)
str_locate(suits, "[ae]")

library(glue) # The answer to the

# start end

# universe is 42 # [1,] NA NA

Functions with names starting str_ are from stringr; those with names starting to_ are from snakecase;
those with glue in the name are from glue. # [2,] 3 3

# [3,] 2 2

# [4,] 3 3

> Splitting strings

> Example data > Extracting matches
# Split strings into list of characters with str_split(pattern = "")

Throughout this cheat sheet, we’ll be using this vector containing the following strings. str_split(suits, pattern = "") 

# Extract matches from strings with str_extract()

suits <- c("Clubs", "Diamonds", "Hearts", "Spades")

# "C" "l" "u" "b" "s"
str_extract(suits, ".[ae].") # NA "iam" "Hea" "pad"

# "D" "i" "a" "m" "o" "n" "d" "s"

# "H" "e" "a" "r" "t" "s"

# Extract matches and capture groups with str_match()

str_match(suits, ".([ae])(.)") 

> Get string lengths and substrings

# "S" "p" "a" "d" "e" "s" 

# Split strings by a separator with str_split()

# [,1] [,2] [,3]

str_split(suits, pattern = "a") 

# [1,] NA NA NA

# Get the number of characters with nchar()

# [2,] "iam" "a" "m"

nchar(suits) # Returns 5 8 6 6

# [3,] "Hea" "e" "a"

# "Clubs"

# [4,] "pad" "a" "d"

# Get substrings by position with str_sub()

# "Di" "monds"

stringr::str_sub(suits, 1, 4) # Returns "Club" "Diam" "Hear" "Spad"

# "He" "rts"

# Get subset of strings that match with str_subset()

# "Sp" "des" 

str_subset(suits, "d") # "Diamonds" "Spades"

# Remove whitespace from the start/end with str_trim()

str_trim(" Lost in Whitespace ") # Returns "Lost in Whitespace"

# Split strings into matrix of n pieces with str_split_fixed()

str_split_fixed(suits, pattern = 'a', n = 2) 

> Replacing matches

# Truncate strings to a maximum width with str_trunc()

str_trunc(suits, width = 5) # Returns "Clubs" "Di..." "He..." "Sp..."

# [,1] [,2]

# [1,] "Clubs" ""

# Pad strings to a constant width with str_pad()

# [2,] "Di" "monds"

str_pad(suits, width = 8) # Returns "   Clubs" "Diamonds" "  Hearts" "  Spades"

# Replace a regex match with another string with str_replace()

# [3,] "He" "rts"

str_replace(suits, "a", "4") # "Clubs" "Di4monds" "He4rts" "Sp4des"

# Pad strings on right with str_pad(side="right")

# [4,] "Sp" "des"
str_pad(suits, width = 8, side = "right", pad = "!")
# Remove a match with str_remove()

# Returns "Clubs!!!" "Diamonds" "Hearts!!" "Spades!!"

str_remove(suits, "s") # "Club" "Diamond" "Heart" "Spade"

> Joining or concatenating strings # Replace a substring with `str_sub<-`()

str_sub(suits, start = 1, end = 3) <- c("Bi", "Al", "Yu", "Hi")

suits # Returns "Bibs" "Almonds" "Yurts" "Hides"

> Changing case # Combine two strings with paste0()

paste0(suits, '5') # "Clubs5" "Diamonds5" "Hearts5" "Spades5"

# Convert to lowercase with tolower()

# Combine strings with a separator with paste()

tolower(suits) # Returns "clubs" "diamonds" "hearts" "spades" 

paste(5, suits, sep = " of ") # "5 of Clubs" "5 of Diamonds" "5 of Hearts" "5 of Spades"

# Convert to uppercase with toupper()

toupper(suits) # Returns "CLUBS" "DIAMONDS" "HEARTS" "SPADES" 

Learn R Online at
# Collapse character vector to string with paste() or paste0()

paste(suits, collapse = ", ") # "Clubs, Diamonds, Hearts, Spades"

www.DataCamp.com
# Convert to title case with to_title_case()

to_title_case("hello, world!") # Returns "Hello, World!" 

# Duplicate and concatenate strings with str_dup()

# Convert to sentence case with to_sentence_case()

str_dup(suits, 2) # "ClubsClubs" "DiamondsDiamonds" "HeartsHearts" "SpadesSpades"
to_sentence_case("hello, world!") # Returns "Hello, world!"

Automatic Switch Light Project Report
100% (1)
Automatic Switch Light Project Report
9 pages
Working With Text Data in Python
No ratings yet
Working With Text Data in Python
1 page
Chuletas DataCamp-3
No ratings yet
Chuletas DataCamp-3
1 page
String Manipulation With Stringr::: Cheat Sheet
No ratings yet
String Manipulation With Stringr::: Cheat Sheet
2 pages
Chapter 4 Array And String
No ratings yet
Chapter 4 Array And String
15 pages
String Functions
No ratings yet
String Functions
10 pages
Array in C
No ratings yet
Array in C
6 pages
ATA Tructures In: Pavan Kumar A
No ratings yet
ATA Tructures In: Pavan Kumar A
35 pages
Regex
No ratings yet
Regex
1 page
Regex
No ratings yet
Regex
1 page
InternetProgramming PHPArrays
No ratings yet
InternetProgramming PHPArrays
40 pages
Regular Expressions: Regular Expression Syntax in Python
No ratings yet
Regular Expressions: Regular Expression Syntax in Python
11 pages
Mission 346 Working With Strings in Pandas Takeaways
No ratings yet
Mission 346 Working With Strings in Pandas Takeaways
2 pages
Lecture 6 Re Basics
No ratings yet
Lecture 6 Re Basics
12 pages
Character Description Example: Uncomplicating The Complicated
No ratings yet
Character Description Example: Uncomplicating The Complicated
2 pages
UNIT - 5
No ratings yet
UNIT - 5
22 pages
Reg Ex Cheat Sheet
No ratings yet
Reg Ex Cheat Sheet
1 page
Reg Ex Cheat Sheet
No ratings yet
Reg Ex Cheat Sheet
1 page
Array
No ratings yet
Array
16 pages
13B RegExp
No ratings yet
13B RegExp
38 pages
Week3 2020
No ratings yet
Week3 2020
20 pages
Base-R
No ratings yet
Base-R
9 pages
Unit 3 Strings
No ratings yet
Unit 3 Strings
15 pages
CH 7 Array
No ratings yet
CH 7 Array
26 pages
UNIT III CProgramming
No ratings yet
UNIT III CProgramming
22 pages
Arrays
No ratings yet
Arrays
26 pages
Ch 7 Array (1)
No ratings yet
Ch 7 Array (1)
28 pages
Manipulating Text with Regular Expression in python
No ratings yet
Manipulating Text with Regular Expression in python
4 pages
R Programming
No ratings yet
R Programming
37 pages
What Is An Array
No ratings yet
What Is An Array
11 pages
Day 4
No ratings yet
Day 4
26 pages
pps_Array
No ratings yet
pps_Array
12 pages
Array & String
No ratings yet
Array & String
34 pages
04 CS3001
No ratings yet
04 CS3001
9 pages
Regular Exp
No ratings yet
Regular Exp
6 pages
Data & Variable Transformation: Recode and Transform Variables Summarise Variables and Cases Descriptives and Summaries
No ratings yet
Data & Variable Transformation: Recode and Transform Variables Summarise Variables and Cases Descriptives and Summaries
1 page
Work With Strings With Stringr::: Cheat Sheet
No ratings yet
Work With Strings With Stringr::: Cheat Sheet
2 pages
Unit-III-PIC-QA If /co
No ratings yet
Unit-III-PIC-QA If /co
34 pages
R 1st unit
No ratings yet
R 1st unit
61 pages
C# String Manipulation
No ratings yet
C# String Manipulation
3 pages
String_PPT - Copy
No ratings yet
String_PPT - Copy
36 pages
Unite28093v Character Arrays Strings File
No ratings yet
Unite28093v Character Arrays Strings File
18 pages
Programming in C (Common To Agri, Civil, Mech, Eee and Eie)
No ratings yet
Programming in C (Common To Agri, Civil, Mech, Eee and Eie)
26 pages
Array and String
No ratings yet
Array and String
26 pages
Chapter 2 Data Structures in R
No ratings yet
Chapter 2 Data Structures in R
14 pages
Question Bank UNIT III
No ratings yet
Question Bank UNIT III
7 pages
Unit Iii
No ratings yet
Unit Iii
11 pages
REGEX in Data Analytics
No ratings yet
REGEX in Data Analytics
5 pages
Ralph Bryell Louie A. Olaso Bscpe 3 Homework No. 4
No ratings yet
Ralph Bryell Louie A. Olaso Bscpe 3 Homework No. 4
10 pages
L18 L19 Strings
No ratings yet
L18 L19 Strings
38 pages
Arrays: Q) Write A Short Notes On Arrays in C?
No ratings yet
Arrays: Q) Write A Short Notes On Arrays in C?
7 pages
03a Character and String Processing
No ratings yet
03a Character and String Processing
18 pages
Arrays 1D
No ratings yet
Arrays 1D
7 pages
Unit-3
No ratings yet
Unit-3
12 pages
14 Strings _ R for Data Science
No ratings yet
14 Strings _ R for Data Science
19 pages
CH 3
No ratings yet
CH 3
33 pages
Arrays and Strings
No ratings yet
Arrays and Strings
15 pages
Strings and Stack Operations (Arrays and Dynamic Memory)
No ratings yet
Strings and Stack Operations (Arrays and Dynamic Memory)
28 pages
The Essential R Reference
From Everand
The Essential R Reference
Mark Gardener
No ratings yet
Ian Talks Regex A-Z
From Everand
Ian Talks Regex A-Z
Ian Eress
No ratings yet
Introduction to PHP, Part 2, Second Edition
From Everand
Introduction to PHP, Part 2, Second Edition
Adam Majczak
No ratings yet
Computer-Aided_Design_in_the_United_States_19491984_Designing_in_a_Closed_World
No ratings yet
Computer-Aided_Design_in_the_United_States_19491984_Designing_in_a_Closed_World
17 pages
Solidworks Swift Technology: Inspiration
No ratings yet
Solidworks Swift Technology: Inspiration
9 pages
Excelandia User Guide V2
No ratings yet
Excelandia User Guide V2
27 pages
Config Padrão de SW HP-V1910
No ratings yet
Config Padrão de SW HP-V1910
6 pages
Strobe and Handshake Signal
No ratings yet
Strobe and Handshake Signal
11 pages
Pmac Plot PDF
No ratings yet
Pmac Plot PDF
25 pages
Excel Formulas
No ratings yet
Excel Formulas
3 pages
Development of Electronic Banking in Bangladesh
100% (2)
Development of Electronic Banking in Bangladesh
7 pages
Goes To Campus: Last Update Desember 2018
No ratings yet
Goes To Campus: Last Update Desember 2018
27 pages
CS101 Solved Subjective Part Midterm File
No ratings yet
CS101 Solved Subjective Part Midterm File
40 pages
RetailManagerUserGuide PDF
No ratings yet
RetailManagerUserGuide PDF
322 pages
Introduction: Databases and Database Users
No ratings yet
Introduction: Databases and Database Users
29 pages
2023 2 1
No ratings yet
2023 2 1
8 pages
CAM2 MEASURE Alignment PDF
No ratings yet
CAM2 MEASURE Alignment PDF
2 pages
Sanyo Ds27530 Service Manual
No ratings yet
Sanyo Ds27530 Service Manual
3 pages
Designing of Rom: - Roms
No ratings yet
Designing of Rom: - Roms
14 pages
Kiran Abinitio
No ratings yet
Kiran Abinitio
66 pages
MSC Data Science Oncampus 2020
No ratings yet
MSC Data Science Oncampus 2020
14 pages
SSM Institute of Engineering and Technology: P.Mohana Karthiga A.P/EEE
No ratings yet
SSM Institute of Engineering and Technology: P.Mohana Karthiga A.P/EEE
7 pages
Math 8 q2w1 Enhanced20pdf
No ratings yet
Math 8 q2w1 Enhanced20pdf
18 pages
wqd10202 Technicalmathii Complex Number PDF
No ratings yet
wqd10202 Technicalmathii Complex Number PDF
33 pages
Expert SQL Server In-Memory OLTP 2nd Edition Dmitri Korotkevitch 2024 scribd download
100% (1)
Expert SQL Server In-Memory OLTP 2nd Edition Dmitri Korotkevitch 2024 scribd download
55 pages
Laudon-Traver Ec10 PPT ch01
No ratings yet
Laudon-Traver Ec10 PPT ch01
31 pages
In-Vehicle Networking: Introduce Class
No ratings yet
In-Vehicle Networking: Introduce Class
9 pages
More Solutions PDF
No ratings yet
More Solutions PDF
59 pages
Itr Shoe Bonded
No ratings yet
Itr Shoe Bonded
2 pages
ccs372 Vir Manual
No ratings yet
ccs372 Vir Manual
120 pages
EDIFACT
No ratings yet
EDIFACT
71 pages
Modern Systems Analysis and Design
No ratings yet
Modern Systems Analysis and Design
35 pages

Working With Text Data in R

Uploaded by

Working With Text Data in R

Uploaded by

> Formatting strings > Detecting matches

Working with text data in R # Format numbers with sprintf()

sprintf("%.3e", pi) # "3.142e+00"

# Highlight string matches in HTML widget with str_view_all()

cards <- data.frame(value = c("8", "Queen", "Ace"),

# Detect if a regex pattern is present in strings with str_detect()

suit = c("Diamonds", "Hearts", "Spades"))

# Find the index of strings that match a regex with str_which()

throughout this cheat sheet. # Ace of Spades

library(glue) # The answer to the

> Splitting strings

# Extract matches from strings with str_extract()

suits <- c("Clubs", "Diamonds", "Hearts", "Spades")

# "D" "i" "a" "m" "o" "n" "d" "s"

# "H" "e" "a" "r" "t" "s"

> Get string lengths and substrings

# Split strings by a separator with str_split()

str_split(suits, pattern = "a")

# Get the number of characters with nchar()

# [3,] "Hea" "e" "a"

# [4,] "pad" "a" "d"

# Get substrings by position with str_sub()

stringr::str_sub(suits, 1, 4) # Returns "Club" "Diam" "Hear" "Spad"

# Get subset of strings that match with str_subset()

str_subset(suits, "d") # "Diamonds" "Spades"

str_trim(" Lost in Whitespace ") # Returns "Lost in Whitespace"

# Split strings into matrix of n pieces with str_split_fixed()

str_split_fixed(suits, pattern = 'a', n = 2)

> Replacing matches

str_trunc(suits, width = 5) # Returns "Clubs" "Di..." "He..." "Sp..."

# [1,] "Clubs" ""

# Pad strings to a constant width with str_pad()

# [2,] "Di" "monds"

# Replace a regex match with another string with str_replace()

# [3,] "He" "rts"

str_replace(suits, "a", "4") # "Clubs" "Di4monds" "He4rts" "Sp4des"

# Pad strings on right with str_pad(side="right")

# Returns "Clubs!!!" "Diamonds" "Hearts!!" "Spades!!"

> Joining or concatenating strings # Replace a substring with `str_sub<-`()

str_sub(suits, start = 1, end = 3) <- c("Bi", "Al", "Yu", "Hi")

suits # Returns "Bibs" "Almonds" "Yurts" "Hides"

paste0(suits, '5') # "Clubs5" "Diamonds5" "Hearts5" "Spades5"

# Convert to lowercase with tolower()

tolower(suits) # Returns "clubs" "diamonds" "hearts" "spades"

# Convert to uppercase with toupper()

toupper(suits) # Returns "CLUBS" "DIAMONDS" "HEARTS" "SPADES"

paste(suits, collapse = ", ") # "Clubs, Diamonds, Hearts, Spades"

to_title_case("hello, world!") # Returns "Hello, World!"

# Duplicate and concatenate strings with str_dup()

# Convert to sentence case with to_sentence_case()

You might also like

str_split(suits, pattern = "a") 

str_split_fixed(suits, pattern = 'a', n = 2) 

tolower(suits) # Returns "clubs" "diamonds" "hearts" "spades" 

toupper(suits) # Returns "CLUBS" "DIAMONDS" "HEARTS" "SPADES" 

to_title_case("hello, world!") # Returns "Hello, World!"