Data Analyst in R
125L   # an integer literal: the L suffix marks an integer in R
What is Literal? | Webopedia
FALSE  # a logical literal
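To confirm what type a literal has, typeof() can be used in the R console; a quick sketch:

typeof(125L)    # "integer"
typeof(125)     # "double"
typeof(FALSE)   # "logical"
typeof("text")  # "character"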
# Arithmetic with numeric literals: a double multiplied by an integer literal (L suffix)
10.50 * 200L
21.15 * 50L
3.20 * 500L

# A second set of calculations written with double literals only
12.10 * 200
24.90 * 50
4.99 * 500
Logical Expressions in R
R: Relational Operators (ethz.ch)
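For reference, a few relational and logical expressions of the kind this lesson covers; each line evaluates to TRUE or FALSE:

25 > 10               # TRUE
10.5 == 10.5          # TRUE
"apple" != "orange"   # TRUE
(25 > 10) & (3 > 7)   # FALSE: & requires both sides to be TRUE
(25 > 10) | (3 > 7)   # TRUE: | requires only one side to be TRUE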
Data Visualization in R
library(readr)
library(dplyr)
library(ggplot2)

life_expec <- read_csv("life_expec.csv")

life_expec %>%
  ggplot()   # an empty plot: no aesthetics are mapped yet

# The column names, filter values, and labels below are assumed; the notes elide them.
life_expec_filter <- life_expec %>%
  filter(Race == "All Races", Sex == "Both Sexes")

life_expec_filter %>%
  ggplot(aes(x = Year, y = Avg_Life_Expec)) +
  geom_line() +
  labs(
    title = "United States Life Expectancy Over Time",
    x = "Year", y = "Average Life Expectancy"
  )
String Manipulation in R: Fundamentals
library(tidyverse)
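A minimal sketch of the stringr basics this lesson covers; the example string is made up for illustration:

library(stringr)

team <- "Golden State Warriors"
str_length(team)              # 21 characters
str_to_upper(team)            # "GOLDEN STATE WARRIORS"
str_split(team, " ")          # splits the string into its three words
str_detect(team, "Warriors")  # TRUE
str_c("Go", team, sep = " ")  # "Go Golden State Warriors"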
import requests

# Make a GET request to the latest ISS position endpoint and store the status code.
response = requests.get("http://api.open-notify.org/iss-now.json")
status_code = response.status_code

# Enter your answer below.
# This endpoint path is missing the ".json" suffix, so the request returns an error status.
response = requests.get("http://api.open-notify.org/iss-pass")
status_code = response.status_code
# Set up the parameters we want to pass to the API (latitude and longitude; the
# values here are assumed, since the notes elide them).
parameters = {"lat": 40.71, "lon": -74}
response = requests.get("http://api.open-notify.org/iss-pass.json", params=parameters)

# Print the content of the response (the data the server returned).
print(response.content)
# The second print in the notes belongs to an equivalent request with the parameters
# encoded directly in the URL; that request is elided here.
print(response.content)
content = response.content
# A Python list of strings (the example values are assumed; the notes only use the variable).
best_food_chains = ["Taco Bell", "Shake Shack", "Chipotle"]
print(type(best_food_chains))

import json
# json.dumps converts the list to a JSON-formatted string.
best_food_chains_string = json.dumps(best_food_chains)
print(type(best_food_chains_string))

# Convert best_food_chains_string back to a list.
print(type(json.loads(best_food_chains_string)))
# Make a dictionary
fast_food_franchise = {
    "Subway": 24722,
    "McDonalds": 14098,
    "Starbucks": 10821,
}

# Dump the dictionary to a JSON string, then load it back into a dictionary.
fast_food_franchise_string = json.dumps(fast_food_franchise)
print(type(fast_food_franchise_string))
fast_food_franchise_2 = json.loads(fast_food_franchise_string)
# Headers is a dictionary
print(response.headers)
content_type = response.headers["content-type"]
# Call the API here.
response = requests.get("http://api.open-notify.org/astros.json")
json_data = response.json()
in_space_count = json_data["number"]
Requests: HTTP for Humans™ — Requests 2.26.0 documentation (python-requests.org)
# Print the content of the response. As you can see, this token corresponds to the account of Vik Paruchuri.
print(response.json())
# Each of the following assumes its own preceding requests.get() call, which the notes
# elide: the user's organizations, the torvalds user profile, a single repository, and
# two pages of paginated results.
orgs = response.json()
torvalds = response.json()
hello_world = response.json()
page1_repos = response.json()
page2_repos = response.json()
# Enter your code here.
# Create the data we'll pass into the API endpoint. While this endpoint only requires
# the "name" key, there are other optional keys.
payload = {"name": "learning-about-apis"}  # assumed payload; the POST request itself is elided

# Status codes from requests that the notes elide.
print(response.status_code)
print(response.status_code)

response = requests.patch("https://api.github.com/repos/VikParuchuri/learning-about-apis",
                          json=payload, headers=headers)
status = response.status_code
user = response.json()
# DELETE removes a repository; a 204 status code means the deletion succeeded.
response = requests.delete("https://api.github.com/repos/VikParuchuri/test", headers=headers)
print(response.status_code)
response = requests.delete("https://api.github.com/repos/VikParuchuri/learning-about-apis",
                           headers=headers)
status = response.status_code
most_upvoted = ""
most_upvotes = 0
ar = article["data"]
most_upvoted = ar["id"]
most_upvotes = ar["ups"]
headers = {"Authorization": "bearer 13426216-4U1ckno9J5AiK72VRbpEeBaMSKk", "User-Agent":
"Dataquest/1.0"}
comments = response.json()

# Payload for voting on a comment: "dir": 1 is an upvote, and "id" identifies the comment.
payload = {"dir": 1, "id": "d16y4ry"}
status = response.status_code
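The POST request that sends this vote is elided in the notes. A minimal sketch of what it might look like, assuming reddit's /api/vote endpoint and the headers defined above (whether the API expects json= or form-encoded data= is not confirmed by the notes):

import requests

# Assumed endpoint; payload and headers come from the lines above.
response = requests.post("https://oauth.reddit.com/api/vote", headers=headers, json=payload)
print(response.status_code)  # a 2xx code means the vote was accepted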
Web Scraping
import requests
from bs4 import BeautifulSoup

# Fetch a simple example page (URL assumed from the course's practice pages) and parse it.
response = requests.get("http://dataquestio.github.io/web-scraping-pages/simple.html")
content = response.content
parser = BeautifulSoup(content, 'html.parser')

# Get the first paragraph inside the body and print its text.
body = parser.find_all("body")
p = body[0].find_all("p")
print(p[0].text)

# The page title lives inside the head tag.
head = parser.find_all("head")
title = head[0].find_all("title")
title_text = title[0].text
# Get the website that contains classes.
response = requests.get("http://dataquestio.github.io/web-scraping-pages/simple_classes.html")
content = response.content
parser = BeautifulSoup(content, 'html.parser')

# The find_all calls below were elided in the notes; the class names are assumed.
first_inner_paragraph = parser.find_all("p", class_="inner-text")[0]
print(first_inner_paragraph.text)
second_inner_paragraph_text = parser.find_all("p", class_="inner-text")[1].text
first_outer_paragraph_text = parser.find_all("p", class_="outer-text")[0].text
# Get the website that contains classes and IDs.
response = requests.get("http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html")
content = response.content
parser = BeautifulSoup(content, 'html.parser')

# CSS selectors: "." selects by class, "#" selects by id.
first_items = parser.select(".first-item")
# Print the text of the first paragraph (the first element with the first-item class).
print(first_items[0].text)
first_outer_text = parser.select(".outer-text")[0].text
second_text = parser.select("#second")[0].text
response = requests.get("http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html")
content = response.content
first_items = parser.select(".first-item")
# Print the text of the first paragraph (the first element with the first-item class).
print(first_items[0].text)
first_outer_text = parser.select(".outer-text")[0].text
second_text = parser.select("#second")[0].text
# Get the Super Bowl box score data.
response = requests.get("http://dataquestio.github.io/web-scraping-pages/2014_super_bowl.html")
content = response.content
parser = BeautifulSoup(content, 'html.parser')

# The turnovers row is identified by id; the second cell (index 1) holds the Seahawks' value.
turnovers = parser.select("#turnovers")[0]
seahawks_turnovers = turnovers.select("td")[1]
seahawks_turnovers_count = seahawks_turnovers.text
print(seahawks_turnovers_count)

patriots_total_plays_count = parser.select("#total-plays")[0].select("td")[2].text
seahawks_total_yards_count = parser.select("#total-yards")[0].select("td")[1].text
HTML basics - Learn web development | MDN (mozilla.org)
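The box-score lookups above repeat one pattern: select a table row by its id, then take a cell by position. A small helper, not part of the original notes, shows that pattern once:

def get_stat(parser, row_id, cell_index):
    """Return the text of the cell at cell_index in the table row with the given id."""
    row = parser.select("#" + row_id)[0]
    return row.select("td")[cell_index].text

# Mirrors the lookups above (index 1 = Seahawks column, index 2 = Patriots column).
seahawks_total_plays_count = get_stat(parser, "total-plays", 1)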
# ans = "B"
# ans = "C"
ans = "A"
best_churn["scaled_tran"] = (best_churn["nr_of_transactions"] \
- best_churn["nr_of_transactions"].min()) \
/ (best_churn["nr_of_transactions"].max() \
- best_churn["nr_of_transactions"].min())
best_churn["scaled_amount"] = (best_churn["amount_spent"] \
-best_churn["amount_spent"].min()) \
/ (best_churn["amount_spent"].max() \
- best_churn["amount_spent"].min())
best_churn["score"] = 100*(.5*best_churn["scaled_tran"] \
+ .5*best_churn["scaled_amount"])
top_50_churned.to_csv("best_customers.txt")
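The two scaling blocks above apply the same min-max formula; a small helper, not in the original notes, shows that pattern once and gives the same result:

def min_max_scale(series):
    """Rescale a pandas Series to the [0, 1] range using min-max scaling."""
    return (series - series.min()) / (series.max() - series.min())

best_churn["scaled_tran"] = min_max_scale(best_churn["nr_of_transactions"])
best_churn["scaled_amount"] = min_max_scale(best_churn["amount_spent"])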
Communicating Results
import pandas as pd
playstore = pd.read_csv("googleplaystore.csv")
print(playstore.shape)
paid.drop_duplicates("App", inplace=True)
print(paid.duplicated("App").sum())
paid.reset_index(inplace=True, drop=True)
affordable_apps = paid[paid["Price"]<50].copy()
affordable_apps["affordability"] = affordable_apps.apply(
axis=1
affordable_apps.loc[reasonable,"price_criterion"] = affordable_apps["Price"].apply(
)
affordable_apps["genre_count"] = affordable_apps["Genres"].str.count(";")+1
genres_mean = affordable_apps.groupby(
["affordability", "genre_count"]
).mean()[["Price"]]
def label_genres(row):
    """Labels the apps that cost less than their segment's mean with 1, otherwise 0."""
    aff = row["affordability"]
    gc = row["genre_count"]
    price = row["Price"]
    # The comparison was elided in the notes; the segment-mean lookup below is assumed.
    if price < genres_mean.loc[(aff, gc)]["Price"]:
        return 1
    else:
        return 0
affordable_apps["genre_criterion"] = affordable_apps.apply(
label_genres, axis="columns"
categories_mean = affordable_apps.groupby(
["affordability", "Category"]
).mean()[["Price"]]
def label_categories(row):
    """Labels the apps that cost less than their segment's mean with 1, otherwise 0."""
    aff = row["affordability"]
    cat = row["Category"]
    price = row["Price"]
    # The comparison was elided in the notes; the segment-mean lookup below is assumed.
    if price < categories_mean.loc[(aff, cat)]["Price"]:
        return 1
    else:
        return 0
affordable_apps["category_criterion"] = affordable_apps.apply(
label_categories, axis="columns"
)
affordable_apps["Result"] = affordable_apps[criteria].mode(axis='columns')
def new_price(row):
    # The return values are elided in the notes; cheap_mean and reasonable_mean are assumed segment mean prices.
    if row["affordability"] == "cheap":
        return round(max(row["Price"], cheap_mean), 2)
    else:
        return round(max(row["Price"], reasonable_mean), 2)

# "Impact" (the revenue effect of the new prices) is computed in a step the notes elide.
total_impact = affordable_apps["Impact"].sum()
print(total_impact)
ans1 = "no"
ans21 = "no"
ans22 = "yes"
import pandas as pd
import datetime as dt

def get_customers(yearmonth):
    # Split the yearmonth integer (e.g. 201503) into its year and month parts.
    year = yearmonth // 100
    month = yearmonth - year * 100
    # The return expression is elided in the notes; subs (a subscriptions table) is assumed.
    date = dt.datetime(year, month, 1)
    return ((subs["start_date"] <= date) & (date <= subs["end_date"])).sum()

churn["total_customers"] = churn["yearmonth"].apply(get_customers)
churn["yearmonth"] = churn["yearmonth"].astype(str)
# The figure/axes creation, the yearmonths list, and the circle annotation are elided in the notes.
from numpy import arange
from matplotlib.patches import Ellipse

# Show every third month on the x axis and strip chart clutter.
ax.set_xticklabels(yearmonths[2::3])
ax.add_artist(circle)
ax.xaxis.label.set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.get_legend().remove()
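For reference, the setup the axis-cleanup lines above assume (figure, bar plot, tick positions, and the ellipse annotation) is elided in the notes; a minimal sketch might look like the following, where the column name, tick spacing, and ellipse position are all placeholder assumptions:

import matplotlib.pyplot as plt
from numpy import arange
from matplotlib.patches import Ellipse

fig, ax = plt.subplots(figsize=(12, 6))
churn.plot.bar(x="yearmonth", y="churn_rate", ax=ax)  # "churn_rate" column assumed

yearmonths = churn["yearmonth"].tolist()
ax.set_xticks(arange(len(yearmonths))[2::3])  # keep every third tick position

# Placeholder center and size for the highlighted region.
circle = Ellipse((20, 0.05), width=8, height=0.02, fill=False, color="red")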
The Mean
set.seed(1)
# The distribution itself is elided in the notes; a small random sample is assumed here.
distribution <- sample(0:100, size = 10)
mean <- mean(distribution)
# The deviations from the mean always sum to zero (rounding handles floating point).
round(sum(distribution - mean)) == 0

compute_mean <- function(distribution) {
  N <- length(distribution)
  sum_of_the_distribution <- 0
  for (i in 1:N) {
    sum_of_the_distribution <- sum_of_the_distribution + distribution[i]
  }
  sum_of_the_distribution / N
}
mean_1 <- compute_mean(distribution_1)
library(readr)
one <- TRUE   # every column that describes years is measured on an interval scale
three <- TRUE # the data set has fewer values than the initial one (3970 rows), and we
              # don't know whether that initial data set represents a population either

compute_mean <- function(distribution) {
  N <- length(distribution)
  sum_of_the_distribution <- 0
  for (i in 1:N) {
    sum_of_the_distribution <- sum_of_the_distribution + distribution[i]
  }
  sum_of_the_distribution / N
}
library(purrr)
library(tibble)
library(ggplot2)
set.seed(4)

# Sampling error (population mean minus sample mean) for increasing sample sizes.
# The sample sizes and the data-frame scaffolding are elided in the notes; the values here are assumed.
sample_sizes <- seq(5, 2930, by = 29)
sampling_errors <- mean(houses$SalePrice) -
  map_dbl(sample_sizes, function(x) mean(sample(houses$SalePrice, size = x)))

ggplot(data = tibble(x = sample_sizes, y = sampling_errors), aes(x = x, y = y)) +
  geom_point(size = 2) +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 2930) +
  labs(x = "Sample size", y = "Sampling error")
library(tibble)
library(ggplot2)
set.seed(1)

# Sampling distribution of the sample mean: repeatedly draw samples of 100 sale prices
# and take their means. The number of replications and the ggplot scaffolding are
# elided in the notes, so the values here are assumptions.
sample_means <- replicate(n = 10000,
                          expr = mean(sample(houses$SalePrice,
                                             size = 100)))
ggplot(data = tibble(x = sample_means), aes(x = x)) +
  geom_histogram(bins = 100,
                 position = "identity",
                 alpha = 0.5) +
  geom_vline(aes(xintercept = mean(houses$SalePrice))) +
  xlab("Sample mean") +
  ylab("Frequency") +
  xlim(0, 500000)
population <- c(3, 7, 2)
library(purrr)
samples <- list(c(3, 7),
c(3, 2),
c(7, 2),
c(7, 3),
c(2, 3),
c(2, 7))
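A quick check, not in the original notes, that the mean of these sample means equals the population mean, which is why the sample mean is an unbiased estimator:

library(purrr)

sample_means <- map_dbl(samples, mean)   # mean of each sample of size 2
mean(sample_means) == mean(population)   # TRUE: both equal 4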
# stripped_dollars (the price column with "$" and "," removed) comes from a cleaning step
# that the notes elide; the fuller version appears below.
dc_listings['price'] = stripped_dollars.astype('float')
mean_price = dc_listings.iloc[0:5]['price'].mean()
print(mean_price)
import pandas as pd
import numpy as np

dc_listings = pd.read_csv('dc_airbnb.csv')
stripped_commas = dc_listings['price'].str.replace(',', '')
stripped_dollars = stripped_commas.str.replace('$', '')  # this step is elided in the notes
dc_listings['price'] = stripped_dollars.astype('float')
dc_listings = dc_listings.loc[np.random.permutation(len(dc_listings))]
def predict_price(new_listing):
    temp_df = dc_listings.copy()
    # Distance on the 'accommodates' column (this step is elided in the notes; the column name is assumed).
    temp_df['distance'] = temp_df['accommodates'].apply(lambda x: np.abs(x - new_listing))
    temp_df = temp_df.sort_values('distance')
    nearest_neighbors = temp_df.iloc[0:5]['price']
    predicted_price = nearest_neighbors.mean()
    return(predicted_price)
acc_one = predict_price(1)
acc_two = predict_price(2)
acc_four = predict_price(4)
print(acc_one)
print(acc_two)
print(acc_four)
k-nearest neighbors algorithm - Wikipedia
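As a closing sketch, not from the notes, predict_price can be generalized to an arbitrary k and evaluated with RMSE on a holdout split; the split sizes and the 'accommodates' feature are assumptions:

import numpy as np

def predict_price_k(new_listing, df, k=5):
    """k-nearest-neighbors price prediction on the 'accommodates' column."""
    temp_df = df.copy()
    temp_df['distance'] = temp_df['accommodates'].apply(lambda x: np.abs(x - new_listing))
    temp_df = temp_df.sort_values('distance')
    return temp_df.iloc[0:k]['price'].mean()

# Simple holdout evaluation: train on the first 2792 rows, test on the rest (split sizes assumed).
train_df = dc_listings.iloc[0:2792]
test_df = dc_listings.iloc[2792:].copy()
test_df['predicted_price'] = test_df['accommodates'].apply(predict_price_k, df=train_df, k=5)
rmse = np.sqrt(((test_df['predicted_price'] - test_df['price']) ** 2).mean())
print(rmse)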