Data Analyst in R
125L   # an integer literal: the L suffix marks an integer in R
What is Literal? | Webopedia
FALSE  # a logical literal
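To confirm what type a literal has, typeof() can be used in the R console; a quick sketch:

typeof(125L)    # "integer"
typeof(125)     # "double"
typeof(FALSE)   # "logical"
typeof("text")  # "character"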
# Arithmetic with numeric literals: a double multiplied by an integer literal (L suffix)
10.50 * 200L
21.15 * 50L
3.20 * 500L

# A second set of calculations written with double literals only
12.10 * 200
24.90 * 50
4.99 * 500
Logical Expressions in R
R: Relational Operators (ethz.ch)
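For reference, a few relational and logical expressions of the kind this lesson covers; each line evaluates to TRUE or FALSE:

25 > 10               # TRUE
10.5 == 10.5          # TRUE
"apple" != "orange"   # TRUE
(25 > 10) & (3 > 7)   # FALSE: & requires both sides to be TRUE
(25 > 10) | (3 > 7)   # TRUE: | requires only one side to be TRUE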
Data Visualization in R
library(readr)
library(dplyr)
library(ggplot2)

life_expec <- read_csv("life_expec.csv")

life_expec %>%
  ggplot()   # an empty plot: no aesthetics are mapped yet

# The column names, filter values, and labels below are assumed; the notes elide them.
life_expec_filter <- life_expec %>%
  filter(Race == "All Races", Sex == "Both Sexes")

life_expec_filter %>%
  ggplot(aes(x = Year, y = Avg_Life_Expec)) +
  geom_line() +
  labs(
    title = "United States Life Expectancy Over Time",
    x = "Year", y = "Average Life Expectancy"
  )
String Manipulation in R: Fundamentals
library(tidyverse)
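A minimal sketch of the stringr basics this lesson covers; the example string is made up for illustration:

library(stringr)

team <- "Golden State Warriors"
str_length(team)              # 21 characters
str_to_upper(team)            # "GOLDEN STATE WARRIORS"
str_split(team, " ")          # splits the string into its three words
str_detect(team, "Warriors")  # TRUE
str_c("Go", team, sep = " ")  # "Go Golden State Warriors"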
import requests

# Make a GET request to the latest ISS position endpoint and store the status code.
response = requests.get("http://api.open-notify.org/iss-now.json")
status_code = response.status_code

# Enter your answer below.
# This endpoint path is missing the ".json" suffix, so the request returns an error status.
response = requests.get("http://api.open-notify.org/iss-pass")
status_code = response.status_code
# Set up the parameters we want to pass to the API (latitude and longitude; the
# values here are assumed, since the notes elide them).
parameters = {"lat": 40.71, "lon": -74}
response = requests.get("http://api.open-notify.org/iss-pass.json", params=parameters)

# Print the content of the response (the data the server returned).
print(response.content)
# The second print in the notes belongs to an equivalent request with the parameters
# encoded directly in the URL; that request is elided here.
print(response.content)
content = response.content
# A Python list of strings (the example values are assumed; the notes only use the variable).
best_food_chains = ["Taco Bell", "Shake Shack", "Chipotle"]
print(type(best_food_chains))

import json
# json.dumps converts the list to a JSON-formatted string.
best_food_chains_string = json.dumps(best_food_chains)
print(type(best_food_chains_string))

# Convert best_food_chains_string back to a list.
print(type(json.loads(best_food_chains_string)))
# Make a dictionary
fast_food_franchise = {
    "Subway": 24722,
    "McDonalds": 14098,
    "Starbucks": 10821,
}

# Dump the dictionary to a JSON string, then load it back into a dictionary.
fast_food_franchise_string = json.dumps(fast_food_franchise)
print(type(fast_food_franchise_string))
fast_food_franchise_2 = json.loads(fast_food_franchise_string)
# Headers is a dictionary
print(response.headers)
content_type = response.headers["content-type"]
# Call the API here.
response = requests.get("http://api.open-notify.org/astros.json")
json_data = response.json()
in_space_count = json_data["number"]
Requests: HTTP for Humans™ — Requests 2.26.0 documentation (python-requests.org)
# Print the content of the response. As you can see, this token corresponds to the account of Vik Paruchuri.
print(response.json())
# Each of the following assumes its own preceding requests.get() call, which the notes
# elide: the user's organizations, the torvalds user profile, a single repository, and
# two pages of paginated results.
orgs = response.json()
torvalds = response.json()
hello_world = response.json()
page1_repos = response.json()
page2_repos = response.json()
# Enter your code here.
# Create the data we'll pass into the API endpoint. While this endpoint only requires
# the "name" key, there are other optional keys.
payload = {"name": "learning-about-apis"}  # assumed payload; the POST request itself is elided

# Status codes from requests that the notes elide.
print(response.status_code)
print(response.status_code)

response = requests.patch("https://api.github.com/repos/VikParuchuri/learning-about-apis",
                          json=payload, headers=headers)
status = response.status_code
user = response.json()
# DELETE removes a repository; a 204 status code means the deletion succeeded.
response = requests.delete("https://api.github.com/repos/VikParuchuri/test", headers=headers)
print(response.status_code)
response = requests.delete("https://api.github.com/repos/VikParuchuri/learning-about-apis",
                           headers=headers)
status = response.status_code
most_upvoted = ""
most_upvotes = 0
ar = article["data"]
most_upvoted = ar["id"]
most_upvotes = ar["ups"]
headers = {"Authorization": "bearer 13426216-4U1ckno9J5AiK72VRbpEeBaMSKk", "User-Agent":
"Dataquest/1.0"}
comments = response.json()

# Payload for voting on a comment: "dir": 1 is an upvote, and "id" identifies the comment.
payload = {"dir": 1, "id": "d16y4ry"}
status = response.status_code
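The POST request that sends this vote is elided in the notes. A minimal sketch of what it might look like, assuming reddit's /api/vote endpoint and the headers defined above (whether the API expects json= or form-encoded data= is not confirmed by the notes):

import requests

# Assumed endpoint; payload and headers come from the lines above.
response = requests.post("https://oauth.reddit.com/api/vote", headers=headers, json=payload)
print(response.status_code)  # a 2xx code means the vote was accepted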
Web Scraping
import requests
from bs4 import BeautifulSoup

# Fetch a simple example page (URL assumed from the course's practice pages) and parse it.
response = requests.get("http://dataquestio.github.io/web-scraping-pages/simple.html")
content = response.content
parser = BeautifulSoup(content, 'html.parser')

# Get the first paragraph inside the body and print its text.
body = parser.find_all("body")
p = body[0].find_all("p")
print(p[0].text)

# The page title lives inside the head tag.
head = parser.find_all("head")
title = head[0].find_all("title")
title_text = title[0].text
# Get the website that contains classes.
response = requests.get("http://dataquestio.github.io/web-scraping-pages/simple_classes.html")
content = response.content
parser = BeautifulSoup(content, 'html.parser')

# The find_all calls below were elided in the notes; the class names are assumed.
first_inner_paragraph = parser.find_all("p", class_="inner-text")[0]
print(first_inner_paragraph.text)
second_inner_paragraph_text = parser.find_all("p", class_="inner-text")[1].text
first_outer_paragraph_text = parser.find_all("p", class_="outer-text")[0].text
# Get the website that contains classes and IDs.
response = requests.get("http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html")
content = response.content
parser = BeautifulSoup(content, 'html.parser')

# CSS selectors: "." selects by class, "#" selects by id.
first_items = parser.select(".first-item")
# Print the text of the first paragraph (the first element with the first-item class).
print(first_items[0].text)
first_outer_text = parser.select(".outer-text")[0].text
second_text = parser.select("#second")[0].text
response = requests.get("http://dataquestio.github.io/web-scraping-pages/ids_and_classes.html")
content = response.content
first_items = parser.select(".first-item")
# Print the text of the first paragraph (the first element with the first-item class).
print(first_items[0].text)
first_outer_text = parser.select(".outer-text")[0].text
second_text = parser.select("#second")[0].text
# Get the Super Bowl box score data.
response = requests.get("http://dataquestio.github.io/web-scraping-pages/2014_super_bowl.html")
content = response.content
parser = BeautifulSoup(content, 'html.parser')

# The turnovers row is identified by id; the second cell (index 1) holds the Seahawks' value.
turnovers = parser.select("#turnovers")[0]
seahawks_turnovers = turnovers.select("td")[1]
seahawks_turnovers_count = seahawks_turnovers.text
print(seahawks_turnovers_count)

patriots_total_plays_count = parser.select("#total-plays")[0].select("td")[2].text
seahawks_total_yards_count = parser.select("#total-yards")[0].select("td")[1].text
HTML basics - Learn web development | MDN (mozilla.org)
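The box-score lookups above repeat one pattern: select a table row by its id, then take a cell by position. A small helper, not part of the original notes, shows that pattern once:

def get_stat(parser, row_id, cell_index):
    """Return the text of the cell at cell_index in the table row with the given id."""
    row = parser.select("#" + row_id)[0]
    return row.select("td")[cell_index].text

# Mirrors the lookups above (index 1 = Seahawks column, index 2 = Patriots column).
seahawks_total_plays_count = get_stat(parser, "total-plays", 1)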
# ans = "B"
# ans = "C"
ans = "A"
best_churn["scaled_tran"] = (best_churn["nr_of_transactions"] \
- best_churn["nr_of_transactions"].min()) \
/ (best_churn["nr_of_transactions"].max() \
- best_churn["nr_of_transactions"].min())
best_churn["scaled_amount"] = (best_churn["amount_spent"] \
-best_churn["amount_spent"].min()) \
/ (best_churn["amount_spent"].max() \
- best_churn["amount_spent"].min())
best_churn["score"] = 100*(.5*best_churn["scaled_tran"] \
+ .5*best_churn["scaled_amount"])
top_50_churned.to_csv("best_customers.txt")
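The two scaling blocks above apply the same min-max formula; a small helper, not in the original notes, shows that pattern once and gives the same result:

def min_max_scale(series):
    """Rescale a pandas Series to the [0, 1] range using min-max scaling."""
    return (series - series.min()) / (series.max() - series.min())

best_churn["scaled_tran"] = min_max_scale(best_churn["nr_of_transactions"])
best_churn["scaled_amount"] = min_max_scale(best_churn["amount_spent"])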
Communicating Results
import pandas as pd
playstore = pd.read_csv("googleplaystore.csv")
print(playstore.shape)
paid.drop_duplicates("App", inplace=True)
print(paid.duplicated("App").sum())
paid.reset_index(inplace=True, drop=True)
affordable_apps = paid[paid["Price"]<50].copy()
affordable_apps["affordability"] = affordable_apps.apply(
axis=1
affordable_apps.loc[reasonable,"price_criterion"] = affordable_apps["Price"].apply(
)
affordable_apps["genre_count"] = affordable_apps["Genres"].str.count(";")+1
genres_mean = affordable_apps.groupby(
["affordability", "genre_count"]
).mean()[["Price"]]
def label_genres(row):
    """Labels the apps that cost less than their segment's mean with 1, otherwise 0."""
    aff = row["affordability"]
    gc = row["genre_count"]
    price = row["Price"]
    # The comparison was elided in the notes; the segment-mean lookup below is assumed.
    if price < genres_mean.loc[(aff, gc)]["Price"]:
        return 1
    else:
        return 0
affordable_apps["genre_criterion"] = affordable_apps.apply(
label_genres, axis="columns"
categories_mean = affordable_apps.groupby(
["affordability", "Category"]
).mean()[["Price"]]
def label_categories(row):
    """Labels the apps that cost less than their segment's mean with 1, otherwise 0."""
    aff = row["affordability"]
    cat = row["Category"]
    price = row["Price"]
    # The comparison was elided in the notes; the segment-mean lookup below is assumed.
    if price < categories_mean.loc[(aff, cat)]["Price"]:
        return 1
    else:
        return 0
affordable_apps["category_criterion"] = affordable_apps.apply(
label_categories, axis="columns"
)
affordable_apps["Result"] = affordable_apps[criteria].mode(axis='columns')
def new_price(row):
    # The return values are elided in the notes; cheap_mean and reasonable_mean are assumed segment mean prices.
    if row["affordability"] == "cheap":
        return round(max(row["Price"], cheap_mean), 2)
    else:
        return round(max(row["Price"], reasonable_mean), 2)

# "Impact" (the revenue effect of the new prices) is computed in a step the notes elide.
total_impact = affordable_apps["Impact"].sum()
print(total_impact)
ans1 = "no"
ans21 = "no"
ans22 = "yes"
import pandas as pd
import datetime as dt

def get_customers(yearmonth):
    # Split the yearmonth integer (e.g. 201503) into its year and month parts.
    year = yearmonth // 100
    month = yearmonth - year * 100
    # The return expression is elided in the notes; subs (a subscriptions table) is assumed.
    date = dt.datetime(year, month, 1)
    return ((subs["start_date"] <= date) & (date <= subs["end_date"])).sum()

churn["total_customers"] = churn["yearmonth"].apply(get_customers)
churn["yearmonth"] = churn["yearmonth"].astype(str)
# The figure/axes creation, the yearmonths list, and the circle annotation are elided in the notes.
from numpy import arange
from matplotlib.patches import Ellipse

# Show every third month on the x axis and strip chart clutter.
ax.set_xticklabels(yearmonths[2::3])
ax.add_artist(circle)
ax.xaxis.label.set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.get_legend().remove()
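For reference, the setup the axis-cleanup lines above assume (figure, bar plot, tick positions, and the ellipse annotation) is elided in the notes; a minimal sketch might look like the following, where the column name, tick spacing, and ellipse position are all placeholder assumptions:

import matplotlib.pyplot as plt
from numpy import arange
from matplotlib.patches import Ellipse

fig, ax = plt.subplots(figsize=(12, 6))
churn.plot.bar(x="yearmonth", y="churn_rate", ax=ax)  # "churn_rate" column assumed

yearmonths = churn["yearmonth"].tolist()
ax.set_xticks(arange(len(yearmonths))[2::3])  # keep every third tick position

# Placeholder center and size for the highlighted region.
circle = Ellipse((20, 0.05), width=8, height=0.02, fill=False, color="red")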
The Mean
set.seed(1)
# The distribution itself is elided in the notes; a small random sample is assumed here.
distribution <- sample(0:100, size = 10)
mean <- mean(distribution)
# The deviations from the mean always sum to zero (rounding handles floating point).
round(sum(distribution - mean)) == 0

compute_mean <- function(distribution) {
  N <- length(distribution)
  sum_of_the_distribution <- 0
  for (i in 1:N) {
    sum_of_the_distribution <- sum_of_the_distribution + distribution[i]
  }
  sum_of_the_distribution / N
}
mean_1 <- compute_mean(distribution_1)
library(readr)
one <- TRUE   # every column that describes years is measured on an interval scale
three <- TRUE # the data set has fewer values than the initial one (3970 rows), and we
              # don't know whether that initial data set represents a population either

compute_mean <- function(distribution) {
  N <- length(distribution)
  sum_of_the_distribution <- 0
  for (i in 1:N) {
    sum_of_the_distribution <- sum_of_the_distribution + distribution[i]
  }
  sum_of_the_distribution / N
}
library(purrr)
library(tibble)
library(ggplot2)
set.seed(4)

# Sampling error (population mean minus sample mean) for increasing sample sizes.
# The sample sizes and the data-frame scaffolding are elided in the notes; the values here are assumed.
sample_sizes <- seq(5, 2930, by = 29)
sampling_errors <- mean(houses$SalePrice) -
  map_dbl(sample_sizes, function(x) mean(sample(houses$SalePrice, size = x)))

ggplot(data = tibble(x = sample_sizes, y = sampling_errors), aes(x = x, y = y)) +
  geom_point(size = 2) +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = 2930) +
  labs(x = "Sample size", y = "Sampling error")
library(tibble)
library(ggplot2)
set.seed(1)

# Sampling distribution of the sample mean: repeatedly draw samples of 100 sale prices
# and take their means. The number of replications and the ggplot scaffolding are
# elided in the notes, so the values here are assumptions.
sample_means <- replicate(n = 10000,
                          expr = mean(sample(houses$SalePrice,
                                             size = 100)))
ggplot(data = tibble(x = sample_means), aes(x = x)) +
  geom_histogram(bins = 100,
                 position = "identity",
                 alpha = 0.5) +
  geom_vline(aes(xintercept = mean(houses$SalePrice))) +
  xlab("Sample mean") +
  ylab("Frequency") +
  xlim(0, 500000)
population <- c(3, 7, 2)
library(purrr)
samples <- list(c(3, 7),
c(3, 2),
c(7, 2),
c(7, 3),
c(2, 3),
c(2, 7))
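A quick check, not in the original notes, that the mean of these sample means equals the population mean, which is why the sample mean is an unbiased estimator:

library(purrr)

sample_means <- map_dbl(samples, mean)   # mean of each sample of size 2
mean(sample_means) == mean(population)   # TRUE: both equal 4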
# stripped_dollars (the price column with "$" and "," removed) comes from a cleaning step
# that the notes elide; the fuller version appears below.
dc_listings['price'] = stripped_dollars.astype('float')
mean_price = dc_listings.iloc[0:5]['price'].mean()
print(mean_price)
import pandas as pd
import numpy as np

dc_listings = pd.read_csv('dc_airbnb.csv')
stripped_commas = dc_listings['price'].str.replace(',', '')
stripped_dollars = stripped_commas.str.replace('$', '')  # this step is elided in the notes
dc_listings['price'] = stripped_dollars.astype('float')
dc_listings = dc_listings.loc[np.random.permutation(len(dc_listings))]
def predict_price(new_listing):
    temp_df = dc_listings.copy()
    # Distance on the 'accommodates' column (this step is elided in the notes; the column name is assumed).
    temp_df['distance'] = temp_df['accommodates'].apply(lambda x: np.abs(x - new_listing))
    temp_df = temp_df.sort_values('distance')
    nearest_neighbors = temp_df.iloc[0:5]['price']
    predicted_price = nearest_neighbors.mean()
    return(predicted_price)
acc_one = predict_price(1)
acc_two = predict_price(2)
acc_four = predict_price(4)
print(acc_one)
print(acc_two)
print(acc_four)
k-nearest neighbors algorithm - Wikipedia
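As a closing sketch, not from the notes, predict_price can be generalized to an arbitrary k and evaluated with RMSE on a holdout split; the split sizes and the 'accommodates' feature are assumptions:

import numpy as np

def predict_price_k(new_listing, df, k=5):
    """k-nearest-neighbors price prediction on the 'accommodates' column."""
    temp_df = df.copy()
    temp_df['distance'] = temp_df['accommodates'].apply(lambda x: np.abs(x - new_listing))
    temp_df = temp_df.sort_values('distance')
    return temp_df.iloc[0:k]['price'].mean()

# Simple holdout evaluation: train on the first 2792 rows, test on the rest (split sizes assumed).
train_df = dc_listings.iloc[0:2792]
test_df = dc_listings.iloc[2792:].copy()
test_df['predicted_price'] = test_df['accommodates'].apply(predict_price_k, df=train_df, k=5)
rmse = np.sqrt(((test_df['predicted_price'] - test_df['price']) ** 2).mean())
print(rmse)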