working_with_data.py: code from Joel Grus's "Data Science from Scratch" (https://raw.githubusercontent.com/joelgrus/data-science-from-scratch/master/code/working_with_data.py)

The document contains the book's Python code for working with data: bucketizing points into histograms, comparing a uniform and a normal distribution, drawing from a standard normal, building scatter plots and a scatterplot matrix, computing correlation matrices, safely parsing CSV rows and dicts, grouping and transforming rows, rescaling columns to mean 0 and standard deviation 1, and reducing dimensionality via principal component analysis.



from __future__ import division


from collections import Counter, defaultdict
from functools import partial
from linear_algebra import shape, get_row, get_column, make_matrix, \
    vector_mean, vector_sum, dot, magnitude, vector_subtract, scalar_multiply
from statistics import correlation, standard_deviation, mean
from probability import inverse_normal_cdf
from gradient_descent import maximize_batch, maximize_stochastic
import math, random, csv
import matplotlib.pyplot as plt
import dateutil.parser
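
# note: this file targets Python 2 (print statements, dict.iteritems, and
# opening csv files in "rb" mode); statistics, probability, linear_algebra,
# and gradient_descent are the book's own modules, not the standard library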

def bucketize(point, bucket_size):
    """floor the point to the next lower multiple of bucket_size"""
    return bucket_size * math.floor(point / bucket_size)

def make_histogram(points, bucket_size):
    """buckets the points and counts how many in each bucket"""
    return Counter(bucketize(point, bucket_size) for point in points)
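
# a minimal usage sketch: with bucket_size=10 the points 1, 3, 12, 17, 28
# land in buckets 0.0, 0.0, 10.0, 10.0, 20.0, so
#     make_histogram([1, 3, 12, 17, 28], 10)
# should return Counter({0.0: 2, 10.0: 2, 20.0: 1})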

def plot_histogram(points, bucket_size, title=""):
    histogram = make_histogram(points, bucket_size)
    plt.bar(histogram.keys(), histogram.values(), width=bucket_size)
    plt.title(title)
    plt.show()

def compare_two_distributions():

    random.seed(0)

    uniform = [random.randrange(-100, 101) for _ in range(200)]
    normal = [57 * inverse_normal_cdf(random.random())
              for _ in range(200)]

    plot_histogram(uniform, 10, "Uniform Histogram")
    plot_histogram(normal, 10, "Normal Histogram")

def random_normal():
    """returns a random draw from a standard normal distribution"""
    return inverse_normal_cdf(random.random())
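
# this is inverse transform sampling: random.random() is uniform on (0, 1),
# and pushing a uniform draw through the inverse of the normal cdf yields a
# draw from the standard normal distribution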

xs = [random_normal() for _ in range(1000)]
ys1 = [ x + random_normal() / 2 for x in xs]
ys2 = [-x + random_normal() / 2 for x in xs]

def scatter():
    plt.scatter(xs, ys1, marker='.', color='black', label='ys1')
    plt.scatter(xs, ys2, marker='.', color='gray',  label='ys2')
    plt.xlabel('xs')
    plt.ylabel('ys')
    plt.legend(loc=9)
    plt.show()

def correlation_matrix(data):
    """returns the num_columns x num_columns matrix whose (i, j)th entry
    is the correlation between columns i and j of data"""

    _, num_columns = shape(data)

    def matrix_entry(i, j):
        return correlation(get_column(data, i), get_column(data, j))

    return make_matrix(num_columns, num_columns, matrix_entry)
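
# a quick sanity sketch: every diagonal entry is 1.0 (a column correlates
# perfectly with itself), and for proportional columns such as
#     correlation_matrix([[1, 2], [2, 4], [3, 6]])
# the result should be [[1.0, 1.0], [1.0, 1.0]] up to floating point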

def make_scatterplot_matrix():

    # first, generate some random data

    num_points = 100

    def random_row():
        row = [None, None, None, None]
        row[0] = random_normal()
        row[1] = -5 * row[0] + random_normal()
        row[2] = row[0] + row[1] + 5 * random_normal()
        row[3] = 6 if row[2] > -2 else 0
        return row

    random.seed(0)
    data = [random_row()
            for _ in range(num_points)]

    # then plot it

    _, num_columns = shape(data)
    fig, ax = plt.subplots(num_columns, num_columns)

    for i in range(num_columns):
        for j in range(num_columns):

            # scatter column_j on the x-axis vs column_i on the y-axis
            if i != j: ax[i][j].scatter(get_column(data, j), get_column(data, i))

            # unless i == j, in which case show the series name
            else: ax[i][j].annotate("series " + str(i), (0.5, 0.5),
                                    xycoords='axes fraction',
                                    ha="center", va="center")

            # then hide axis labels except left and bottom charts
            if i < num_columns - 1: ax[i][j].xaxis.set_visible(False)
            if j > 0: ax[i][j].yaxis.set_visible(False)

    # fix the bottom right and top left axis labels, which are wrong because
    # their charts only have text in them
    ax[-1][-1].set_xlim(ax[0][-1].get_xlim())
    ax[0][0].set_ylim(ax[0][1].get_ylim())

    plt.show()

def parse_row(input_row, parsers):
    """given a list of parsers (some of which may be None)
    apply the appropriate one to each element of the input_row"""
    return [parser(value) if parser is not None else value
            for value, parser in zip(input_row, parsers)]

def parse_rows_with(reader, parsers):
    """wrap a reader to apply the parsers to each of its rows"""
    for row in reader:
        yield parse_row(row, parsers)

def try_or_none(f):
    """wraps f to return None if f raises an exception
    assumes f takes only one input"""
    def f_or_none(x):
        try: return f(x)
        except: return None
    return f_or_none
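
# a safer parse_row: same shape as the version above, but each parser is
# wrapped in try_or_none so that unparseable fields become None instead of
# raising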

def parse_row(input_row, parsers):
    return [try_or_none(parser)(value) if parser is not None else value
            for value, parser in zip(input_row, parsers)]
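
# for example, with parsers [dateutil.parser.parse, None, float],
#     parse_row(["6/20/2014", "AAPL", "90.91"], parsers)
# should give [datetime(2014, 6, 20, 0, 0), 'AAPL', 90.91], while a bad row
# like ["n/a", "AAPL", "n/a"] becomes [None, 'AAPL', None]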

def try_parse_field(field_name, value, parser_dict):
    """try to parse value using the appropriate function from parser_dict"""
    parser = parser_dict.get(field_name)   # None if no such entry
    if parser is not None:
        return try_or_none(parser)(value)
    else:
        return value

def parse_dict(input_dict, parser_dict):
    return { field_name : try_parse_field(field_name, value, parser_dict)
             for field_name, value in input_dict.iteritems() }

#
#
# MANIPULATING DATA
#
#

def picker(field_name):
    """returns a function that picks a field out of a dict"""
    return lambda row: row[field_name]

def pluck(field_name, rows):
    """turn a list of dicts into the list of field_name values"""
    return map(picker(field_name), rows)

def group_by(grouper, rows, value_transform=None):
    # key is output of grouper, value is list of rows
    grouped = defaultdict(list)
    for row in rows:
        grouped[grouper(row)].append(row)
    if value_transform is None:
        return grouped
    else:
        return { key : value_transform(rows)
                 for key, rows in grouped.iteritems() }
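
# a minimal usage sketch, with len as the value_transform:
#     rows = [{"city": "NYC", "temp": 80}, {"city": "NYC", "temp": 70},
#             {"city": "LA",  "temp": 90}]
#     group_by(picker("city"), rows, len)
# should return {'NYC': 2, 'LA': 1}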

def percent_price_change(yesterday, today):
    return today["closing_price"] / yesterday["closing_price"] - 1

def day_over_day_changes(grouped_rows):
    # sort the rows by date
    ordered = sorted(grouped_rows, key=picker("date"))
    # zip with an offset to get pairs of consecutive days
    return [{ "symbol" : today["symbol"],
              "date"   : today["date"],
              "change" : percent_price_change(yesterday, today) }
            for yesterday, today in zip(ordered, ordered[1:])]
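
# sketched on two consecutive rows for one symbol (date1 and date2 stand in
# for any two sortable dates):
#     day_over_day_changes([
#         {"symbol": "AAPL", "date": date1, "closing_price": 100},
#         {"symbol": "AAPL", "date": date2, "closing_price": 110}])
# should yield one dict whose "change" is 0.1, a +10% move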

#
#
# RESCALING DATA
#
#

def scale(data_matrix):
    num_rows, num_cols = shape(data_matrix)
    means = [mean(get_column(data_matrix, j))
             for j in range(num_cols)]
    stdevs = [standard_deviation(get_column(data_matrix, j))
              for j in range(num_cols)]
    return means, stdevs

def rescale(data_matrix):
    """rescales the input data so that each column
    has mean 0 and standard deviation 1
    ignores columns with no deviation"""
    means, stdevs = scale(data_matrix)

    def rescaled(i, j):
        if stdevs[j] > 0:
            return (data_matrix[i][j] - means[j]) / stdevs[j]
        else:
            return data_matrix[i][j]

    num_rows, num_cols = shape(data_matrix)
    return make_matrix(num_rows, num_cols, rescaled)
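
# a small sketch of rescaling: in a column [20, 30, 40] the mean is 30 and
# the (sample) standard deviation is 10, so those entries rescale to -1, 0,
# and 1, while a constant column (standard deviation 0) is left untouched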

#
# DIMENSIONALITY REDUCTION
#

X = [
[20.9666776351559,-13.1138080189357],
[22.7719907680008,-19.8890894944696],
[25.6687103160153,-11.9956004517219],
[18.0019794950564,-18.1989191165133],
[21.3967402102156,-10.8893126308196],
[0.443696899177716,-19.7221132386308],
[29.9198322142127,-14.0958668502427],
[19.0805843080126,-13.7888747608312],
[16.4685063521314,-11.2612927034291],
[21.4597664701884,-12.4740034586705],
[3.87655283720532,-17.575162461771],
[34.5713920556787,-10.705185165378],
[13.3732115747722,-16.7270274494424],
[20.7281704141919,-8.81165591556553],
[24.839851437942,-12.1240962157419],
[20.3019544741252,-12.8725060780898],
[21.9021426929599,-17.3225432396452],
[23.2285885715486,-12.2676568419045],
[28.5749111681851,-13.2616470619453],
[29.2957424128701,-14.6299928678996],
[15.2495527798625,-18.4649714274207],
[26.5567257400476,-9.19794350561966],
[30.1934232346361,-12.6272709845971],
[36.8267446011057,-7.25409849336718],
[32.157416823084,-10.4729534347553],
[5.85964365291694,-22.6573731626132],
[25.7426190674693,-14.8055803854566],
[16.237602636139,-16.5920595763719],
[14.7408608850568,-20.0537715298403],
[6.85907008242544,-18.3965586884781],
[26.5918329233128,-8.92664811750842],
[-11.2216019958228,-27.0519081982856],
[8.93593745011035,-20.8261235122575],
[24.4481258671796,-18.0324012215159],
[2.82048515404903,-22.4208457598703],
[30.8803004755948,-11.455358009593],
[15.4586738236098,-11.1242825084309],
[28.5332537090494,-14.7898744423126],
[40.4830293441052,-2.41946428697183],
[15.7563759125684,-13.5771266003795],
[19.3635588851727,-20.6224770470434],
[13.4212840786467,-19.0238227375766],
[7.77570680426702,-16.6385739839089],
[21.4865983854408,-15.290799330002],
[12.6392705930724,-23.6433305964301],
[12.4746151388128,-17.9720169566614],
[23.4572410437998,-14.602080545086],
[13.6878189833565,-18.9687408182414],
[15.4077465943441,-14.5352487124086],
[20.3356581548895,-10.0883159703702],
[20.7093833689359,-12.6939091236766],
[11.1032293684441,-14.1383848928755],
[17.5048321498308,-9.2338593361801],
[16.3303688220188,-15.1054735529158],
[26.6929062710726,-13.306030567991],
[34.4985678099711,-9.86199941278607],
[39.1374291499406,-10.5621430853401],
[21.9088956482146,-9.95198845621849],
[22.2367457578087,-17.2200123442707],
[10.0032784145577,-19.3557700653426],
[14.045833906665,-15.871937521131],

[15.5640911917607,-18.3396956121887],
[24.4771926581586,-14.8715313479137],
[26.533415556629,-14.693883922494],
[12.8722580202544,-21.2750596021509],
[24.4768291376862,-15.9592080959207],
[18.2230748567433,-14.6541444069985],
[4.1902148367447,-20.6144032528762],
[12.4332594022086,-16.6079789231489],
[20.5483758651873,-18.8512560786321],
[17.8180560451358,-12.5451990696752],
[11.0071081078049,-20.3938092335862],
[8.30560561422449,-22.9503944138682],
[33.9857852657284,-4.8371294974382],
[17.4376502239652,-14.5095976075022],
[29.0379635148943,-14.8461553663227],
[29.1344666599319,-7.70862921632672],
[32.9730697624544,-15.5839178785654],
[13.4211493998212,-20.150199857584],
[11.380538260355,-12.8619410359766],
[28.672631499186,-8.51866271785711],
[16.4296061111902,-23.3326051279759],
[25.7168371582585,-13.8899296143829],
[13.3185154732595,-17.8959160024249],
[3.60832478605376,-25.4023343597712],
[39.5445949652652,-11.466377647931],
[25.1693484426101,-12.2752652925707],
[25.2884257196471,-7.06710309184533],
[6.77665715793125,-22.3947299635571],
[20.1844223778907,-16.0427471125407],
[25.5506805272535,-9.33856532270204],
[25.1495682602477,-7.17350567090738],
[15.6978431006492,-17.5979197162642],
[37.42780451491,-10.843637288504],
[22.974620174842,-10.6171162611686],
[34.6327117468934,-9.26182440487384],
[34.7042513789061,-6.9630753351114],
[15.6563953929008,-17.2196961218915],
[25.2049825789225,-14.1592086208169]
]

def de_mean_matrix(A):
    """returns the result of subtracting from every value in A the mean
    value of its column. the resulting matrix has mean 0 in every column"""
    nr, nc = shape(A)
    column_means, _ = scale(A)
    return make_matrix(nr, nc, lambda i, j: A[i][j] - column_means[j])

def direction(w):
    mag = magnitude(w)
    return [w_i / mag for w_i in w]

def directional_variance_i(x_i, w):
    """the variance of the row x_i in the direction w"""
    return dot(x_i, direction(w)) ** 2

def directional_variance(X, w):
    """the variance of the data in the direction w"""
    return sum(directional_variance_i(x_i, w) for x_i in X)

def directional_variance_gradient_i(x_i, w):
    """the contribution of row x_i to the gradient of
    the direction-w variance"""
    projection_length = dot(x_i, direction(w))
    return [2 * projection_length * x_ij for x_ij in x_i]
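
# the return value above is the chain rule applied to dot(x_i, w) ** 2: the
# gradient with respect to w is 2 * dot(x_i, w) * x_i; the code substitutes
# the normalized direction(w) for w, treating the normalization itself as
# fixed when differentiating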

def directional_variance_gradient(X, w):
    return vector_sum(directional_variance_gradient_i(x_i, w) for x_i in X)

def first_principal_component(X):
    guess = [1 for _ in X[0]]
    unscaled_maximizer = maximize_batch(
        partial(directional_variance, X),          # is now a function of w
        partial(directional_variance_gradient, X), # is now a function of w
        guess)
    return direction(unscaled_maximizer)

def first_principal_component_sgd(X):
    guess = [1 for _ in X[0]]
    unscaled_maximizer = maximize_stochastic(
        lambda x, _, w: directional_variance_i(x, w),
        lambda x, _, w: directional_variance_gradient_i(x, w),
        X, [None for _ in X], guess)
    return direction(unscaled_maximizer)

def project(v, w):
    """return the projection of v onto w"""
    coefficient = dot(v, w)
    return scalar_multiply(coefficient, w)

def remove_projection_from_vector(v, w):
    """projects v onto w and subtracts the result from v"""
    return vector_subtract(v, project(v, w))

def remove_projection(X, w):
    """for each row of X
    projects the row onto w, and subtracts the result from the row"""
    return [remove_projection_from_vector(x_i, w) for x_i in X]

def principal_component_analysis(X, num_components):
    components = []
    for _ in range(num_components):
        component = first_principal_component(X)
        components.append(component)
        X = remove_projection(X, component)

    return components

def transform_vector(v, components):
    return [dot(v, w) for w in components]

def transform(X, components):
    return [transform_vector(x_i, components) for x_i in X]
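
# a minimal end-to-end sketch of the pipeline defined above: de-mean the
# data, extract the top components, then re-express each row in component
# space
#     Y = de_mean_matrix(X)
#     components = principal_component_analysis(Y, 2)
#     reduced = transform(Y, components)
# each row of reduced has num_components coordinates, one per component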

if __name__ == "__main__":

    print "correlation(xs, ys1)", correlation(xs, ys1)
    print "correlation(xs, ys2)", correlation(xs, ys2)

    # safe parsing

    data = []

    with open("comma_delimited_stock_prices.csv", "rb") as f:
        reader = csv.reader(f)
        for line in parse_rows_with(reader, [dateutil.parser.parse, None, float]):
            data.append(line)

    for row in data:
        if any(x is None for x in row):
            print row

    print "stocks"
    with open("stocks.txt", "rb") as f:
        reader = csv.DictReader(f, delimiter="\t")
        data = [parse_dict(row, { 'date' : dateutil.parser.parse,
                                  'closing_price' : float })
                for row in reader]

    max_aapl_price = max(row["closing_price"]
                         for row in data
                         if row["symbol"] == "AAPL")
    print "max aapl price", max_aapl_price

    # group rows by symbol
    by_symbol = defaultdict(list)

    for row in data:
        by_symbol[row["symbol"]].append(row)

    # use a dict comprehension to find the max for each symbol
    max_price_by_symbol = { symbol : max(row["closing_price"]
                                         for row in grouped_rows)
                            for symbol, grouped_rows in by_symbol.iteritems() }
    print "max price by symbol"
    print max_price_by_symbol

    # key is symbol, value is list of "change" dicts
    changes_by_symbol = group_by(picker("symbol"), data, day_over_day_changes)

    # collect all "change" dicts into one big list
    all_changes = [change
                   for changes in changes_by_symbol.values()
                   for change in changes]

    print "max change", max(all_changes, key=picker("change"))
    print "min change", min(all_changes, key=picker("change"))

    # to combine percent changes, we add 1 to each, multiply them, and subtract 1
    # for instance, if we combine +10% and -20%, the overall change is
    # (1 + 10%) * (1 - 20%) - 1 = 1.1 * .8 - 1 = -12%
    def combine_pct_changes(pct_change1, pct_change2):
        return (1 + pct_change1) * (1 + pct_change2) - 1

    def overall_change(changes):
        return reduce(combine_pct_changes, pluck("change", changes))

    overall_change_by_month = group_by(lambda row: row['date'].month,
                                       all_changes,
                                       overall_change)
    print "overall change by month"
    print overall_change_by_month

print "rescaling"

data = [[1, 20, 2],


[1, 30, 3],
[1, 40, 4]]

print "original: ", data


print "scale: ", scale(data)
print "rescaled: ", rescale(data)
print

print "PCA"

Y = de_mean_matrix(X)
components = principal_component_analysis(Y, 2)
print "principal components", components
print "first point", Y[0]
print "first point transformed", transform_vector(Y[0], components)
