# natural_language_processing.py
# https://raw.githubusercontent.com/joelgrus/data-science-from-scratch/master/code/natural_language_processing.py

from collections import defaultdict, Counter
from bs4 import BeautifulSoup
import random
import re
import requests
def plot_resumes(plt):
    data = [ ("big data", 100, 15), ("Hadoop", 95, 25), ("Python", 75, 50),
             ("R", 50, 40), ("machine learning", 80, 20), ("statistics", 20, 60),
             ("data science", 60, 70), ("analytics", 90, 3),
             ("team player", 85, 85), ("dynamic", 2, 90), ("synergies", 70, 0),
             ("actionable insights", 40, 30), ("think out of the box", 45, 10),
             ("self-starter", 30, 50), ("customer focus", 65, 15),
             ("thought leadership", 35, 35)]

    def text_size(total):
        """equals 8 if total is 0, 28 if total is 200"""
        return 8 + total / 200 * 20

    for word, job_popularity, resume_popularity in data:
        plt.text(job_popularity, resume_popularity, word,
                 ha='center', va='center',
                 size=text_size(job_popularity + resume_popularity))
    plt.xlabel("Popularity on Job Postings")
    plt.ylabel("Popularity on Resumes")
    plt.axis([0, 100, 0, 100])
    plt.show()
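# minimal usage sketch: the function expects the pyplot module itself as its
# argument, e.g.
#
#     import matplotlib.pyplot as plt
#     plot_resumes(plt)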
#
# n-gram models
#
def fix_unicode(text):
    return text.replace(u"\u2019", "'")
def get_document():
    url = "http://radar.oreilly.com/2010/06/what-is-data-science.html"
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html5lib')
    content = soup.find("div", "entry-content")   # find entry-content div
    regex = r"[\w']+|[\.]"                        # matches a word or a period
    document = []
    for paragraph in content("p"):
        words = re.findall(regex, fix_unicode(paragraph.text))
        document.extend(words)
    return document
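# note: 'html5lib' is a separate package (pip install html5lib); BeautifulSoup
# raises bs4.FeatureNotFound if the requested parser isn't installed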
def generate_using_bigrams(transitions):
    current = "."   # this means the next word will start a sentence
    result = []
    while True:
        next_word_candidates = transitions[current]    # bigrams (current, _)
        current = random.choice(next_word_candidates)  # choose one at random
        result.append(current)                         # append it to results
        if current == ".": return " ".join(result)     # if "." we're done
#
# grammars
#
def is_terminal(token):
    return token[0] != "_"
def expand(grammar, tokens):
    for i, token in enumerate(tokens):
        # ignore terminals
        if is_terminal(token): continue
        # pick a production for the first non-terminal at random
        replacement = random.choice(grammar[token])
        if is_terminal(replacement):
            tokens[i] = replacement
        else:
            tokens = tokens[:i] + replacement.split() + tokens[(i+1):]
        return expand(grammar, tokens)
    # if we get here, every token was a terminal, so we're done
    return tokens
def generate_sentence(grammar):
    return expand(grammar, ["_S"])
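# usage sketch: generate_sentence returns a list of terminal words; with the
# grammar defined under __main__ below, one possible derivation is
#   _S -> _NP _VP -> _N _V _NP -> ['Python', 'trains', 'regression']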
#
# Gibbs Sampling
#
def roll_a_die():
    return random.choice([1,2,3,4,5,6])
def direct_sample():
    d1 = roll_a_die()
    d2 = roll_a_die()
    return d1, d1 + d2
def random_y_given_x(x):
    """equally likely to be x + 1, x + 2, ... , x + 6"""
    return x + roll_a_die()
def random_x_given_y(y):
    if y <= 7:
        # if the total is 7 or less, the first die is equally likely to be
        # 1, 2, ..., (total - 1)
        return random.randrange(1, y)
    else:
        # if the total is 7 or more, the first die is equally likely to be
        # (total - 6), (total - 5), ..., 6
        return random.randrange(y - 6, 7)
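# sanity check: for y = 9 this is randrange(3, 7), i.e. x is uniform over
# {3, 4, 5, 6}, since the second die contributes between 1 and 6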
def gibbs_sample(num_iters=100):
    x, y = 1, 2   # initial values don't really matter
    for _ in range(num_iters):
        x = random_x_given_y(y)
        y = random_y_given_x(x)
    return x, y
def compare_distributions(num_samples=1000):
    counts = defaultdict(lambda: [0, 0])
    for _ in range(num_samples):
        counts[gibbs_sample()][0] += 1
        counts[direct_sample()][1] += 1
    return counts
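# usage sketch: both samplers target the same joint distribution over
# (first die, total), so the paired counts should roughly agree:
#
#     for roll, (gibbs, direct) in sorted(compare_distributions(10000).items()):
#         print(roll, gibbs, direct)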
#
# TOPIC MODELING
#
def sample_from(weights):
    """returns i with probability weights[i] / sum(weights)"""
    total = sum(weights)
    rnd = total * random.random()      # uniform between 0 and total
    for i, w in enumerate(weights):
        rnd -= w                       # return the smallest i such that
        if rnd <= 0: return i          # weights[0] + ... + weights[i] >= rnd
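# example: sample_from([1, 1, 3]) returns 0 or 1 with probability 1/5 each,
# and 2 with probability 3/5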
documents = [
["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
["R", "Python", "statistics", "regression", "probability"],
["machine learning", "regression", "decision trees", "libsvm"],
["Python", "R", "Java", "C++", "Haskell", "programming languages"],
["statistics", "probability", "mathematics", "theory"],
["machine learning", "scikit-learn", "Mahout", "neural networks"],
["neural networks", "deep learning", "Big Data", "artificial intelligence"],
["Hadoop", "Java", "MapReduce", "Big Data"],
["statistics", "R", "statsmodels"],
["C++", "deep learning", "artificial intelligence", "probability"],
["pandas", "R", "Python"],
["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
["libsvm", "regression", "support vector machines"]
]
K = 4   # the number of topics we'll try to find
# how many times each topic is assigned to each document
document_topic_counts = [Counter() for _ in documents]

# how many times each word is assigned to each topic
topic_word_counts = [Counter() for _ in range(K)]

# the total number of words assigned to each topic
topic_counts = [0 for _ in range(K)]

# the total number of words in each document
document_lengths = [len(d) for d in documents]

distinct_words = set(word for document in documents for word in document)
W = len(distinct_words)   # number of distinct words

D = len(documents)        # number of documents
def p_topic_given_document(topic, d, alpha=0.1):
    """the fraction of words in document d
    that are assigned to topic (plus some smoothing)"""
    return ((document_topic_counts[d][topic] + alpha) /
            (document_lengths[d] + K * alpha))

def p_word_given_topic(word, topic, beta=0.1):
    """the fraction of words assigned to topic
    that equal word (plus some smoothing)"""
    return ((topic_word_counts[topic][word] + beta) /
            (topic_counts[topic] + W * beta))

def topic_weight(d, word, k):
    """given a document and a word in that document,
    return the weight for the k-th topic"""
    return p_word_given_topic(word, k) * p_topic_given_document(k, d)

def choose_new_topic(d, word):
    return sample_from([topic_weight(d, word, k)
                        for k in range(K)])
random.seed(0)
document_topics = [[random.randrange(K) for word in document]
                   for document in documents]
for d in range(D):
    for word, topic in zip(documents[d], document_topics[d]):
        document_topic_counts[d][topic] += 1
        topic_word_counts[topic][word] += 1
        topic_counts[topic] += 1
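# collapsed Gibbs sampling: repeatedly resample a topic for each word in each
# document, conditioned on all the other current assignments
for iteration in range(1000):
    for d in range(D):
        for i, (word, topic) in enumerate(zip(documents[d],
                                              document_topics[d])):

            # remove this word / topic from the counts
            # so that it doesn't influence the weights
            document_topic_counts[d][topic] -= 1
            topic_word_counts[topic][word] -= 1
            topic_counts[topic] -= 1
            document_lengths[d] -= 1

            # choose a new topic based on the weights
            new_topic = choose_new_topic(d, word)
            document_topics[d][i] = new_topic

            # and now add it back to the counts
            document_topic_counts[d][new_topic] += 1
            topic_word_counts[new_topic][word] += 1
            topic_counts[new_topic] += 1
            document_lengths[d] += 1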
if __name__ == "__main__":

    document = get_document()

    # bigrams: map each word to the list of words that follow it
    transitions = defaultdict(list)
    for prev, current in zip(document, document[1:]):
        transitions[prev].append(current)

    random.seed(0)
    print("bigram sentences")
    for i in range(10):
        print(i, generate_using_bigrams(transitions))
    print()

    # trigrams: condition on the previous two words, and remember which
    # words can start a sentence
    trigram_transitions = defaultdict(list)
    starts = []
    for prev, current, next_word in zip(document, document[1:], document[2:]):
        if prev == ".":              # if the previous "word" was a period
            starts.append(current)   # then this is a start word
        trigram_transitions[(prev, current)].append(next_word)

    print("trigram sentences")
    for i in range(10):
        print(i, generate_using_trigrams(starts, trigram_transitions))
    print()
    grammar = {
        "_S"  : ["_NP _VP"],
        "_NP" : ["_N",
                 "_A _NP _P _A _N"],
        "_VP" : ["_V",
                 "_V _NP"],
        "_N"  : ["data science", "Python", "regression"],
        "_A"  : ["big", "linear", "logistic"],
        "_P"  : ["about", "near"],
        "_V"  : ["learns", "trains", "tests", "is"]
    }

    print("grammar sentences")
    for i in range(10):
        print(i, generate_sentence(grammar))
    print()

    print("gibbs sampling")
    comparison = compare_distributions()
    for roll, (gibbs, direct) in comparison.items():
        print(roll, gibbs, direct)

    # topic MODELING

    for k, word_counts in enumerate(topic_word_counts):
        for word, count in word_counts.most_common():
            if count > 0:
                print(k, word, count)

    topic_names = ["Big Data and programming languages",
                   "databases",
                   "machine learning",
                   "statistics"]

    for document, topic_counts in zip(documents, document_topic_counts):
        print(document)
        for topic, count in topic_counts.most_common():
            if count > 0:
                print(topic_names[topic], count)
        print()