0% found this document useful (0 votes)

119 views

Https Raw - Githubusercontent.com Joelgrus Data-Science-From-Scratch Master Code Natural Language Processing

This document contains Python code for performing various natural language processing tasks including: - Plotting word frequencies on resumes vs job postings - Generating random text using n-gram models - Performing topic modeling on documents using Gibbs sampling - Functions for processing text, building n-gram language models, and generating random sentences are defined.

Uploaded by

gprasadatvu

Available Formats

Download as PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

119 views

Https Raw - Githubusercontent.com Joelgrus Data-Science-From-Scratch Master Code Natural Language Processing

Uploaded by

gprasadatvu

Available Formats

Download as PDF, TXT or read online on Scribd

You are on page 1/ 5

9/26/2017 https://raw.githubusercontent.com/joelgrus/data-science-from-scratch/master/code/natural_language_processing.

from future import division

import math, random, re
from collections import defaultdict, Counter
from bs4 import BeautifulSoup
import requests

def plot_resumes(plt):
data = [ ("big data", 100, 15), ("Hadoop", 95, 25), ("Python", 75, 50),
("R", 50, 40), ("machine learning", 80, 20), ("statistics", 20, 60),
("data science", 60, 70), ("analytics", 90, 3),
("team player", 85, 85), ("dynamic", 2, 90), ("synergies", 70, 0),
("actionable insights", 40, 30), ("think out of the box", 45, 10),
("self-starter", 30, 50), ("customer focus", 65, 15),
("thought leadership", 35, 35)]

def text_size(total):
"""equals 8 if total is 0, 28 if total is 200"""
return 8 + total / 200 * 20

for word, job_popularity, resume_popularity in data:

plt.text(job_popularity, resume_popularity, word,
ha='center', va='center',
size=text_size(job_popularity + resume_popularity))
plt.xlabel("Popularity on Job Postings")
plt.ylabel("Popularity on Resumes")
plt.axis([0, 100, 0, 100])
plt.show()

#
# n-gram models
#

def fix_unicode(text):
return text.replace(u"\u2019", "'")

def get_document():

url = "http://radar.oreilly.com/2010/06/what-is-data-science.html"
html = requests.get(url).text
soup = BeautifulSoup(html, 'html5lib')

content = soup.find("div", "article-body") # find article-body div

regex = r"[\w']+|[\.]" # matches a word or a period

document = []

for paragraph in content("p"):

words = re.findall(regex, fix_unicode(paragraph.text))
document.extend(words)

return document

def generate_using_bigrams(transitions):
current = "." # this means the next word will start a sentence
result = []
while True:
next_word_candidates = transitions[current] # bigrams (current, _)
current = random.choice(next_word_candidates) # choose one at random
result.append(current) # append it to results
if current == ".": return " ".join(result) # if "." we're done

def generate_using_trigrams(starts, trigram_transitions):

current = random.choice(starts) # choose a random starting word
prev = "." # and precede it with a '.'
result = [current]
while True:
next_word_candidates = trigram_transitions[(prev, current)]
next = random.choice(next_word_candidates)
https://raw.githubusercontent.com/joelgrus/data-science-from-scratch/master/code/natural_language_processing.py 1/5
9/26/2017 https://raw.githubusercontent.com/joelgrus/data-science-from-scratch/master/code/natural_language_processing.py

prev, current = current, next

result.append(current)

if current == ".":
return " ".join(result)

def is_terminal(token):
return token[0] != "_"

def expand(grammar, tokens):

for i, token in enumerate(tokens):

# ignore terminals
if is_terminal(token): continue

# choose a replacement at random

replacement = random.choice(grammar[token])

if is_terminal(replacement):
tokens[i] = replacement
else:
tokens = tokens[:i] + replacement.split() + tokens[(i+1):]
return expand(grammar, tokens)

# if we get here we had all terminals and are done

return tokens

def generate_sentence(grammar):
return expand(grammar, ["_S"])

#
# Gibbs Sampling
#

def roll_a_die():
return random.choice([1,2,3,4,5,6])

def direct_sample():
d1 = roll_a_die()
d2 = roll_a_die()
return d1, d1 + d2

def random_y_given_x(x):
"""equally likely to be x + 1, x + 2, ... , x + 6"""
return x + roll_a_die()

def random_x_given_y(y):
if y <= 7:
# if the total is 7 or less, the first die is equally likely to be
# 1, 2, ..., (total - 1)
return random.randrange(1, y)
else:
# if the total is 7 or more, the first die is equally likely to be
# (total - 6), (total - 5), ..., 6
return random.randrange(y - 6, 7)

def gibbs_sample(num_iters=100):
x, y = 1, 2 # doesn't really matter
for _ in range(num_iters):
x = random_x_given_y(y)
y = random_y_given_x(x)
return x, y

def compare_distributions(num_samples=1000):
counts = defaultdict(lambda: [0, 0])
for _ in range(num_samples):
counts[gibbs_sample()][0] += 1
counts[direct_sample()][1] += 1

https://raw.githubusercontent.com/joelgrus/data-science-from-scratch/master/code/natural_language_processing.py 2/5
9/26/2017 https://raw.githubusercontent.com/joelgrus/data-science-from-scratch/master/code/natural_language_processing.py

return counts

#
# TOPIC MODELING
#

def sample_from(weights):
total = sum(weights)
rnd = total * random.random() # uniform between 0 and total
for i, w in enumerate(weights):
rnd -= w # return the smallest i such that
if rnd <= 0: return i # sum(weights[:(i+1)]) >= rnd

documents = [
["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
["R", "Python", "statistics", "regression", "probability"],
["machine learning", "regression", "decision trees", "libsvm"],
["Python", "R", "Java", "C++", "Haskell", "programming languages"],
["statistics", "probability", "mathematics", "theory"],
["machine learning", "scikit-learn", "Mahout", "neural networks"],
["neural networks", "deep learning", "Big Data", "artificial intelligence"],
["Hadoop", "Java", "MapReduce", "Big Data"],
["statistics", "R", "statsmodels"],
["C++", "deep learning", "artificial intelligence", "probability"],
["pandas", "R", "Python"],
["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
["libsvm", "regression", "support vector machines"]
]

K = 4

document_topic_counts = [Counter()
for _ in documents]

topic_word_counts = [Counter() for _ in range(K)]

topic_counts = [0 for _ in range(K)]

document_lengths = map(len, documents)

distinct_words = set(word for document in documents for word in document)

W = len(distinct_words)

D = len(documents)

def p_topic_given_document(topic, d, alpha=0.1):

"""the fraction of words in document _d_
that are assigned to _topic_ (plus some smoothing)"""

return ((document_topic_counts[d][topic] + alpha) /

(document_lengths[d] + K * alpha))

def p_word_given_topic(word, topic, beta=0.1):

"""the fraction of words assigned to _topic_
that equal _word_ (plus some smoothing)"""

return ((topic_word_counts[topic][word] + beta) /

(topic_counts[topic] + W * beta))

def topic_weight(d, word, k):

"""given a document and a word in that document,
return the weight for the k-th topic"""

return p_word_given_topic(word, k) * p_topic_given_document(k, d)

def choose_new_topic(d, word):

return sample_from([topic_weight(d, word, k)

https://raw.githubusercontent.com/joelgrus/data-science-from-scratch/master/code/natural_language_processing.py 3/5
9/26/2017 https://raw.githubusercontent.com/joelgrus/data-science-from-scratch/master/code/natural_language_processing.py

for k in range(K)])

random.seed(0)
document_topics = [[random.randrange(K) for word in document]
for document in documents]

for d in range(D):
for word, topic in zip(documents[d], document_topics[d]):
document_topic_counts[d][topic] += 1
topic_word_counts[topic][word] += 1
topic_counts[topic] += 1

for iter in range(1000):

for d in range(D):
for i, (word, topic) in enumerate(zip(documents[d],
document_topics[d])):

# remove this word / topic from the counts

# so that it doesn't influence the weights
document_topic_counts[d][topic] -= 1
topic_word_counts[topic][word] -= 1
topic_counts[topic] -= 1
document_lengths[d] -= 1

# choose a new topic based on the weights

new_topic = choose_new_topic(d, word)
document_topics[d][i] = new_topic

# and now add it back to the counts

document_topic_counts[d][new_topic] += 1
topic_word_counts[new_topic][word] += 1
topic_counts[new_topic] += 1
document_lengths[d] += 1

if __name__ == "__main__":

document = get_document()

bigrams = zip(document, document[1:])

transitions = defaultdict(list)
for prev, current in bigrams:
transitions[prev].append(current)

random.seed(0)
print "bigram sentences"
for i in range(10):
print i, generate_using_bigrams(transitions)
print

# trigrams

trigrams = zip(document, document[1:], document[2:])

trigram_transitions = defaultdict(list)
starts = []

for prev, current, next in trigrams:

if prev == ".": # if the previous "word" was a period

starts.append(current) # then this is a start word

trigram_transitions[(prev, current)].append(next)

print "trigram sentences"

for i in range(10):
print i, generate_using_trigrams(starts, trigram_transitions)
print

grammar = {

https://raw.githubusercontent.com/joelgrus/data-science-from-scratch/master/code/natural_language_processing.py 4/5
9/26/2017 https://raw.githubusercontent.com/joelgrus/data-science-from-scratch/master/code/natural_language_processing.py

"_S" : ["_NP _VP"],

"_NP" : ["_N",
"_A _NP _P _A _N"],
"_VP" : ["_V",
"_V _NP"],
"_N" : ["data science", "Python", "regression"],
"_A" : ["big", "linear", "logistic"],
"_P" : ["about", "near"],
"_V" : ["learns", "trains", "tests", "is"]
}

print "grammar sentences"

for i in range(10):
print i, " ".join(generate_sentence(grammar))
print

print "gibbs sampling"

comparison = compare_distributions()
for roll, (gibbs, direct) in comparison.iteritems():
print roll, gibbs, direct

# topic MODELING

for k, word_counts in enumerate(topic_word_counts):

for word, count in word_counts.most_common():
if count > 0: print k, word, count

topic_names = ["Big Data and programming languages",

"Python and statistics",
"databases",
"machine learning"]

for document, topic_counts in zip(documents, document_topic_counts):

print document
for topic, count in topic_counts.most_common():
if count > 0:
print topic_names[topic], count,
print

https://raw.githubusercontent.com/joelgrus/data-science-from-scratch/master/code/natural_language_processing.py 5/5

NEWS2 Chart 3 - NEWS Observation Chart - 0 PDF
No ratings yet
NEWS2 Chart 3 - NEWS Observation Chart - 0 PDF
1 page
APA DOC No. 2104
100% (1)
APA DOC No. 2104
409 pages
Evaluation of BIRCH Clustering Algorithm For Big Data
No ratings yet
Evaluation of BIRCH Clustering Algorithm For Big Data
5 pages
Introduction To Natural Language Processing (NLP)
No ratings yet
Introduction To Natural Language Processing (NLP)
87 pages
PYDS 3150713 Unit-2
No ratings yet
PYDS 3150713 Unit-2
38 pages
DA Masters 5523
No ratings yet
DA Masters 5523
30 pages
Natural Language processing-Regular-HO
No ratings yet
Natural Language processing-Regular-HO
10 pages
SDET
No ratings yet
SDET
4 pages
Big Data: by It Faculty Alttc Ghaziabad
No ratings yet
Big Data: by It Faculty Alttc Ghaziabad
26 pages
Introduction To Natural Language Processing (NLP) : Dr. Sukhnandan Kaur Tiet
No ratings yet
Introduction To Natural Language Processing (NLP) : Dr. Sukhnandan Kaur Tiet
51 pages
Pds Question Bank
No ratings yet
Pds Question Bank
5 pages
Big Data Hadoop Certification Training Course
No ratings yet
Big Data Hadoop Certification Training Course
12 pages
Data Science New
No ratings yet
Data Science New
9 pages
Streamlit PDF Application Setup All Commands in One Single File
No ratings yet
Streamlit PDF Application Setup All Commands in One Single File
8 pages
Data Science Training in Hyderabad
No ratings yet
Data Science Training in Hyderabad
7 pages
ARTIFICIAl iNTELLIGENCE Unit III &iv
No ratings yet
ARTIFICIAl iNTELLIGENCE Unit III &iv
39 pages
Data Analysis Library: by Muthu Priya J 19MZ06
No ratings yet
Data Analysis Library: by Muthu Priya J 19MZ06
3 pages
Job Description For AI-ML Developer
No ratings yet
Job Description For AI-ML Developer
1 page
Natural Language Processing Professional Program
No ratings yet
Natural Language Processing Professional Program
13 pages
1 Month Big Data Boot Camp
No ratings yet
1 Month Big Data Boot Camp
6 pages
DA Unit - IV
No ratings yet
DA Unit - IV
216 pages
Unit5_AI_Top AIML Tools
No ratings yet
Unit5_AI_Top AIML Tools
15 pages
PR ZXV
No ratings yet
PR ZXV
8 pages
Natural Language Processing in Investigative Journalism
No ratings yet
Natural Language Processing in Investigative Journalism
53 pages
Python GTU Study Material Presentations Unit-2 24072020062038AM
No ratings yet
Python GTU Study Material Presentations Unit-2 24072020062038AM
18 pages
Part3 ML
No ratings yet
Part3 ML
201 pages
Dms - 5e147898f022bDS and ML With Python Libraries
No ratings yet
Dms - 5e147898f022bDS and ML With Python Libraries
2 pages
Assignment - Machine Learning
No ratings yet
Assignment - Machine Learning
3 pages
Machine Learning Python Packages
No ratings yet
Machine Learning Python Packages
9 pages
PG (Purdue) Data Science
No ratings yet
PG (Purdue) Data Science
30 pages
Hadoop (Big Data) : Skills Gained
No ratings yet
Hadoop (Big Data) : Skills Gained
8 pages
Siva Ram Korakutty
No ratings yet
Siva Ram Korakutty
6 pages
Natural Language Processing
No ratings yet
Natural Language Processing
30 pages
Anaconda Training: Data Science Foundations
No ratings yet
Anaconda Training: Data Science Foundations
5 pages
Slide 11 Spark ML
No ratings yet
Slide 11 Spark ML
153 pages
Natural Language Processing
No ratings yet
Natural Language Processing
12 pages
Generative AI Research Papers
No ratings yet
Generative AI Research Papers
3 pages
Data Science For Agriculture
No ratings yet
Data Science For Agriculture
5 pages
Full Python AI Article
No ratings yet
Full Python AI Article
7 pages
Unit 5 - Compiler Design - WWW - Rgpvnotes.in
No ratings yet
Unit 5 - Compiler Design - WWW - Rgpvnotes.in
20 pages
Big Data Syllabus For Theory and Lab
No ratings yet
Big Data Syllabus For Theory and Lab
4 pages
Natural Language Processing Rahul Sahai
No ratings yet
Natural Language Processing Rahul Sahai
30 pages
Data Manipulation With Pandas
No ratings yet
Data Manipulation With Pandas
147 pages
New Dhanesh Kumar Solanki
No ratings yet
New Dhanesh Kumar Solanki
5 pages
Practical 1to10
No ratings yet
Practical 1to10
32 pages
Pytorch: Tensors and Datasets
No ratings yet
Pytorch: Tensors and Datasets
9 pages
Thyroid Disease Classification Using Machine Learning Project
No ratings yet
Thyroid Disease Classification Using Machine Learning Project
34 pages
School of Computer Science: Python For ML/Al Internship
No ratings yet
School of Computer Science: Python For ML/Al Internship
12 pages
Text Processing, Tokenization & Characteristics
No ratings yet
Text Processing, Tokenization & Characteristics
89 pages
Chapter4 (The Evaluating Multiple Models Chapter Is Really Good!)
No ratings yet
Chapter4 (The Evaluating Multiple Models Chapter Is Really Good!)
47 pages
DSML ShrreyaBehll CV
No ratings yet
DSML ShrreyaBehll CV
2 pages
NLP Lab Tasks
No ratings yet
NLP Lab Tasks
16 pages
ML_Pipelines_AI_Community
No ratings yet
ML_Pipelines_AI_Community
53 pages
NLP U5
No ratings yet
NLP U5
26 pages
20191216134846D3338 - COMP6579 Session 10 - Big Data Analytics (Apache Spark - SparkML)
No ratings yet
20191216134846D3338 - COMP6579 Session 10 - Big Data Analytics (Apache Spark - SparkML)
42 pages
PPT1
No ratings yet
PPT1
93 pages
PPB ML Notes
No ratings yet
PPB ML Notes
54 pages
ML - Expectation-Maximization Algorithm
No ratings yet
ML - Expectation-Maximization Algorithm
3 pages
Statistics For Data Science
No ratings yet
Statistics For Data Science
30 pages
(Excerpts From) Investigating Performance: Design and Outcomes With Xapi
From Everand
(Excerpts From) Investigating Performance: Design and Outcomes With Xapi
Janet Laane Effron
No ratings yet
The Definitive Guide to Data Integration: Unlock the power of data integration to efficiently manage, transform, and analyze data
From Everand
The Definitive Guide to Data Integration: Unlock the power of data integration to efficiently manage, transform, and analyze data
Pierre-yves Bonnefoy
No ratings yet
Optimizing Hadoop for MapReduce
From Everand
Optimizing Hadoop for MapReduce
Khaled Tannir
No ratings yet
Hall Ticket Cum Test Fee Receipt: Important Instructions
No ratings yet
Hall Ticket Cum Test Fee Receipt: Important Instructions
1 page
Regex - Extract Pattern From String, Strip Text, Convert To Numeric and Sum in R Data
No ratings yet
Regex - Extract Pattern From String, Strip Text, Convert To Numeric and Sum in R Data
2 pages
Human Resource Planning IGNOU All in One
No ratings yet
Human Resource Planning IGNOU All in One
202 pages
Cheat Sheets of Python Libraries Tensorflow
No ratings yet
Cheat Sheets of Python Libraries Tensorflow
3 pages
Https Raw - Githubusercontent.com Joelgrus Data-Science-From-Scratch Master Code Working With Data
No ratings yet
Https Raw - Githubusercontent.com Joelgrus Data-Science-From-Scratch Master Code Working With Data
7 pages
RPubs - Text-Mining With Rvest and Qdap
No ratings yet
RPubs - Text-Mining With Rvest and Qdap
17 pages
Know Thy Complexities!: Big-O Complexity Chart
No ratings yet
Know Thy Complexities!: Big-O Complexity Chart
2 pages
Q
No ratings yet
Q
2 pages
SDFG
No ratings yet
SDFG
4 pages
MSC in Applied Data Science & Big Data - Data ScienceTech Institute
No ratings yet
MSC in Applied Data Science & Big Data - Data ScienceTech Institute
8 pages
Infographic Study Abroad en
No ratings yet
Infographic Study Abroad en
1 page
Course Outline: SC Ence
No ratings yet
Course Outline: SC Ence
1 page
Lab 1 Compresive Test PDF
No ratings yet
Lab 1 Compresive Test PDF
10 pages
Final Drawing: Anti-Fouling System
No ratings yet
Final Drawing: Anti-Fouling System
11 pages
Include
No ratings yet
Include
3 pages
FS5 Field Study Learning Episode: The K To 12 Grading System
No ratings yet
FS5 Field Study Learning Episode: The K To 12 Grading System
9 pages
Max Performance Cheat Code
No ratings yet
Max Performance Cheat Code
19 pages
Economics of Food Safety: Basic
No ratings yet
Economics of Food Safety: Basic
50 pages
Inf Sta3
No ratings yet
Inf Sta3
15 pages
Australia New Zealand Food Standards Code - Standard 1.2.7 - Nutrition, Health and Related Claims
No ratings yet
Australia New Zealand Food Standards Code - Standard 1.2.7 - Nutrition, Health and Related Claims
12 pages
9-The Three-Point Estimating Technique
100% (1)
9-The Three-Point Estimating Technique
4 pages
Bahasa Inggris
No ratings yet
Bahasa Inggris
2 pages
Measurement of Force, Torque and Shaft Power
No ratings yet
Measurement of Force, Torque and Shaft Power
12 pages
Loan Policy 1
No ratings yet
Loan Policy 1
13 pages
Service Manual: Nsx-Ds8
No ratings yet
Service Manual: Nsx-Ds8
44 pages
Literature Review
100% (1)
Literature Review
58 pages
Hrf628af6 PDF
No ratings yet
Hrf628af6 PDF
34 pages
Mat 102
No ratings yet
Mat 102
1 page
Blockchain: Research and Applications: Lodovica Marchesi, Michele Marchesi, Roberto Tonelli, Maria Ilaria Lunesu
No ratings yet
Blockchain: Research and Applications: Lodovica Marchesi, Michele Marchesi, Roberto Tonelli, Maria Ilaria Lunesu
13 pages
Clase 2 Rptas
No ratings yet
Clase 2 Rptas
45 pages
JCU Practice Questions 1
No ratings yet
JCU Practice Questions 1
6 pages
Unit 3 Dreams
0% (1)
Unit 3 Dreams
3 pages
Unisyst Quarter Report 30-09-2024
No ratings yet
Unisyst Quarter Report 30-09-2024
10 pages
Invertis University, Bareilly: First Shift - Block 3 Second Shift - Block 1 & 2 Odd Semester Examination 2018-19
No ratings yet
Invertis University, Bareilly: First Shift - Block 3 Second Shift - Block 1 & 2 Odd Semester Examination 2018-19
14 pages
Server-Side Web Programming: Introduction To Sessions
No ratings yet
Server-Side Web Programming: Introduction To Sessions
24 pages
A Study On Perception of Life Insurance Agency As A Career For Bajaj Allianz Life Insurance Company Limited"
0% (1)
A Study On Perception of Life Insurance Agency As A Career For Bajaj Allianz Life Insurance Company Limited"
12 pages
BMEP Plus v1 0
100% (1)
BMEP Plus v1 0
6 pages
Machine Maintenance Log Sample
100% (2)
Machine Maintenance Log Sample
24 pages
Specifications: Product Description
No ratings yet
Specifications: Product Description
2 pages
Classification of Articulators: Awni Rihani, D.D.S., M.Sc.
No ratings yet
Classification of Articulators: Awni Rihani, D.D.S., M.Sc.
4 pages