NLP Exp03
Experiment Number 3
Experiment Title To study the N-gram model: calculate bigrams from a given corpus and
compute the probability of a sentence.
Resources / Apparatus Required Hardware: Computer System; Programming language: Python;
Web IDE: Google Colab
Bigrams and the probability of a sentence
A bigram is a pair of consecutive words in a text. A bigram language model estimates the
probability of each word given the word that immediately precedes it, and approximates the
probability of a whole sentence as the product of these bigram probabilities. An (eos) marker
is placed at sentence boundaries so that the first and last words of every sentence also have
a bigram context.
Eg: Corpus - (eos) You book a flight (eos) I read a book (eos) You read (eos)
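Using the maximum-likelihood estimate (the same count ratio computed by the program below),
the bigram and sentence probabilities are:

P(w_n | w_{n-1}) = Count(w_{n-1} w_n) / Count(w_{n-1})
P(w_1 w_2 ... w_N) ≈ P(w_1 | eos) * P(w_2 | w_1) * ... * P(eos | w_N)

For example, in the corpus above the token (eos) occurs 4 times and the pair "(eos) You"
occurs twice, so P(you | eos) = 2/4 = 0.5.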
Bigram Table:
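A bigram count table for the example corpus, worked out here by hand for reference
(row = first word, column = second word); dividing each row by the count of its first word
gives the probability table printed by the program:

        eos  you  book  a  flight  i  read
eos      0    2    0    0    0     1   0
you      0    0    1    0    0     0   1
book     1    0    0    1    0     0   0
a        0    0    1    0    1     0   0
flight   1    0    0    0    0     0   0
i        0    0    0    0    0     0   1
read     1    0    0    1    0     0   0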
Program

import re

bigramProbability = []
uniqueWords = []

def preprocess(corpus):
    # Lower-case the text and mark sentence boundaries with an 'eos' token
    corpus = 'eos ' + corpus.lower()
    corpus = corpus.replace('.', ' eos')
    return corpus

def generate_tokens(corpus):
    # Strip punctuation and split the text into word tokens
    corpus = re.sub(r'[^a-zA-Z0-9\s]', ' ', corpus)
    tokens = [token for token in corpus.split(" ") if token != ""]
    return tokens

def generate_word_counts(wordList):
    # Count how many times each word occurs in the token list
    wordCount = {}
    for word in wordList:
        if word not in wordCount:
            wordCount[word] = 1
        else:
            wordCount[word] += 1
    return wordCount

def generate_ngrams(tokens):
    # Pair every token with the token that follows it to form bigrams
    ngrams = zip(*[tokens[i:] for i in range(2)])
    return [" ".join(ngram) for ngram in ngrams]

def print_probability_table():
    print('\nBigram Probability Table:\n')
    for word in uniqueWords:
        print('\t', word, end=' ')
    print()
    for i in range(len(uniqueWords)):
        print(uniqueWords[i], end=' ')
        probabilities = bigramProbability[i]
        for probability in probabilities:
            print('\t', probability, end=' ')
        print()

def generate_bigram_table(corpus):
    corpus = preprocess(corpus)
    tokens = generate_tokens(corpus)
    wordCount = generate_word_counts(tokens)
    uniqueWords.extend(list(wordCount.keys()))
    bigrams = generate_ngrams(tokens)
    print(bigrams)
    # P(second | first) = count('first second') / count('first')
    for firstWord in uniqueWords:
        probabilityList = []
        for secondWord in uniqueWords:
            bigram = firstWord + ' ' + secondWord
            probability = bigrams.count(bigram) / wordCount[firstWord]
            probabilityList.append(probability)
        bigramProbability.append(probabilityList)
    print_probability_table()

def get_probability(sentence):
    corpus = preprocess(sentence)
    tokens = generate_tokens(corpus)
    probability = 1
    # Multiply the bigram probabilities of consecutive word pairs
    for token in range(len(tokens) - 1):
        firstWord = tokens[token]
        secondWord = tokens[token + 1]
        pairProbability = bigramProbability[uniqueWords.index(firstWord)][uniqueWords.index(secondWord)]
        print('Probability: {1} | {0} = {2}'.format(firstWord, secondWord, pairProbability))
        probability *= pairProbability
    print('Probability of Sentence:', probability)

corpus = 'You book a flight. I read a book. You read.'
example = 'You read a book.'
print('Corpus:', corpus)
generate_bigram_table(corpus)
print('\nSentence:', example)
get_probability(example)
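Note: with this corpus, each bigram of the preprocessed example sentence 'eos you read a book
eos' ('eos you', 'you read', 'read a', 'a book', 'book eos') has probability 0.5, so the
program should report a sentence probability of 0.5^5 = 0.03125.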
Output