Professional Documents
Culture Documents
Rohan Panda 1841012123 CSE D IR LAB ASSIGNMENT
Rohan Panda 1841012123 CSE D IR LAB ASSIGNMENT
AND RESEARCH
(SOA Deemed to be University)
Submitted By
Name: SANDEEP BEHERA
Registration No.: 1841012480
Branch :CSE
Semester:7th Section :D
Content
REGD NO - 1841012123
# uncomment following lines to take three numbers from user #num1 = float(input("Enter first number: "))
#num2 = float(input("Enter second number: ")) #num3 = float(input("Enter third number: "))
38.5
#q5 Initialize two matrices and perform the following operations over them using switch: transpose, addition, subtraction, multiplication and division.
# Element-wise addition of two matrices (supporting snippet for q5).
rows = int(input("Enter the Number of rows : "))
column = int(input("Enter the Number of Columns: "))
# `result` was never initialized in the original (NameError on first use);
# create a zero matrix of the requested shape first.
result = [[0] * column for _ in range(rows)]
# NOTE(review): matrix_a and matrix_b must be defined by code not shown here
# — confirm they are rows x column sequences.
for i in range(rows):
    for j in range(column):
        result[i][j] = matrix_a[i][j] + matrix_b[i][j]
def switch():
    """Menu-driven dispatcher: apply +, -, * or / to matrix_a and matrix_b.

    NOTE(review): matrix_a and matrix_b are expected to be defined by code
    not shown here (presumably numpy arrays, since the arithmetic operators
    are applied to whole matrices) — confirm against the missing setup code.
    """
    print("Press 1 for Addittion \nPress 2 for Subtraction \nPress 3 for Multiplication \nPress 4 for Division")
    option = int(input("Enter your option: "))
    if option == 1:
        result = matrix_a + matrix_b
        print("Addition : ", result)
    elif option == 2:
        result = matrix_a - matrix_b
        print("Subtraction : ", result)
    elif option == 3:
        result = matrix_a * matrix_b
        print("Multiplication : ", result)
    elif option == 4:
        result = matrix_a / matrix_b
        print("Division : ", result)
    else:
        print("Invalid Value")

switch()
#Q7 Split a string into substrings (whitespace split).
text = 'Ms Dhoni is my inspiration'
print(text.split())
#q8 Write a Python program to count the number of strings where the string length is 2 or more and the first and last characters are the same, from a given list of strings.
def match_words(words): ctr = 0
#Q9 Sort a list of tuples in increasing order by the last element of each tuple.

def last(n):
    """Sort key: the final element of sequence *n*."""
    return n[-1]

def sort_list_last(tuples):
    """Return *tuples* sorted ascending by each tuple's last element (stable)."""
    return sorted(tuples, key=last)

print(sort_list_last([(2, 5), (1, 2), (4, 4), (2, 3), (2, 1)]))
[(2, 1), (1, 2), (2, 3), (4, 4), (2, 5)]
#Q10 Write a Python program to generate and print a list of the first and last 5 elements, where the values are the squares of the numbers between 1 and 20 (both included).
def printValues():
    """Print the first 5 and the last 5 of the squares of 1..20."""
    l = [i ** 2 for i in range(1, 21)]
    print(l[:5])
    print(l[-5:])

printValues()
#Q11 Convert a list of multiple integers into a single integer
# by concatenating their decimal digits.
L = [11, 33, 50]
print("Original List: ", L)
x = int("".join(map(str, L)))
print("Single Integer: ", x)
#Q13 Write a Python script to generate and print a dictionary that contains a number (between 1 and n) in the form (x, x*x).
n = int(input("Input a number "))
# The original printed an always-empty dict; build {x: x*x for x in 1..n}
# (this matches the captured output for n = 7).
d = dict()
for x in range(1, n + 1):
    d[x] = x * x
print(d)
Input a number 7
{1: 1, 2: 4, 3: 9, 4: 16, 5: 25, 6: 36, 7: 49}
#Q14 Iterate over a dictionary using a for loop.
d = {'Red': 1, 'Green': 2, 'Blue': 3}
for color_key, value in d.items():
    # Use the unpacked value directly; the original re-indexed d[color_key],
    # leaving `value` unused and doing a redundant lookup.
    print(color_key, 'corresponds to ', value)
Red corresponds to 1
Green corresponds to 2
Blue corresponds to 3
#Q15 Create and display all combinations of letters, selecting each letter
# from a different key in a dictionary.
import itertools

d = {'1': ['a', 'b'], '2': ['c', 'd']}
# Keys are sorted so the combination order is deterministic.
for combo in itertools.product(*[d[k] for k in sorted(d.keys())]):
    print(''.join(combo))
ac
ad
bc
bd
#Q16 Unzip a list of tuples into per-position tuples (matrix transpose).
l = [(1, 2), (3, 4), (8, 9)]
unzipped = zip(*l)  # argument unpacking turns rows into columns
print(list(unzipped))
#Q17 Remove empty tuple(s) from a list of tuples.
# NOTE(review): ('d') is a plain string, not a 1-tuple — a tuple literal
# needs a trailing comma: ('d',). Left as-is to preserve behavior.
L = [(), (), ('',), ('a', 'b'), ('a', 'b', 'c'), ('d')]
L = [t for t in L if t]  # falsy entries (empty tuples) are dropped
print(L)
#Q18 Write a Python function that accepts a string and calculates the number of upper case letters and lower case letters.
def string_test(s):
    """Count and print the upper-case and lower-case letters in *s*.

    Non-alphabetic characters (spaces, digits, punctuation) are ignored.
    """
    d = {"UPPER_CASE": 0, "LOWER_CASE": 0}
    for c in s:
        if c.isupper():
            d["UPPER_CASE"] += 1
        elif c.islower():
            d["LOWER_CASE"] += 1
    print("Original String : ", s)
    print("No. of Upper case characters : ", d["UPPER_CASE"])
    print("No. of Lower case Characters : ", d["LOWER_CASE"])

string_test('Heyy My Rowdy Boys And Girls')
IR LAB ASSIGNMENT 2
# qno 1: Initialize the term-document incidence matrix and process the
# boolean query  row1 AND row2 AND NOT row3  (Brutus AND Caesar AND NOT Calpurnia).
doc = ["Anthony & Cleopatra", "Julis Ceaser", "The Tempest", "Hamlet", "Othello", "Macbeth"]
fin = ["Antony", "Brutue", "Ceaser", "Calpumia", "Cleopatra", "Mercy", "Worser"]
# NOTE(review): mat has 8 rows but fin lists only 7 terms — confirm the
# intended term list against the assignment sheet.
mat = []
final_matrix = []
mat.append([1, 1, 0, 0, 0, 1])
mat.append([1, 1, 0, 1, 0, 0])
mat.append([1, 1, 0, 1, 1, 1])
mat.append([0, 1, 0, 0, 0, 0])
mat.append([1, 0, 0, 0, 0, 0])
mat.append([1, 0, 1, 1, 1, 1])
mat.append([1, 0, 1, 1, 1, 0])
mat.append([1, 0, 0, 1, 0, 0])
for x in mat:
    print(x)
# Combine the three term rows column by column.
for x in range(0, 6):
    if mat[1][x] == 1 and mat[2][x] and mat[3][x] != 1:
        final_matrix.append(1)
    else:
        final_matrix.append(0)
print("The Matrix is : ")
print(final_matrix)
print("Final document is : ")
for x in range(6):
    if final_matrix[x] == 1:
        print(doc[x])
#qno 2: Given the four documents below, generate the term-document
# incidence matrix.
Doc1 = "Breakthrough drug for Schizophernia"
Doc2 = "New Schizophemia drug"
Doc3 = "New approach for treatment of Schizophernia"
Doc4 = "New hopes for Schizophernia patients"

# Collect every token, de-duplicate, and sort case-insensitively to get
# the vocabulary.
words = Doc1.split(" ") + Doc2.split(" ") + Doc3.split(" ") + Doc4.split(" ")
words = sorted(set(words), key=str.lower)

# One row per term: [term, in-Doc1, in-Doc2, in-Doc3, in-Doc4] as 0/1 flags.
# Membership uses substring find(), as in the original.
mat = []
for w in words:
    mat.append([w] + [0 if doc.find(w) == -1 else 1
                      for doc in (Doc1, Doc2, Doc3, Doc4)])

print("Term Document Incident Matrix is :\n")
print('{:<15s}{:^8s}{:^8s}{:^8s}{:^8s}'.format("", "Doc1", "Doc2", "Doc3", "Doc4"))
for row in mat:
    print('{:<15s}{:^8d}{:^8d}{:^8d}{:^8d}'.format(row[0], row[1], row[2], row[3], row[4]))
#q3 Construct the inverted index for the documents given below.
Doc1 = "Breakthrough drug for Schizophernia"
Doc2 = "New Schizophemia drug"
Doc3 = "New approach for treatment of Schizophernia"
Doc4 = "New hopes for Schizophernia patients"

words = Doc1.split(" ") + Doc2.split(" ") + Doc3.split(" ") + Doc4.split(" ")
words = sorted(set(words), key=str.lower)

# Each row is [term, ...doc-ids (as strings) that contain the term].
mat = []
for w in words:
    temp = [w]
    if Doc1.find(w) != -1:
        temp.append("1")
    if Doc2.find(w) != -1:
        temp.append("2")
    if Doc3.find(w) != -1:
        temp.append("3")
    if Doc4.find(w) != -1:
        temp.append("4")
    mat.append(temp)

print("Inverted Index is :\n")
for row in mat:
    print(row)
Inverted Index is :
['approach', '3']
['Breakthrough', '1']
['drug', '1', '2']
['for', '1', '3', '4']
['hopes', '4']
['New', '2', '3', '4']
['of', '3']
['patients', '4']
['Schizophemia', '2']
['Schizophernia', '1', '3', '4']
['treatment', '3']
# Rebuild the inverted index, this time prefixing each term with its
# document frequency:  "term [df] -> " followed by the posting list.
# NOTE(review): relies on `words` and Doc1..Doc4 from the q3 block above.
mat = []
for w in words:
    temp = [w]
    c = 0  # document-frequency counter
    if Doc1.find(w) != -1:
        temp.append(1)
        c = c + 1
    if Doc2.find(w) != -1:
        temp.append(2)
        c = c + 1
    if Doc3.find(w) != -1:
        temp.append(3)
        c = c + 1
    if Doc4.find(w) != -1:
        temp.append(4)
        c = c + 1
    temp[0] = temp[0] + " [" + str(c) + "] -> "
    mat.append(temp)
print("Sorted Inverted Index is :\n")
for x in mat:
    for y in x:
        print(y, end=" ")
    print()
#q5 Process the query "Brutus AND Calpurnia" using the intersect (merge)
# algorithm. The original called intersect() without defining it.
def intersect(p1, p2):
    """Linear merge of two sorted posting lists; returns the common doc IDs."""
    answer = []
    i = j = 0
    while i < len(p1) and j < len(p2):
        if p1[i] == p2[j]:
            answer.append(p1[i])
            i += 1
            j += 1
        elif p1[i] < p2[j]:
            i += 1
        else:
            j += 1
    return answer

brutus = [1, 2, 4, 11, 31, 45, 173, 174]
calpurnia = [2, 31, 54, 101]
print(intersect(brutus, calpurnia))
[2, 31]
#q6 Intersect two posting lists using skip pointers (fixed skip length k = 3).
# NOTE(review): the original's separate skip cursors (ip/jp) were mangled by
# extraction and desynchronized from i/j; reconstructed here as the standard
# skip-pointer merge, which yields the same ANSWER for these lists.
T1 = [8, 16, 19, 23, 25, 28, 43, 71, 81]
T2 = [8, 41, 57, 60, 71]
ANSWER = []
i, j, k = 0, 0, 3
while i < len(T1) and j < len(T2):
    if T1[i] == T2[j]:
        ANSWER.append(T1[i])
        i += 1
        j += 1
    elif T1[i] < T2[j]:
        # T1 lags: follow skip pointers while the skip target does not
        # overshoot T2[j]; otherwise advance one step.
        if i + k < len(T1) and T1[i + k] <= T2[j]:
            while i + k < len(T1) and T1[i + k] <= T2[j]:
                i += k
        else:
            i += 1
    else:
        # T2 lags: same, on the other list.
        if j + k < len(T2) and T2[j + k] <= T1[i]:
            while j + k < len(T2) and T2[j + k] <= T1[i]:
                j += k
        else:
            j += 1
print("ANSWER =", ANSWER)
#q7 Skip-pointer intersection with per-list skip lengths k = floor(sqrt(len)).
# NOTE(review): reconstructed as in q6 — the original's ip/jp cursors were
# mangled by extraction.
import math

T1 = [8, 16, 19, 23, 25, 28, 43, 71, 81]
T2 = [8, 41, 57, 60, 71]
ANSWER = []
i, j = 0, 0
k1 = int(math.sqrt(len(T1)))  # skip length for T1 (3 here)
k2 = int(math.sqrt(len(T2)))  # skip length for T2 (2 here)
while i < len(T1) and j < len(T2):
    if T1[i] == T2[j]:
        ANSWER.append(T1[i])
        i += 1
        j += 1
    elif T1[i] < T2[j]:
        if i + k1 < len(T1) and T1[i + k1] <= T2[j]:
            while i + k1 < len(T1) and T1[i + k1] <= T2[j]:
                i += k1
        else:
            i += 1
    else:
        if j + k2 < len(T2) and T2[j + k2] <= T1[i]:
            while j + k2 < len(T2) and T2[j + k2] <= T1[i]:
                j += k2
        else:
            j += 1
print("ANSWER =", ANSWER)
PANDA IR LAB
ASSIGNMENT 3 REGD
NO - 1841012123
#q1 Find all permuterms (rotations with a '$' end marker) of the dictionary
# term "hello", so wildcard queries can be rotated and matched against them.
term = "hello"   # renamed from `str` — the original shadowed the builtin
marker = "$"     # renamed from `str1`
permuterm = []
for i in range(len(term) + 1):
    rotation = term[i:] + marker + term[:i]
    permuterm.append(rotation)
print("All Permutems of string ", term, "is:")
print(permuterm)
print()
#q4 Calculate Jaccard’s co-efficient score between “november” and “december”using tri-gram index method.
import nltk
from nltk.util import ngrams
def intersection(lst1, lst2):
    """Elements of lst1 also present in lst2 (lst1 order, duplicates kept)."""
    common = []
    for value in lst1:
        if value in lst2:
            common.append(value)
    return common
def Union(lst1, lst2):
    """Set union of the two lists, returned as a list (order unspecified)."""
    return list(set(lst1).union(lst2))
p = list(ngrams("november", 3))  # november trigrams
q = list(ngrams("december", 3))  # december trigrams
# Jaccard = |intersection| / |union| of the two trigram lists
# (0.333... here: 3 shared trigrams, 9 distinct in total).
jackard_coeffiencent_score = float(len(intersection(p, q)) / len(Union(p, q)))
print(jackard_coeffiencent_score)
0.3333333333333333
3
3
# Print a name/Soundex table.
# NOTE(review): `list` here is a variable from code not shown (and shadows
# the builtin — rename it); get_soundex is defined further below in the file.
print("NAME\t\tSOUNDEX")
for name in list:
    print("%s\t\t%s" % (name, get_soundex(name)))
NAME SOUNDEX
Spiderman S136
Ironman I655
Captain_america C135
Thor T600
Hulk H420
Hawkeye H200
#q7 Find two differently spelled proper nouns whose Soundex codes are the same.

def get_soundex(name):
    """Return the 4-character Soundex code of *name* (e.g. 'Spiderman' -> 'S136')."""
    name = name.upper()
    soundex = ""
    soundex += name[0]  # the first letter is kept verbatim
    # Soundex digit groups; '.' marks vowels/H/W/Y, which are dropped later
    # but still break runs of identical digits.
    dictionary = {"BFPV": "1", "CGJKQSXZ": "2", "DT": "3", "L": "4",
                  "MN": "5", "R": "6", "AEIOUHWY": "."}
    for char in name[1:]:
        for key in dictionary.keys():
            if char in key:
                code = dictionary[key]
                if code != soundex[-1]:  # collapse adjacent duplicates
                    soundex += code
    soundex = soundex.replace(".", "")
    soundex = soundex[:4].ljust(4, "0")  # pad/truncate to exactly 4 chars
    return soundex

print("Two differently spelled proper nouns whose soundex codes are the same are: ")
l = ["Google", "Goggle"]
print("NAME\t\tSOUNDEX")
for name in l:
    print("%s\t\t%s" % (name, get_soundex(name)))
Two differently spelled proper nouns whose soundex codes are the same
are: NAME SOUNDEX
Google G240
Goggle G240
# q8 Find two phonetically similar proper nouns whose Soundex codes differ.

def get_soundex(name):
    """Return the 4-character Soundex code of *name*."""
    name = name.upper()
    soundex = ""
    soundex += name[0]  # the first letter is kept verbatim
    # Soundex digit groups; '.' marks vowels/H/W/Y (dropped later, but they
    # break runs of identical digits).
    dictionary = {"BFPV": "1", "CGJKQSXZ": "2", "DT": "3", "L": "4",
                  "MN": "5", "R": "6", "AEIOUHWY": "."}
    for char in name[1:]:
        for key in dictionary.keys():
            if char in key:
                code = dictionary[key]
                if code != soundex[-1]:  # collapse adjacent duplicates
                    soundex += code
    soundex = soundex.replace(".", "")
    soundex = soundex[:4].ljust(4, "0")  # pad/truncate to exactly 4 chars
    return soundex

print("Two phonetically similar proper nouns whose soundex codes are different are: ")
l = ["Chebyshev", "Tchebycheff"]
print("NAME\t\t\tSOUNDEX")
for name in l:
    print("%s\t\t%s" % (name, get_soundex(name)))
PANDA IR LAB
ASSIGNMENT 4 REGD
NO 1841012123
#Q1 Write a program to calculate distinct terms present within term collection using Heap’s law.
#AIR CONDITIONING
# Very funny.
#Q2 Consider any text document, pre-process it (tokenization, stop word
# removal, stemming).
#1 TOKENIZATION
import nltk
from nltk.tokenize import word_tokenize  # was missing: word_tokenize was undefined

# String reconstructed from the captured token output; it was truncated by
# PDF extraction — TODO confirm against the original notebook.
doc_trump = ("Mr Trump became president after winning the political election. "
             "Though he lost the support of some republican friends, Trump is "
             "friends with President Putin")
nltk_tokens = word_tokenize(doc_trump)
print(nltk_tokens)
['Mr', 'Trump', 'became', 'president', 'after', 'winning', 'the', 'political', 'election', '.', 'Though', 'he', '
lost', 'the', 'support', 'of', 'some', 'republican', 'friends', ',', 'Trump', 'is', 'friends', 'with', 'President
', 'Putin']
#2 STOP WORD REMOVAL
from nltk.corpus import stopwords        # was missing in the original
from nltk.tokenize import word_tokenize  # was missing in the original

doc_trump = """Mr trump became president after winning the political election,
Though he lost the support of some republican friends, Trump is friends with President Putin"""
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(doc_trump)
# Case-sensitive filter: 'Though'/'Mr' survive while 'he'/'the'/'is' are
# removed, matching the captured output. The computation of
# filtered_sentence was missing from the original.
filtered_sentence = [w for w in word_tokens if w not in stop_words]
print(word_tokens)
print(filtered_sentence)
['Mr', 'trump', 'became', 'president', 'after', 'winning', 'the', 'political', 'election', ',', 'Though', 'he', '
lost', 'the', 'support', 'of', 'some', 'republican', 'friends', ',', 'Trump', 'is', 'friends', 'with', 'President
', 'Putin']
['Mr', 'trump', 'became', 'president', 'winning', 'political', 'election', ',', 'Though', 'lost', 'support', 'rep
ublican', 'friends', ',', 'Trump', 'friends', 'President', 'Putin']
#3 STEMMING
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()
# String reconstructed from the captured stem output; it was truncated by
# PDF extraction — TODO confirm against the original notebook.
doc_trump = ("Mr Trump became president after winning the political election "
             "Though he lost the support of some republican friends, Trump is "
             "friends with President Putin")
words = word_tokenize(doc_trump)
for w in words:
    print(w, " : ", ps.stem(w))
Mr : mr Trump
: trump
became : becam
president : presid
after : after
winning : win
the : the
political : polit
election : elect
Though : though
he : he
lost : lost
the : the
support : support
of : of
some : some
republican : republican
friends : friend
, : ,
Trump : trump
is : is
friends : friend
with : with
President : presid
Putin : putin
# assign documents
# NOTE(review): the three document strings below were truncated mid-literal
# by PDF extraction (unterminated quotes) — restore the full text from the
# original notebook.
doc_trump = "Mr. Trump became president after winning the political election. Though he lost the support of some
doc_election = "President Trump says Putin had no political interference is the election outcome. He says it was
doc_putin = "Post elections, Vladimir Putin became President of Russia. President Putin had served as the Prime
# create object
# NOTE(review): TfidfVectorizer comes from sklearn.feature_extraction.text;
# its import, and the fit_transform call that produces `result`, are not
# shown here — presumably lost in extraction.
tfidf = TfidfVectorizer()
# in matrix form
print('\ntf-idf values in matrix form:') print(result.toarray())
idf values:
after : 1.6931471805599454
as : 1.6931471805599454
became : 1.2876820724517808
by : 1.6931471805599454
career : 1.6931471805599454
claimed : 1.6931471805599454
do : 1.6931471805599454
earlier : 1.6931471805599454
election : 1.2876820724517808
elections : 1.6931471805599454
friend : 1.6931471805599454
friends : 1.6931471805599454
had : 1.2876820724517808
he : 1.2876820724517808
his : 1.6931471805599454
in : 1.6931471805599454
interference : 1.6931471805599454
is : 1.2876820724517808
it : 1.6931471805599454
lost : 1.6931471805599454
minister : 1.6931471805599454
mr : 1.6931471805599454
no : 1.6931471805599454
nothing : 1.6931471805599454
of : 1.2876820724517808
outcome : 1.6931471805599454
parties : 1.6931471805599454
political : 1.0
post : 1.6931471805599454
president : 1.0
prime : 1.6931471805599454
putin : 1.0
republican : 1.6931471805599454
russia : 1.6931471805599454
says : 1.6931471805599454
served : 1.6931471805599454
some : 1.6931471805599454
support : 1.6931471805599454
the : 1.0
though : 1.6931471805599454
to : 1.6931471805599454
trump : 1.2876820724517808
vladimir : 1.6931471805599454
was : 1.6931471805599454
who : 1.6931471805599454
winning : 1.6931471805599454
witchhunt : 1.6931471805599454
with : 1.2876820724517808
Word indexes:
{'mr': 21, 'trump': 41, 'became': 2, 'president': 29, 'after': 0, 'winning': 45, 'the': 38, 'political': 27, 'ele
ction': 8, 'though': 39, 'he': 13, 'lost': 19, 'support': 37, 'of': 24, 'some': 36, 'republican': 32, 'friends':
11, 'is': 17, 'with': 47, 'putin': 31, 'says': 34, 'had': 12, 'no': 22, 'interference': 16, 'outcome': 25, 'it':
18, 'was': 43, 'witchhunt': 46, 'by': 3, 'parties': 26, 'claimed': 5, 'friend': 10, 'who': 44, 'nothing': 23, 'to
': 40, 'do': 6, 'post': 28, 'elections': 9, 'vladimir': 42, 'russia': 33, 'served': 35, 'as': 1, 'prime': 30, 'mi
nister': 20, 'earlier': 7, 'in': 15, 'his': 14, 'career': 4}
tf-idf value:
(0, 31) 0.12805554413157658
(0, 47) 0.164894828456289
(0, 17) 0.164894828456289
(0, 11) 0.4336337670028971
(0, 32) 0.21681688350144854
(0, 36) 0.21681688350144854
(0, 24) 0.164894828456289
(0, 37) 0.21681688350144854
(0, 19) 0.21681688350144854
(0, 13) 0.164894828456289
(0, 39) 0.21681688350144854
(0, 8) 0.164894828456289
(0, 27) 0.12805554413157658
(0, 38) 0.25611108826315315
(0, 45) 0.21681688350144854
(0, 0) 0.21681688350144854
(0, 29) 0.25611108826315315
(0, 2) 0.164894828456289
(0, 41) 0.329789656912578
(0, 21) 0.21681688350144854
(1, 6) 0.17151768417912747
(1, 40) 0.17151768417912747
(1, 23) 0.17151768417912747
(1, 44) 0.17151768417912747
(1, 10) 0.17151768417912747
(1, 27) 0.20260221456046645
(1, 38) 0.20260221456046645
(1, 29) 0.20260221456046645
(1, 41) 0.1304436197642709
(2, 4) 0.24095705423107247
(2, 14) 0.24095705423107247
(2, 15) 0.24095705423107247
(2, 7) 0.24095705423107247
(2, 20) 0.24095705423107247
(2, 30) 0.24095705423107247
(2, 1) 0.24095705423107247
(2, 35) 0.24095705423107247
(2, 33) 0.24095705423107247
(2, 42) 0.24095705423107247
(2, 9) 0.24095705423107247
(2, 28) 0.24095705423107247
(2, 12) 0.18325405052000932
(2, 31) 0.28462623568423023
(2, 24) 0.18325405052000932
(2, 27) 0.14231311784211512
(2, 38) 0.14231311784211512
(2, 29) 0.28462623568423023
(2, 2) 0.18325405052000932
#Q3 Write a program to calculate cosine similarity between any two text documents. # Define the documents
doc_trump = "Mr. Trump became president after winning the political election. Though he lost the support of some
doc_election = "President Trump says Putin had no political interference is the election outcome. He says it was
doc_putin = "Post elections, Vladimir Putin became President of Russia. President Putin had served as the Prime
documents = [doc_trump, doc_election, doc_putin]
# Scikit Learn
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
# OPTIONAL: Convert Sparse Matrix to Pandas Dataframe if you want to see the word frequencies.
doc_term_matrix = sparse_matrix.todense() df = pd.DataFrame(doc_term_matrix,
columns=count_vectorizer.get_feature_names(), index=['doc_trump', 'doc_election', 'doc_putin'])
df
after as became by career claimed do earlier election elections ... the though to trump vladimir was who winning
doc_trump 1 0 1 0 0 0 0 0 1 0 ... 2 1 0 2 0 0 0 1
doc_election 0 0 0 1 0 1 1 0 2 0 ... 2 0 1 1 0 1 1 0
doc_putin 0 1 1 0 1 0 0 1 0 1 ... 1 0 0 0 1 0 0 0
3 rows × 48 columns
Loading [MathJax]/jax/output/CommonHTML/fonts/TeX/fontdata.js