
ML Programs in Python


Program 1: Find-S algorithm

import numpy as np
import pandas as pd

data = pd.read_csv('1finds.csv')
print(data)

concepts = data.iloc[:, 0:-1].values
print("-------------------------------------------")
print(concepts)
print("----------------------------------------")
target = data.iloc[:, -1].values
print(target)

def train(concepts, target):
    count = 0
    # Initialise the specific hypothesis to the first training instance
    # (copy so the underlying data array is not mutated)
    specific_h = concepts[0].copy()
    for i, h in enumerate(concepts):
        print(i)
        print(h)
        if target[i] == "Yes":
            # Positive example: generalise every attribute that disagrees
            for x in range(len(specific_h)):
                if h[x] != specific_h[x]:
                    specific_h[x] = "?"
            count = count + 1
            print(f"Hypothesis after sample number:{count} processed: {specific_h}")
        else:
            # Negative example: Find-S ignores it
            count = count + 1
            print(f"Negative sample number:{count} Same Hypothesis: {specific_h}")
    return specific_h

specific_h = train(concepts, target)
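
For reference, a minimal sketch of what '1finds.csv' is assumed to look like: the classic EnjoySport examples with a header row, six attribute columns, and a Yes/No target (the column names here are illustrative, not taken from the original file):

import csv

rows = [
    ["Sky", "AirTemp", "Humidity", "Wind", "Water", "Forecast", "EnjoySport"],
    ["Sunny", "Warm", "Normal", "Strong", "Warm", "Same", "Yes"],
    ["Sunny", "Warm", "High", "Strong", "Warm", "Same", "Yes"],
    ["Rainy", "Cold", "High", "Strong", "Warm", "Change", "No"],
    ["Sunny", "Warm", "High", "Strong", "Cool", "Change", "Yes"],
]
with open("1finds.csv", "w", newline="") as f:
    csv.writer(f).writerows(rows)
# On this data Find-S converges to ['Sunny' 'Warm' '?' 'Strong' '?' '?']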
List-Then-Eliminate:

import pandas as pd

data = pd.read_csv("weather_dataset.csv")
data.head(10)
print(data)

def package_hypothesis(hypothesis, outcome):
    ln = dict()
    ln['hypothesis'] = hypothesis
    ln['outcome'] = outcome
    return ln

# Test hypotheses
h1 = package_hypothesis(["?", "?", "normal", "?", "?"], "yes")
h2 = package_hypothesis(["sunny", "high", "?", "?", "?"], "yes")
h3 = package_hypothesis(["rainy", "?", "ok", "?", "?"], "no")
h4 = package_hypothesis(["rainy", "warm", "high", "?", "?"], "yes")
h5 = package_hypothesis(["?", "cold", "?", "cool", "?"], "no")
h6 = package_hypothesis(["?", "?", "?", "cool", "?"], "yes")

def compare(values, hypo):
    # True if the hypothesis covers the instance ("?" matches anything)
    for i in range(len(values)):
        if hypo[i] != "?":
            if values[i] != hypo[i]:
                return False
    return True

def list_then_eliminate(data, *hypothesis):
    consistent_space = []
    inconsistent_space = []

    for hyp in hypothesis:
        state = True
        for i in range(data.shape[0]):
            if hyp['outcome'] == data.iloc[i, -1]:
                if not compare(hypo=hyp['hypothesis'],
                               values=list(data.iloc[i, :-1])[:-1]):
                    inconsistent_space.append(hyp)
                    state = False
                    break
        if state:
            consistent_space.append(hyp)
    return (inconsistent_space, consistent_space)

print(list_then_eliminate(data, h1, h2, h3, h4, h5, h6))
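
A quick in-memory check of compare, needing no CSV, which illustrates the "?" wildcard semantics the elimination step relies on (the sample instance is made up):

sample = ["sunny", "warm", "normal", "cool", "strong"]
print(compare(values=sample, hypo=["?", "?", "normal", "?", "?"]))  # True: only position 2 is constrained
print(compare(values=sample, hypo=["rainy", "?", "?", "?", "?"]))   # False: position 0 disagrees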
Program 2: Candidate Elimination

import numpy as np
import pandas as pd

# Loading data from a CSV file
data = pd.DataFrame(data=pd.read_csv('2ce.csv'))

# Separating concept features from the target
concepts = data.iloc[:, 0:-1].values

# Isolating the target into a separate array
target = data.iloc[:, -1].values

def learn(concepts, target):
    specific_h = concepts[0].copy()
    general_h = [["?" for i in range(len(specific_h))]
                 for i in range(len(specific_h))]

    # The learning iterations
    for i, h in enumerate(concepts):

        # Positive example: generalise S, reset the matching entries of G
        if target[i] == "Yes":
            for x in range(len(specific_h)):
                # Change values in S & G only if values change
                if h[x] != specific_h[x]:
                    specific_h[x] = '?'
                    general_h[x][x] = '?'

        # Negative example: specialise G, leave S unchanged
        if target[i] == "No":
            for x in range(len(specific_h)):
                print(f"specific={specific_h[x]}")
                # For a negative example change values only in G
                if h[x] != specific_h[x]:
                    general_h[x][x] = specific_h[x]
                else:
                    general_h[x][x] = '?'

    # Find the rows of G that were never specialised, i.e. are all '?'
    # (generalised from the original's hardcoded six-attribute row)
    empty_row = ['?'] * len(specific_h)
    indices = [i for i, val in enumerate(general_h) if val == empty_row]
    for i in indices:
        # Remove those rows from general_h
        general_h.remove(empty_row)

    # Return final values
    return specific_h, general_h

s_final, g_final = learn(concepts, target)
print("Final S:", s_final, sep="\n")
print("Final G:", g_final, sep="\n")

Program 4: ID3 decision tree

import math
import csv

def load_csv(filename):
    lines = csv.reader(open(filename, "r"))
    dataset = list(lines)
    headers = dataset.pop(0)
    return dataset, headers

class Node:
    def __init__(self, attribute):
        self.attribute = attribute
        self.children = []
        self.answer = ""

def subtables(data, col, delete):
    dic = {}
    coldata = [row[col] for row in data]
    attr = list(set(coldata))

    counts = [0] * len(attr)
    r = len(data)
    c = len(data[0])
    for x in range(len(attr)):
        for y in range(r):
            if data[y][col] == attr[x]:
                counts[x] += 1

    for x in range(len(attr)):
        dic[attr[x]] = [[0 for i in range(c)] for j in range(counts[x])]
        pos = 0
        for y in range(r):
            if data[y][col] == attr[x]:
                if delete:
                    del data[y][col]
                dic[attr[x]][pos] = data[y]
                pos += 1
    return attr, dic

def entropy(S):
    attr = list(set(S))
    if len(attr) == 1:
        return 0

    # Note: assumes a binary class label
    counts = [0, 0]
    for i in range(2):
        counts[i] = sum([1 for x in S if attr[i] == x]) / (len(S) * 1.0)

    sums = 0
    for cnt in counts:
        sums += -1 * cnt * math.log(cnt, 2)
    return sums

def compute_gain(data, col):
    attr, dic = subtables(data, col, delete=False)

    total_size = len(data)
    entropies = [0] * len(attr)
    ratio = [0] * len(attr)

    total_entropy = entropy([row[-1] for row in data])

    for x in range(len(attr)):
        ratio[x] = len(dic[attr[x]]) / (total_size * 1.0)
        entropies[x] = entropy([row[-1] for row in dic[attr[x]]])
        total_entropy -= ratio[x] * entropies[x]
    return total_entropy

def build_tree(data, features):
    lastcol = [row[-1] for row in data]
    if len(set(lastcol)) == 1:
        node = Node("")
        node.answer = lastcol[0]
        return node

    n = len(data[0]) - 1
    gains = [0] * n
    for col in range(n):
        gains[col] = compute_gain(data, col)
    split = gains.index(max(gains))
    node = Node(features[split])
    fea = features[:split] + features[split+1:]

    attr, dic = subtables(data, split, delete=True)

    for x in range(len(attr)):
        child = build_tree(dic[attr[x]], fea)
        node.children.append((attr[x], child))
    return node

def print_tree(node, level):
    if node.answer != "":
        print(" " * level, node.answer)
        return

    print(" " * level, node.attribute)
    for value, n in node.children:
        print(" " * (level + 1), value)
        print_tree(n, level + 2)

def classify(node, x_test, features):
    if node.answer != "":
        print(node.answer)
        return
    pos = features.index(node.attribute)
    for value, n in node.children:
        if x_test[pos] == value:
            classify(n, x_test, features)

'''Main program'''
dataset, features = load_csv("4_id3.csv")
node1 = build_tree(dataset, features)

print("The decision tree for the dataset using ID3 algorithm is")
print_tree(node1, 0)
testdata, features = load_csv("4_id3_test_1.csv")

for xtest in testdata:
    print("The test instance:", xtest)
    print("The label for test instance:", end=" ")
    classify(node1, xtest, features)
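
A small standalone check of the entropy function against a hand computation; a set with two positives and one negative should give -(2/3)*log2(2/3) - (1/3)*log2(1/3), about 0.918 bits:

print(entropy(["Yes", "Yes", "No"]))   # ~0.9183
print(entropy(["Yes", "Yes", "Yes"]))  # 0: a pure set carries no uncertainty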

Program 5: Random Forest classification

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

df = pd.read_csv('5user_data.csv')
print(df)

X = df.iloc[:, [2, 3]].values
y = df.iloc[:, 4].values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
model.fit(X_train, y_train)

y_prediction = model.predict(X_test)

from sklearn import metrics
print('Accuracy metrics')
print('Accuracy of the Random Forest is', metrics.accuracy_score(y_test, y_prediction))
print('Confusion matrix')
print(metrics.confusion_matrix(y_test, y_prediction))
print('Recall and Precision')
print(metrics.recall_score(y_test, y_prediction, average='weighted'))
print(metrics.precision_score(y_test, y_prediction, average='weighted'))

from matplotlib.colors import ListedColormap

# Decision regions on the training set
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
    np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01))
plt.contourf(X1, X2,
             model.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha=0.75, cmap=ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c=ListedColormap(('red', 'green'))(i), label=j)
plt.title('Random Forest Classification (Training set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()

# Decision regions on the test set
X_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
    np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01))
plt.contourf(X1, X2,
             model.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha=0.75, cmap=ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c=ListedColormap(('red', 'green'))(i), label=j)
plt.title('Random Forest Classification (Test set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()
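
The column indices above assume '5user_data.csv' follows the familiar social-network-ads layout: User ID, Gender, Age, EstimatedSalary, Purchased, so columns 2 and 3 are the features and column 4 the label. A tiny synthetic stand-in for smoke-testing (all values invented):

import pandas as pd

pd.DataFrame({'User ID': [1, 2, 3, 4],
              'Gender': ['Male', 'Female', 'Male', 'Female'],
              'Age': [19, 35, 26, 47],
              'EstimatedSalary': [19000, 20000, 43000, 150000],
              'Purchased': [0, 0, 0, 1]}).to_csv('5user_data.csv', index=False)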

Program 3a: Remove columns with a single unique value

import pandas as pd

# load the dataset
data = pd.read_csv('oil-spill.csv', header=None)
# summarize the number of unique values in each column
print(data.nunique())
print(data.shape)
# record columns to delete: those holding a single unique value
counts = data.nunique()
to_del = [i for i, v in enumerate(counts) if v == 1]
print(to_del)
# drop those uninformative columns
data.drop(to_del, axis=1, inplace=True)
print(data.shape)
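
The same idea on a throwaway frame, for anyone without 'oil-spill.csv' to hand: a column whose nunique() is 1 carries no information and can be dropped:

import pandas as pd

demo = pd.DataFrame({'a': [1, 2, 3], 'b': [7, 7, 7]})
print(demo.nunique())   # a: 3, b: 1
demo.drop([c for c in demo if demo[c].nunique() == 1], axis=1, inplace=True)
print(demo.shape)       # (3, 1)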
__________________________________________________________________
3b: Remove duplicate rows

import pandas as pd

# load the dataset
data = pd.read_csv('iris.csv', header=None)
# calculate duplicates
dups = data.duplicated()
# report whether there are any duplicates
print(dups.any())
# list all duplicate rows
print(data[dups])

# delete rows of duplicate data from the dataset
print(data.shape)
data.drop_duplicates(inplace=True)
print(data.shape)
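
And the duplicate-row check on an in-memory frame:

import pandas as pd

demo = pd.DataFrame([[1, 2], [1, 2], [3, 4]])
print(demo.duplicated().any())        # True: row 1 repeats row 0
print(demo.drop_duplicates().shape)   # (2, 2)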
PROGRAM 6: Implement the naïve Bayesian classifier for a sample training data set stored as a
.CSV file. Compute the accuracy of the classifier, considering a few test data sets.

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

df = pd.read_csv("pima_indian.csv")
feature_col_names = ['num_preg', 'glucose_conc', 'diastolic_bp', 'thickness',
                     'insulin', 'bmi', 'diab_pred', 'age']
predicted_class_names = ['diabetes']

X = df[feature_col_names].values        # these are the factors for the prediction
y = df[predicted_class_names].values    # this is what we want to predict

xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.33)

print('\n The total number of Training Data:', ytrain.shape)
print('\n The total number of Test Data:', ytest.shape)

clf = GaussianNB().fit(xtrain, ytrain.ravel())
predicted = clf.predict(xtest)
predictTestData = clf.predict([[6, 148, 72, 35, 0, 33.6, 0.627, 50]])

print('\n Confusion matrix')
print(metrics.confusion_matrix(ytest, predicted))
print('\n Accuracy of the classifier is', metrics.accuracy_score(ytest, predicted))
print('\n The value of Precision', metrics.precision_score(ytest, predicted))
print('\n The value of Recall', metrics.recall_score(ytest, predicted))
print("Predicted Value for individual Test Data:", predictTestData)

PROGRAM 7: Assuming a set of documents that need to be classified, use the naïve Bayesian
classifier model to perform this task. Built-in Java classes/API can be used to write the
program. Calculate the accuracy, precision, and recall for your data set.

import pandas as pd
from sklearn import metrics

msg = pd.read_csv('naivetext.csv', names=['message', 'label'])
print('The dimensions of the dataset', msg.shape)

msg['labelnum'] = msg.label.map({'pos': 1, 'neg': 0})
X = msg.message
y = msg.labelnum
print(X)
print(y)

from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X, y)

print('\n The total number of Training Data:', ytrain.shape)
print('\n The total number of Test Data:', ytest.shape)

from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
xtrain_dtm = count_vect.fit_transform(xtrain)
xtest_dtm = count_vect.transform(xtest)
print('\n The words or Tokens in the text documents \n')
print(count_vect.get_feature_names_out())   # get_feature_names() in older scikit-learn

df = pd.DataFrame(xtrain_dtm.toarray(), columns=count_vect.get_feature_names_out())

from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(xtrain_dtm, ytrain)
predicted = clf.predict(xtest_dtm)

print('\n Accuracy of the classifier is', metrics.accuracy_score(ytest, predicted))
print('\n Confusion matrix')
print(metrics.confusion_matrix(ytest, predicted))
print('\n The value of Precision', metrics.precision_score(ytest, predicted))
print('\n The value of Recall', metrics.recall_score(ytest, predicted))
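
'naivetext.csv' is read with no header, so each line is assumed to be a raw message followed by a pos/neg tag. A hedged sketch of that format (example sentences invented in the style of the classic text-classification exercise):

import csv

examples = [("I love this sandwich", "pos"),
            ("This is an amazing place", "pos"),
            ("I do not like this restaurant", "neg"),
            ("I am tired of this stuff", "neg")]
with open("naivetext.csv", "w", newline="") as f:
    csv.writer(f).writerows(examples)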
PROGRAM 8: Write a program to construct a Bayesian network considering medical data. Use this
model to demonstrate the diagnosis of heart patients using the standard Heart Disease Data Set.

import numpy as np
import pandas as pd
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.models import BayesianModel   # called BayesianNetwork in newer pgmpy
from pgmpy.inference import VariableElimination

data = pd.read_csv("heartdisease.csv",
                   names=['age', 'Gender', 'Family', 'diet', 'Lifestyle',
                          'cholestrol', 'heartdisease'])
heartDisease = pd.DataFrame(data)

from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
for col in heartDisease.columns:
    heartDisease[col] = lb.fit_transform(heartDisease[col])

print('Sample instances from the dataset are given below')
print(heartDisease.head())

print('\n Attributes and datatypes')
print(heartDisease.dtypes)

model = BayesianModel([('age', 'heartdisease'), ('Gender', 'heartdisease'),
                       ('Family', 'heartdisease'), ('diet', 'cholestrol'),
                       ('Lifestyle', 'diet'), ('heartdisease', 'cholestrol')])
print('\nLearning CPD using Maximum likelihood estimators')
model.fit(heartDisease, estimator=MaximumLikelihoodEstimator)

print('\n Inferencing with Bayesian Network:')
HeartDiseasetest_infer = VariableElimination(model)

print('For age Enter { SuperSeniorCitizen:0, SeniorCitizen:1, MiddleAged:2, Youth:3, Teen:4 }')
print('For Gender Enter { Male:0, Female:1 }')
print('For Family History Enter { yes:1, No:0 }')
print('For diet Enter { High:0, Medium:1 }')
print('For lifeStyle Enter { Athlete:0, Active:1, Moderate:2, Sedentary:3 }')
print('For cholesterol Enter { High:0, BorderLine:1, Normal:2 }')

print('\n 1. Probability of HeartDisease given evidence = age')
q1 = HeartDiseasetest_infer.query(variables=['heartdisease'],
                                  evidence={'age': int(input("Enter age: "))})
print(q1)

print('\n 2. Probability of HeartDisease given evidence = cholestrol')
q2 = HeartDiseasetest_infer.query(variables=['heartdisease'],
                                  evidence={'cholestrol': int(input("Enter cholestrol: "))})
print(q2)
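
Since names= is passed to read_csv, 'heartdisease.csv' is assumed to be header-less with seven integer-coded columns in the order listed above. An illustrative pair of rows, with codings following the prompts the program prints (values are invented and far too few for meaningful CPDs; they only show the expected shape):

import csv

rows = [[2, 0, 1, 0, 2, 0, 1],   # MiddleAged male, family history, high diet -> heartdisease = 1
        [3, 1, 0, 1, 1, 2, 0]]   # Youth female, no family history -> heartdisease = 0
with open("heartdisease.csv", "w", newline="") as f:
    csv.writer(f).writerows(rows)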
PROGRAM 9: Demonstrate the working of the EM algorithm to cluster a set of data stored in a .CSV file.

import matplotlib.pyplot as plt
from sklearn import datasets
import sklearn.metrics as sm
import pandas as pd
import numpy as np

iris = datasets.load_iris()

X = pd.DataFrame(iris.data)
X.columns = ['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width']
y = pd.DataFrame(iris.target)
y.columns = ['Targets']

plt.figure(figsize=(14, 7))
colormap = np.array(['red', 'lime', 'black'])

# Plot the true classes on the sepal and petal features
plt.subplot(1, 2, 1)
plt.scatter(X.Sepal_Length, X.Sepal_Width, c=colormap[y.Targets], s=40)
plt.title('Sepal')

plt.subplot(1, 2, 2)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[y.Targets], s=40)
plt.title('Petal')

from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
scaler.fit(X)
xsa = scaler.transform(X)
xs = pd.DataFrame(xsa, columns=X.columns)
print(xs.sample(5))

# GaussianMixture fits the mixture model with the EM algorithm
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=3)
gmm.fit(xs)

y_cluster_gmm = gmm.predict(xs)
print(y_cluster_gmm)

plt.subplot(1, 2, 1)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[y_cluster_gmm], s=40)
plt.title('GMM Classification')

print('Accuracy Score')
print(sm.accuracy_score(y, y_cluster_gmm))
print('Confusion Matrix GMM')
print(sm.confusion_matrix(y, y_cluster_gmm))
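
One caveat worth making explicit: GMM cluster indices are arbitrary, so comparing them directly to the class labels can understate accuracy. A common fix, sketched here with scipy's Hungarian-algorithm solver, is to remap each cluster to its best-matching class before scoring:

from scipy.optimize import linear_sum_assignment

cm = sm.confusion_matrix(y, y_cluster_gmm)
row_ind, col_ind = linear_sum_assignment(-cm)   # negate to maximise matched counts
mapping = {cluster: label for label, cluster in zip(row_ind, col_ind)}
y_mapped = np.array([mapping[c] for c in y_cluster_gmm])
print('Accuracy after relabelling:', sm.accuracy_score(y, y_mapped))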
PROGRAM 10: Demonstrate the working of the SVM classifier for a suitable data set.

import pandas as pd
dataset = pd.read_csv('iris.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

from sklearn.svm import SVC
classifier = SVC(kernel='linear', random_state=0)
classifier.fit(X_train, y_train)

from sklearn import metrics
y_pred = classifier.predict(X_test)
print('Accuracy metrics')
print('Accuracy of the SVM is', metrics.accuracy_score(y_test, y_pred))
print('Confusion matrix')
print(metrics.confusion_matrix(y_test, y_pred))
print('Recall and Precision')
print(metrics.recall_score(y_test, y_pred, average='weighted'))
print(metrics.precision_score(y_test, y_pred, average='weighted'))
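
To see how the choice of decision boundary matters, the kernel can be swapped and evaluated with the same code, e.g. an RBF kernel:

classifier_rbf = SVC(kernel='rbf', gamma='scale', random_state=0)
classifier_rbf.fit(X_train, y_train)
y_pred_rbf = classifier_rbf.predict(X_test)
print('Accuracy of the RBF-kernel SVM is', metrics.accuracy_score(y_test, y_pred_rbf))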
