ML Programs 1-5 (Python)

Program 1: Find-S algorithm
import numpy as np
import pandas as pd
data = pd.read_csv('1finds.csv')
print(data)
concepts=data.iloc[:,0:-1].values
print("-------------------------------------------")
print (concepts)
print("----------------------------------------")
target = data.iloc[:,-1].values
print (target)
def train(concepts, target):
    count = 0
    # copy() so the first training example is not mutated in place
    specific_h = concepts[0].copy()
    for i, h in enumerate(concepts):
        print(i)
        print(h)
        if target[i] == "Yes":
            # generalize any attribute that disagrees with this positive example
            for x in range(len(specific_h)):
                if h[x] != specific_h[x]:
                    specific_h[x] = "?"
            count = count + 1
            print(f"Hypothesis after sample number {count} processed: {specific_h}")
        else:
            count = count + 1
            print(f"Negative sample number {count}, same hypothesis: {specific_h}")
    return specific_h
specific_h=train(concepts,target)
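For a quick sanity check, the trainer can also be run on an inline copy of the classic EnjoySport examples (an assumption about what 1finds.csv contains; the expected result is noted in the comment):

demo_concepts = np.array([
    ["Sunny", "Warm", "Normal", "Strong", "Warm", "Same"],
    ["Sunny", "Warm", "High", "Strong", "Warm", "Same"],
    ["Rainy", "Cold", "High", "Strong", "Warm", "Change"],
    ["Sunny", "Warm", "High", "Strong", "Cool", "Change"]])
demo_target = np.array(["Yes", "Yes", "No", "Yes"])
print(train(demo_concepts, demo_target))
# Expected: ['Sunny' 'Warm' '?' 'Strong' '?' '?']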
Program 2: List-Then-Eliminate
import pandas as pd
data = pd.read_csv("weather_dataset.csv")
print(data.head(10))
print(data)
def package_hypothesis(hypothesis, outcome):
    ln = dict()
    ln['hypothesis'] = hypothesis
    ln['outcome'] = outcome
    return ln
# Test hypotheses
h1 = package_hypothesis(["?","?","normal","?","?"],"yes")
h2 = package_hypothesis(["sunny","high","?","?","?"],"yes")
h3 = package_hypothesis(["rainy","?","ok","?","?"],"no")
h4 = package_hypothesis(["rainy","warm","high","?","?"],"yes")
h5 = package_hypothesis(["?","cold","?","cool","?"],"no")
h6 = package_hypothesis(["?","?","?","cool","?"],"yes")
def compare(values, hypo):
    for i in range(len(values)):
        if hypo[i] != "?":
            if values[i] != hypo[i]:
                return False
    return True
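The listing stops before actually eliminating anything. A minimal elimination pass over the dataset follows (a sketch; it assumes the last column of weather_dataset.csv is the outcome and the remaining columns line up with the hypothesis attributes):

version_space = [h1, h2, h3, h4, h5, h6]
for _, row in data.iterrows():
    values, outcome = list(row.iloc[:-1]), row.iloc[-1]
    # eliminate any hypothesis that matches the example but
    # predicts the wrong outcome
    version_space = [h for h in version_space
                     if not compare(values, h['hypothesis'])
                     or h['outcome'] == outcome]
print("Hypotheses consistent with the data:", version_space)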
Program 4: ID3 decision tree
import math
import csv
def load_csv(filename):
    lines = csv.reader(open(filename, "r"))
    dataset = list(lines)
    headers = dataset.pop(0)
    return dataset, headers
class Node:
    def __init__(self, attribute):
        self.attribute = attribute
        self.children = []
        self.answer = ""
def subtables(data, col, delete):
    dic = {}
    coldata = [row[col] for row in data]
    attr = list(set(coldata))
    counts = [0] * len(attr)
    r = len(data)
    c = len(data[0])
    for x in range(len(attr)):
        for y in range(r):
            if data[y][col] == attr[x]:
                counts[x] += 1
    for x in range(len(attr)):
        dic[attr[x]] = [[0 for i in range(c)] for j in range(counts[x])]
        pos = 0
        for y in range(r):
            if data[y][col] == attr[x]:
                if delete:
                    del data[y][col]
                dic[attr[x]][pos] = data[y]
                pos += 1
    return attr, dic
def entropy(S):
    attr = list(set(S))
    if len(attr) == 1:
        return 0
    # assumes a binary class column, as in the lab dataset
    counts = [0, 0]
    for i in range(2):
        counts[i] = sum([1 for x in S if attr[i] == x]) / (len(S) * 1.0)
    sums = 0
    for cnt in counts:
        sums += -1 * cnt * math.log(cnt, 2)
    return sums
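# Quick check of entropy() (a sketch): for the classic 9-positive /
# 5-negative split, -(9/14)*log2(9/14) - (5/14)*log2(5/14) ≈ 0.940
# print(entropy(["yes"] * 9 + ["no"] * 5))   # -> 0.9402859586706309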
def compute_gain(data, col):
    attr, dic = subtables(data, col, delete=False)
    total_size = len(data)
    entropies = [0] * len(attr)
    ratio = [0] * len(attr)
    # information gain = entropy of the class column minus the
    # weighted entropy of each subtable
    total_entropy = entropy([row[-1] for row in data])
    for x in range(len(attr)):
        ratio[x] = len(dic[attr[x]]) / (total_size * 1.0)
        entropies[x] = entropy([row[-1] for row in dic[attr[x]]])
        total_entropy -= ratio[x] * entropies[x]
    return total_entropy
def build_tree(data, features):
    lastcol = [row[-1] for row in data]
    if len(set(lastcol)) == 1:
        node = Node("")
        node.answer = lastcol[0]
        return node
    n = len(data[0]) - 1
    gains = [0] * n
    for col in range(n):
        gains[col] = compute_gain(data, col)
    split = gains.index(max(gains))
    node = Node(features[split])
    fea = features[:split] + features[split + 1:]
    attr, dic = subtables(data, split, delete=True)
    for x in range(len(attr)):
        child = build_tree(dic[attr[x]], fea)
        node.children.append((attr[x], child))
    return node
def print_tree(node, level):
    if node.answer != "":
        print(" " * level, node.answer)
        return
    print(" " * level, node.attribute)
    for value, n in node.children:
        print(" " * (level + 1), value)
        print_tree(n, level + 2)
def classify(node, x_test, features):
    if node.answer != "":
        print(node.answer)
        return
    pos = features.index(node.attribute)
    for value, n in node.children:
        if x_test[pos] == value:
            classify(n, x_test, features)
'''Main program'''
dataset,features=load_csv("4_id3.csv")
node1=build_tree(dataset,features)
print("The decision tree for the dataset using ID3 algorithm is")
print_tree(node1,0)
testdata, features = load_csv("4_id3_test_1.csv")
for xtest in testdata:
    print("The test instance:", xtest)
    print("The label for the test instance:", end=" ")
    classify(node1, xtest, features)
Program 5:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
df = pd.read_csv('5user_data.csv')
print(df)
# The listing used an undefined 'model'; a minimal completion, assuming
# the last column of 5user_data.csv is the label:
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_prediction = model.predict(X_test)
print(y_prediction)
Program 3a: Remove columns with a single unique value
import pandas as pd
# load the dataset
data = pd.read_csv('oil-spill.csv', header=None)
# summarize the number of unique values in each column
print(data.nunique())
# delete columns with a single unique value
print(data.shape)
# get number of unique values for each column
counts = data.nunique()
# record columns to delete
to_del = [i for i,v in enumerate(counts) if v == 1]
print(to_del)
# drop useless columns
data.drop(to_del, axis=1, inplace=True)
print(data.shape)
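For numeric data like this, the same zero-variance columns can also be dropped with scikit-learn's VarianceThreshold (an alternative sketch, not part of the original listing):

from sklearn.feature_selection import VarianceThreshold
# threshold=0 keeps only features with more than one observed value
selector = VarianceThreshold(threshold=0)
reduced = selector.fit_transform(pd.read_csv('oil-spill.csv', header=None))
print(reduced.shape)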
__________________________________________________________________
Program 3b: Remove duplicate rows
# load the dataset
data = pd.read_csv('iris.csv', header=None)
# calculate duplicates
dups = data.duplicated()
# report if there are any duplicates
print(dups.any())
# list all duplicate rows
print(data[dups])
print(data.shape)
# delete duplicate rows
data.drop_duplicates(inplace=True)
print(data.shape)
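By default pandas keeps the first occurrence of each duplicated row; the keep parameter changes that (a usage note):

# keep='last' retains the final occurrence; keep=False drops every copy
# data.drop_duplicates(keep='last', inplace=True)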
PROGRAM 6: Implement the naïve Bayesian classifier for a sample training data set stored as a .CSV file. Compute the accuracy of the classifier, considering a few test data sets.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
df = pd.read_csv("pima_indian.csv")
feature_col_names = ['num_preg', 'glucose_conc', 'diastolic_bp', 'thickness', 'insulin',
'bmi', 'diab_pred', 'age']
predicted_class_names = ['diabetes']
xtrain,xtest,ytrain,ytest=train_test_split(X,y,test_size=0.33)
clf = GaussianNB().fit(xtrain,ytrain.ravel())
predicted = clf.predict(xtest)
predictTestData= clf.predict([[6,148,72,35,0,33.6,0.627,50]])
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
msg = pd.read_csv('naivetext.csv', names=['message', 'label'])
msg['labelnum'] = msg.label.map({'pos': 1, 'neg': 0})
X = msg.message
y = msg.labelnum
print(X)
print(y)
# xtrain_dtm and count_vect were used below without being defined
xtrain, xtest, ytrain, ytest = train_test_split(X, y)
count_vect = CountVectorizer()
xtrain_dtm = count_vect.fit_transform(xtrain)
df = pd.DataFrame(xtrain_dtm.toarray(),
                  columns=count_vect.get_feature_names_out())
print(df)
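The listing ends at the document-term matrix; to actually classify and score, a naive Bayes fit can follow (a sketch continuing the variables above):

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
clf = MultinomialNB().fit(xtrain_dtm, ytrain)
predicted = clf.predict(count_vect.transform(xtest))
print('Accuracy:', metrics.accuracy_score(ytest, predicted))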
Bayesian network (heart disease):
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.models import BayesianModel
from pgmpy.inference import VariableElimination
heartDisease = pd.read_csv("heartdisease.csv",
    names=['age','Gender','Family','diet','Lifestyle','cholestrol','heartdisease'])
print(heartDisease.columns)
# encode the categorical columns as integers
lb = LabelEncoder()
for col in heartDisease.columns:
    heartDisease[col] = lb.fit_transform(heartDisease[col])
model = BayesianModel([('age','heartdisease'),('Gender','heartdisease'),
    ('Family','heartdisease'),('diet','cholestrol'),('Lifestyle','diet'),
    ('heartdisease','cholestrol')])
print('\nLearning CPD using Maximum likelihood estimators')
model.fit(heartDisease, estimator=MaximumLikelihoodEstimator)
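The imports include VariableElimination but the listing never queries the model; a minimal inference step follows (the evidence value 1 is an assumption about the label-encoded diet column):

print('\nInferencing with Bayesian Network:')
infer = VariableElimination(model)
q = infer.query(variables=['heartdisease'], evidence={'diet': 1})
print(q)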
EM clustering with Gaussian mixtures (Iris):
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets, preprocessing
import sklearn.metrics as sm
iris = datasets.load_iris()
X = pd.DataFrame(iris.data)
X.columns = ['Sepal_Length','Sepal_Width','Petal_Length','Petal_Width']
y = pd.DataFrame(iris.target)
y.columns = ['Targets']
# one plotting colour per iris class
colormap = np.array(['red', 'lime', 'black'])
plt.figure(figsize=(14,7))
plt.subplot(1, 2, 1)
plt.scatter(X.Sepal_Length, X.Sepal_Width, c=colormap[y.Targets], s=40)
plt.title('Sepal')
plt.subplot(1, 2, 2)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[y.Targets], s=40)
plt.title('Petal')
plt.show()
scaler = preprocessing.StandardScaler()
scaler.fit(X)
xsa = scaler.transform(X)
xs = pd.DataFrame(xsa, columns=X.columns)
print(xs.sample(5))
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=3)
gmm.fit(xs)
y_cluster_gmm = gmm.predict(xs)
print(y_cluster_gmm)
plt.figure(figsize=(7,7))
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[y_cluster_gmm], s=40)
plt.title('GMM Classification')
plt.show()
print('Accuracy Score')
print(sm.accuracy_score(y, y_cluster_gmm))
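GMM cluster ids are arbitrary, so this raw accuracy can be low even for a good clustering; a sketch that relabels each cluster by its majority true class before scoring:

y_true = y.Targets.values
y_mapped = np.zeros_like(y_cluster_gmm)
for k in range(3):
    mask = (y_cluster_gmm == k)
    if mask.any():
        # assign the cluster the most common true label inside it
        y_mapped[mask] = np.bincount(y_true[mask]).argmax()
print('Accuracy after relabelling:', sm.accuracy_score(y_true, y_mapped))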
SVM classification (Iris):
import pandas as pd
dataset = pd.read_csv('iris.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(X_train, y_train)
# The notebook echoes the fitted estimator, e.g.:
# SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
#     decision_function_shape='ovr', degree=3, kernel='linear',
#     max_iter=-1, probability=False, random_state=0,
#     shrinking=True, tol=0.001, verbose=False)
from sklearn import metrics
y_pred = classifier.predict(X_test)
print('Accuracy metrics')
print('Accuracy of the SVM is', metrics.accuracy_score(y_test, y_pred))
print('Confusion matrix')
print(metrics.confusion_matrix(y_test, y_pred))
print('Recall and Precision')
print(metrics.recall_score(y_test, y_pred, average='weighted'))
print(metrics.precision_score(y_test, y_pred, average='weighted'))
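A quick usage check on one new measurement (the numbers are illustrative, not from the dataset):

# remember to scale new samples with the same fitted scaler
sample = sc.transform([[5.1, 3.5, 1.4, 0.2]])
print('Predicted class:', classifier.predict(sample))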