From 9fc5ae344d6efc9a8582ad130dedb5a4e1dda13e Mon Sep 17 00:00:00 2001 From: richard lyman Date: Sun, 7 Feb 2021 14:45:13 -0800 Subject: [PATCH 1/2] Change to TenorFlow 2 from version 1 Adjust to changes in sklearn, theano, keras --- n0_network.py | 4 +- n1_2cnv1fc.py | 7 ++- n1_2cnv2fc.py | 6 ++- n1_baseTensorNN.py | 4 +- n1_image_to_image.py | 5 +- n1_residual3x4.py | 6 +-- o1_top_secret_cnn.py | 12 +++-- o2_top_secret_lda-tesseract.py | 4 +- o3_top_secret_python_box.py | 2 +- o4_image_to_image.py | 2 +- ocr_utils.py | 11 +++-- p115_l1_l2_regularization.py | 6 +-- p119_squential_backward_selection.py | 2 +- p131_principal_component_analysis.py | 6 +-- p141_linear_descriminant_analsys.py | 13 ++--- p154_pca_nonlinear_mapings.py | 5 +- p177_k_fold_cross_validation.py | 47 +++++++++--------- p181_learning_curves.py | 6 +-- p186_grid_search.py | 2 +- p189_nested_cross_validation.py | 4 +- p193_model_precision_recall.py | 4 +- p194_receiver_operating_characteristic.py | 58 +++++++++++++++-------- p206_majority_vote_classifier.py | 23 +++++---- p221_bagging_bootstrap_samples.py | 2 +- p229_adaboost.py | 2 +- p411_keras.py | 11 ++--- p51_standard_scalar.py | 5 +- p62_logistic_regression.py | 8 ++-- p73_support_vector_machine.py | 1 + q2_tensorflow_mnist.py | 3 +- q3_removing_affine_distortion.py | 4 +- q4_Theano_mlp.py | 1 + q6_tensorflow_residual3x4.py | 2 +- q8_tika.py | 1 + q9_tensorflow_gpu_test.py | 19 +++++--- 35 files changed, 171 insertions(+), 127 deletions(-) diff --git a/n0_network.py b/n0_network.py index f3aa381..c37a1bd 100644 --- a/n0_network.py +++ b/n0_network.py @@ -1,4 +1,6 @@ -import tensorflow as tf +#import tensorflow as tf +from tensorflow.compat import v1 as tf +tf.compat.v1.disable_eager_execution() import numpy as np from collections import namedtuple import datetime diff --git a/n1_2cnv1fc.py b/n1_2cnv1fc.py index 2b59ac2..845b6eb 100644 --- a/n1_2cnv1fc.py +++ b/n1_2cnv1fc.py @@ -1,4 +1,6 @@ -import tensorflow as tf +#import tensorflow as tf +from tensorflow.compat import v1 as tf +tf.compat.v1.disable_eager_execution() import numpy as np from collections import namedtuple import datetime @@ -43,6 +45,7 @@ def __init__(self, truthed_features, dtype=np.float32): nm = 'x_'+nm if i>1: extra_features_width += truthed_features.feature_width[i] + lst.append(tf.placeholder(dtype, shape=[None, truthed_features.feature_width[i]], name=nm)) # ph is a named tuple with key names like 'image', 'm_label', and values that @@ -218,7 +221,7 @@ def computeSize(s,tens): tShape = tens.get_shape() nDims = len(tShape) for i in range(nDims): - sumC *= tShape[i].value + sumC *= tShape[i] print ('\t{}\t{}'.format(s,sumC),flush=True) return sumC diff --git a/n1_2cnv2fc.py b/n1_2cnv2fc.py index 8865bb4..4e32471 100644 --- a/n1_2cnv2fc.py +++ b/n1_2cnv2fc.py @@ -1,4 +1,6 @@ -import tensorflow as tf +#import tensorflow as tf +from tensorflow.compat import v1 as tf +tf.compat.v1.disable_eager_execution() import numpy as np from collections import namedtuple import datetime @@ -250,7 +252,7 @@ def computeSize(s,tens): tShape = tens.get_shape() nDims = len(tShape) for i in range(nDims): - sumC *= tShape[i].value + sumC *= tShape[i] print ('\t{}\t{}'.format(s,sumC),flush=True) return sumC diff --git a/n1_baseTensorNN.py b/n1_baseTensorNN.py index 7870c5e..22a90e6 100644 --- a/n1_baseTensorNN.py +++ b/n1_baseTensorNN.py @@ -1,4 +1,6 @@ -import tensorflow as tf +from tensorflow.compat import v1 as tf +tf.compat.v1.disable_eager_execution() +#import tf import numpy as np from collections import namedtuple import datetime diff --git a/n1_image_to_image.py b/n1_image_to_image.py index e6b86c4..f391548 100644 --- a/n1_image_to_image.py +++ b/n1_image_to_image.py @@ -1,4 +1,5 @@ -import tensorflow as tf +from tensorflow.compat import v1 as tf +#import tf import numpy as np from collections import namedtuple import datetime @@ -258,7 +259,7 @@ def computeSize(s,tens): tShape = tens.get_shape() nDims = len(tShape) for i in range(nDims): - sumC *= tShape[i].value + sumC *= tShape[i] print ('\t{}\t{}'.format(s,sumC),flush=True) return sumC diff --git a/n1_residual3x4.py b/n1_residual3x4.py index 12569f3..3bf1cc6 100644 --- a/n1_residual3x4.py +++ b/n1_residual3x4.py @@ -7,7 +7,7 @@ ''' -import tensorflow as tf +from tensorflow.compat import v1 as tf import numpy as np from collections import namedtuple import datetime @@ -91,7 +91,7 @@ def bias_variable(shape, dtype): return tf.Variable(initial) def shapeOuts(n): - print ('n={}, hin={},w={}, b={} ,hout={}\n'.format(n, h[n]._shape, w[n]._variable._shape, b[n]._variable._shape, h[n+1]._shape)) + print ('n={}, hin={},w={}, b={} ,hout={}\n'.format(n, h[n].shape, w[n].shape, b[n].shape, h[n+1]._shape)) def section(n): with tf.name_scope('section_'+str(n)+'_0') as scope: @@ -122,7 +122,7 @@ def computeSize(s,tens): tShape = tens.get_shape() nDims = len(tShape) for i in range(nDims): - sumC *= tShape[i].value + sumC *= tShape[i] print ('\t{}\t{}'.format(s,sumC),flush=True) return sumC diff --git a/o1_top_secret_cnn.py b/o1_top_secret_cnn.py index 86e931a..afe5e5a 100644 --- a/o1_top_secret_cnn.py +++ b/o1_top_secret_cnn.py @@ -44,6 +44,10 @@ #import n1_2cnv1fc as nnetwork #import n1_residual3x4 as nnetwork import n1_2cnv2fc as nnetwork +import skimage.transform as af +from bitarray import bitarray + + input_filters_dict = {'m_label': list(range(48,58))+list(range(65,91))} output_feature_list = ['orientation_one_hot','image'] dtype = np.float32 @@ -70,7 +74,7 @@ # pick up the base characters from training_image_file -# produce some skeared versions +# produce some sheared versions # make into a training set # place in a ocr_utils TruthedCharacters class so we can use the # one hot and batch functions @@ -94,7 +98,7 @@ orientation=[] recognized_label =[] -import skimage.transform as af + for j in range(shp[0]): for i,skew in enumerate(skewRange): @@ -141,10 +145,10 @@ image_file_jpg = image_file+'.jpg' df,t1 = ocr_utils.file_to_df(image_file,character_size, title = 'unencrypted file',white_space=white_space) -from bitarray import bitarray + secret_message = "top secret" a = bitarray() -a.fromstring(secret_message) +a.frombytes(secret_message.encode('utf_8')) index = 0 encoded_skews=[] diff --git a/o2_top_secret_lda-tesseract.py b/o2_top_secret_lda-tesseract.py index 899c6ad..b7dcc36 100644 --- a/o2_top_secret_lda-tesseract.py +++ b/o2_top_secret_lda-tesseract.py @@ -12,6 +12,7 @@ from sklearn.linear_model import LogisticRegression from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA from sklearn.metrics import accuracy_score +from ruamel_yaml.compat import utf8 inputs = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghiklnopqrstuvwxyz' inputs_list = list(ord(x) for x in inputs) @@ -70,8 +71,7 @@ def encode_and_save_file(input_base, output_base, character_size, white_space, s from bitarray import bitarray a = bitarray() - a.fromstring(secret_message) - + a.frombytes(secret_message.encode('utf-8') ) index = 0 def convert_to_shear(a): diff --git a/o3_top_secret_python_box.py b/o3_top_secret_python_box.py index bffc071..c330dd1 100644 --- a/o3_top_secret_python_box.py +++ b/o3_top_secret_python_box.py @@ -73,7 +73,7 @@ #from sklearn.model_selection from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from sklearn.linear_model import LogisticRegression -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split # input_filters_dict = {'m_label': list(range(48,58))+list(range(65,91))} # output_feature_list = ['orientation_one_hot','image'] diff --git a/o4_image_to_image.py b/o4_image_to_image.py index d54a19e..5cce174 100644 --- a/o4_image_to_image.py +++ b/o4_image_to_image.py @@ -30,7 +30,7 @@ import pandas as pd import n1_image_to_image as nnetwork #import n1_residual3x4 as nnetwork -import tensorflow as tf +from tensorflow.compat import v1 as tf dtype = np.float32 #with tf.device('/GPU:0'): #with tf.device('/cpu:0'): diff --git a/ocr_utils.py b/ocr_utils.py index 2791558..a130fe9 100644 --- a/ocr_utils.py +++ b/ocr_utils.py @@ -38,7 +38,8 @@ import numpy as np import pandas as pd import math -from pandas.io.common import ZipFile +#from pandas.io.common import ZipFile +from zipfile import ZipFile from matplotlib.colors import ListedColormap import matplotlib.pyplot as plt import sys @@ -1064,9 +1065,13 @@ def plot_decision_regions(X=None, y=None, classifier=None, resolution = .005, te plt.ylim(xx2.min()-d, xx2.max()+d) # plot class samples + for idx, cl in enumerate(np.unique(y)): - plt.scatter(X[y == cl, 0], X[y == cl, 1], - alpha=0.8, c=cmap(idx), + xs = X[y == cl, 0] + ys = X[y == cl, 1] + c =cmap(idx) + plt.scatter(xs, ys, + alpha=0.8, color=c, marker=markers[idx%len(markers)], label=cl) # highlight test samples diff --git a/p115_l1_l2_regularization.py b/p115_l1_l2_regularization.py index ff90e48..690ae2f 100644 --- a/p115_l1_l2_regularization.py +++ b/p115_l1_l2_regularization.py @@ -69,7 +69,7 @@ from sklearn.linear_model import LogisticRegression -lr = LogisticRegression(penalty='l1', C=0.1, random_state=0) +lr = LogisticRegression(penalty='l1', C=0.1, random_state=0, solver='liblinear',multi_class='auto') lr.fit(X_train_std, y_train) print('Training accuracy-l1 regularization:', lr.score(X_train_std, y_train)) print('Test accuracy-l1 regularization:', lr.score(X_test_std, y_test)) @@ -79,7 +79,7 @@ print('\t{}'.format(lr.coef_)) -lr = LogisticRegression(penalty='l2', C=0.1, random_state=0) +lr = LogisticRegression(penalty='l2', C=0.1, random_state=0, solver='liblinear',multi_class='auto') lr.fit(X_train_std, y_train) print('Training accuracy-l2 regularization:', lr.score(X_train_std, y_train)) print('Test accuracy-l2 regularization:', lr.score(X_test_std, y_test)) @@ -101,7 +101,7 @@ def weight_graph(regularization = 'l1'): weights, params = [], [] for c in np.arange(0, 6): - lr = LogisticRegression(penalty=regularization, C=10**c, random_state=0) + lr = LogisticRegression(penalty=regularization, C=10**c, random_state=0, solver='liblinear',multi_class='auto') lr.fit(X_train_std, y_train) weights.append(lr.coef_[1]) params.append(10**c) diff --git a/p119_squential_backward_selection.py b/p119_squential_backward_selection.py index 2143d7c..d226961 100644 --- a/p119_squential_backward_selection.py +++ b/p119_squential_backward_selection.py @@ -49,7 +49,7 @@ y, X, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,49,50) , columns=range(0,20), nChars=1000, random_state=0) -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.3, random_state=0) diff --git a/p131_principal_component_analysis.py b/p131_principal_component_analysis.py index 8a379b5..c4b8a00 100644 --- a/p131_principal_component_analysis.py +++ b/p131_principal_component_analysis.py @@ -130,7 +130,7 @@ X_train_pca = pca.fit_transform(X_train_image) X_test_pca = pca.transform(X_test_image) -lr = LogisticRegression() +lr = LogisticRegression(solver='liblinear',multi_class='auto') logistic_fitted =lr.fit(X_train_pca, y_train) print('\nPCA Train Accuracy: {:4.6f}, n_components={}'.format(accuracy_score(y_train, logistic_fitted.predict(X_train_pca)),pca.n_components)) @@ -149,7 +149,7 @@ X_train_pca = pca.fit_transform(X_train_image) X_test_pca = pca.transform(X_test_image) -lr = LogisticRegression() +lr = LogisticRegression(solver='liblinear',multi_class='auto') logistic_fitted = lr.fit(X_train_pca, y_train) y_train_pred = logistic_fitted.predict(X_train_pca) @@ -191,7 +191,7 @@ X_train_pca = pca.fit_transform(X_train_image) X_test_pca = pca.transform(X_test_image) -lr = LogisticRegression() +lr = LogisticRegression(solver='liblinear',multi_class='auto') logistic_fitted=lr.fit(X_train_pca, y_train) y_train_pred = logistic_fitted.predict(X_train_pca) y_test_pred = logistic_fitted.predict(X_test_pca) diff --git a/p141_linear_descriminant_analsys.py b/p141_linear_descriminant_analsys.py index 3e87962..1d50af8 100644 --- a/p141_linear_descriminant_analsys.py +++ b/p141_linear_descriminant_analsys.py @@ -61,6 +61,7 @@ from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA print_limit = 20 chars_to_train = range(48,58) +n_classes = len(chars_to_train) columnsXY=range(0,20) column_str = 'column_sum{}'.format(list(columnsXY)) @@ -107,7 +108,7 @@ S_W = np.zeros((d, d)) for label, mv in zip(unique_labels, mean_vecs): class_scatter = np.zeros((d, d)) - for row in X_train_std[[y_train == label]]: + for row in X_train_std[y_train == label]: row, mv = row.reshape(d, 1), mv.reshape(d, 1) class_scatter += (row-mv).dot((row-mv).T) S_W += class_scatter @@ -195,7 +196,7 @@ X_test_lda = lda.transform(X_test_std) from sklearn.linear_model import LogisticRegression -lr = LogisticRegression() +lr = LogisticRegression(solver='liblinear', multi_class='auto') lr = lr.fit(X_train_lda, y_train) title = 'Linear Descriminant Analysis Training Set' @@ -208,13 +209,13 @@ ############################################################################### n_components = 10 -lda = LDA(n_components=n_components) +lda = LDA(n_components=min(n_components,n_classes-1)) X_train_lda = lda.fit_transform(X_train_std, y_train) X_test_lda = lda.transform(X_test_std) print ('n_components={}'.format(lda.n_components)) -lr = LogisticRegression() +lr = LogisticRegression(solver='liblinear', multi_class='auto') logistic_fitted = lr.fit(X_train_lda, y_train) from sklearn.metrics import accuracy_score @@ -233,13 +234,13 @@ ############################################################################### n_components = 10 -lda = LDA(n_components=n_components, solver='eigen') +lda = LDA(n_components=n_components-1, solver='eigen') X_train_lda = lda.fit_transform(X_train_std, y_train) X_test_lda = lda.transform(X_test_std) print ('n_components={}'.format(lda.n_components)) -lr = LogisticRegression() +lr = LogisticRegression(solver='liblinear', multi_class='auto') logistic_fitted = lr.fit(X_train_lda, y_train) from sklearn.metrics import accuracy_score diff --git a/p154_pca_nonlinear_mapings.py b/p154_pca_nonlinear_mapings.py index c7da287..3d92e9d 100644 --- a/p154_pca_nonlinear_mapings.py +++ b/p154_pca_nonlinear_mapings.py @@ -83,8 +83,7 @@ def rbf_kernel_pca1(X, gamma, n_components): eigvals, eigvecs = eigh(K) # Collect the top k eigenvectors (projected samples) - X_pc = np.column_stack((eigvecs[:, -i] - for i in range(1, n_components + 1))) + X_pc = np.column_stack([eigvecs[:, -i] for i in range(1, n_components + 1)]) return X_pc @@ -269,7 +268,7 @@ def rbf_kernel_pca(X, gamma, n_components): eigvals, eigvecs = eigh(K) # Collect the top k eigenvectors (projected samples) - alphas = np.column_stack((eigvecs[:,-i] for i in range(1,n_components+1))) + alphas = np.column_stack([eigvecs[:,-i] for i in range(1,n_components+1)]) # Collect the corresponding eigenvalues lambdas = [eigvals[-i] for i in range(1,n_components+1)] diff --git a/p177_k_fold_cross_validation.py b/p177_k_fold_cross_validation.py index c6bf4b5..6a32210 100644 --- a/p177_k_fold_cross_validation.py +++ b/p177_k_fold_cross_validation.py @@ -1,4 +1,4 @@ -'''k_fold_cross_validation.py +'''k_fold_model_selection.py k fold cross validation splits the training set into n parts and uses a different 1/n of the test set for each iteration. It is good for tuning parameters as all samples are used, reducing the variance of the @@ -48,12 +48,13 @@ import ocr_utils import matplotlib.pyplot as plt import numpy as np -from sklearn.cross_validation import StratifiedKFold +from sklearn.model_selection import StratifiedKFold from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA if __name__ == '__main__': #charsToTrain=range(48,58) chars_to_train = range(48,58) + n_classes = len(chars_to_train) num_chars = 3000 #limit the number to speed up the calculation @@ -75,7 +76,7 @@ # y_train, X_train, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = charsToTrain , columns=range(0,20), nChars=1000, test_size=0.3,random_state=0) from sklearn.linear_model import LogisticRegression - from sklearn.cross_validation import train_test_split + from sklearn.model_selection import train_test_split X_train , X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.3, random_state=0) @@ -99,28 +100,24 @@ for num_PCA in num_planes: print ('number of Principal Components = {}'.format(num_PCA)) pipe_lr = Pipeline([('scl', StandardScaler()), - ('pca', PCA(n_components=num_PCA)), - ('clf', LogisticRegression(random_state=1))]) - + ('pca', PCA(n_components=num_PCA, svd_solver='full')), + ('clf', LogisticRegression(random_state=1,multi_class='auto', solver='liblinear'))]) + pipe_lr.fit(X_train, y_train) print('Test Accuracy: %.3f' % pipe_lr.score(X_test, y_test)) - - - kfold = StratifiedKFold(y=y_train, - n_folds=10, - random_state=1) + kfold = StratifiedKFold(n_splits=10, random_state=1) scores = [] - for k, (train, test) in enumerate(kfold): - pipe_lr.fit(X_train[train], y_train[train]) - score = pipe_lr.score(X_train[test], y_train[test]) + for train_index, test_index in kfold.split(X_train, y_train): + pipe_lr.fit(X_train[train_index], y_train[train_index]) + score = pipe_lr.score(X_train[test_index], y_train[test_index]) scores.append(score) #print ('train {} samples: {}'.format(len(train), train)) #print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k+1, np.bincount(y_train[train])[list(charsToTrain)], score)) print('\nCV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores))) - from sklearn.cross_validation import cross_val_score + from sklearn.model_selection import cross_val_score scores = cross_val_score(estimator=pipe_lr, X=X_train, @@ -153,21 +150,21 @@ for num_LDA in num_planes: print ('number of Principal Components = {}'.format(num_LDA)) pipe_lr = Pipeline([('scl', StandardScaler()), - ('lda', LDA(n_components=num_LDA)), - ('clf', LogisticRegression(random_state=1))]) - + ('lda', LDA(n_components=min(num_LDA,n_classes-1), solver='eigen')), + ('clf', LogisticRegression(random_state=1,multi_class='auto',solver='liblinear'))]) + + kys = pipe_lr.get_params().keys() + print(kys) +# pipe_lr.set_params(lda__solver='eigen',clf__solver='liblinear',clf__multi_class='auto') pipe_lr.fit(X_train, y_train) print('Test Accuracy: %.3f' % pipe_lr.score(X_test, y_test)) - - kfold = StratifiedKFold(y=y_train, - n_folds=10, - random_state=1) + kfold = StratifiedKFold(n_splits=10, random_state=1) scores = [] - for k, (train, test) in enumerate(kfold): - pipe_lr.fit(X_train[train], y_train[train]) - score = pipe_lr.score(X_train[test], y_train[test]) + for train_index, test_index in kfold.split(X_train, y_train): + pipe_lr.fit(X_train[train_index], y_train[train_index]) + score = pipe_lr.score(X_train[test_index], y_train[test_index]) scores.append(score) #print ('train {} samples: {}'.format(len(train), train)) #print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k+1, np.bincount(y_train[train])[list(charsToTrain)], score)) diff --git a/p181_learning_curves.py b/p181_learning_curves.py index 281587e..811a9ef 100644 --- a/p181_learning_curves.py +++ b/p181_learning_curves.py @@ -42,7 +42,7 @@ @author: richard lyman ''' import matplotlib.pyplot as plt -from sklearn.learning_curve import learning_curve +from sklearn.model_selection import learning_curve import numpy as np import ocr_utils from sklearn.preprocessing import StandardScaler @@ -55,7 +55,7 @@ y_train, X_train, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,51) , columns=(9,17), random_state=0) pipe_lr = Pipeline([('scl', StandardScaler()), - ('clf', LogisticRegression(penalty='l2', random_state=0))]) + ('clf', LogisticRegression(penalty='l2', random_state=0, solver='lbfgs'))]) train_sizes, train_scores, test_scores =\ learning_curve(estimator=pipe_lr, @@ -99,7 +99,7 @@ plt.tight_layout() ocr_utils.show_figures(plt,title) - from sklearn.learning_curve import validation_curve + from sklearn.model_selection import validation_curve param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0] train_scores, test_scores = validation_curve( diff --git a/p186_grid_search.py b/p186_grid_search.py index 344b807..cb2f00a 100644 --- a/p186_grid_search.py +++ b/p186_grid_search.py @@ -35,7 +35,7 @@ import ocr_utils from sklearn.preprocessing import StandardScaler from sklearn.pipeline import Pipeline -from sklearn.grid_search import GridSearchCV +from sklearn.model_selection import GridSearchCV from sklearn.svm import SVC if __name__ == '__main__': diff --git a/p189_nested_cross_validation.py b/p189_nested_cross_validation.py index 694f0c4..17d55e6 100644 --- a/p189_nested_cross_validation.py +++ b/p189_nested_cross_validation.py @@ -55,8 +55,8 @@ import ocr_utils from sklearn.preprocessing import StandardScaler from sklearn.pipeline import Pipeline -from sklearn.cross_validation import cross_val_score -from sklearn.grid_search import GridSearchCV +from sklearn.model_selection import cross_val_score +from sklearn.model_selection import GridSearchCV from sklearn.svm import SVC if __name__ == '__main__': y_train, X_train, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,51) , test_size=0.3, columns=(9,17), random_state=0) diff --git a/p193_model_precision_recall.py b/p193_model_precision_recall.py index 25e8766..5413066 100644 --- a/p193_model_precision_recall.py +++ b/p193_model_precision_recall.py @@ -49,9 +49,9 @@ from sklearn.preprocessing import StandardScaler from sklearn.pipeline import Pipeline from sklearn.svm import SVC -from sklearn.grid_search import GridSearchCV +from sklearn.model_selection import GridSearchCV from sklearn.metrics import make_scorer,precision_score, recall_score, f1_score -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split if __name__ == '__main__': y, X, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,51) , columns=(9,17), random_state=0) diff --git a/p194_receiver_operating_characteristic.py b/p194_receiver_operating_characteristic.py index 44d69a0..5bc308f 100644 --- a/p194_receiver_operating_characteristic.py +++ b/p194_receiver_operating_characteristic.py @@ -45,10 +45,12 @@ from sklearn.preprocessing import StandardScaler from sklearn.linear_model import LogisticRegression from sklearn.pipeline import Pipeline -from sklearn.cross_validation import StratifiedKFold +from sklearn.model_selection import StratifiedKFold from sklearn.decomposition import PCA -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split from sklearn.metrics import make_scorer,precision_score,roc_curve, auc +from sklearn.metrics import roc_auc_score, accuracy_score +from sklearn.model_selection import cross_val_score if __name__ == '__main__': @@ -64,32 +66,48 @@ pipe_lr = Pipeline([('scl', StandardScaler()), ('pca', PCA(n_components=2)), - ('clf', LogisticRegression(penalty='l2',random_state=0,C=100.0))]) + ('clf', LogisticRegression(penalty='l2',random_state=0,C=100.0, solver='lbfgs'))]) # X_train2 = X_train[:, [4, 14]] X_train2 = X_train - cv = StratifiedKFold(y_train,n_folds=3,random_state=1) + + kfold = StratifiedKFold(n_splits=3, random_state=1) + +# scores = [] +# for train_index, test_index in kfold.split(X_train, y_train): +# pipe_lr.fit(X_train[train_index], y_train[train_index]) +# score = pipe_lr.score(X_train[test_index], y_train[test_index]) +# scores.append(score) + + + + + + + +# cv = StratifiedKFold(y_train,n_folds=3,random_state=1) fig = plt.figure(figsize=(7, 5)) mean_tpr = 0.0 mean_fpr = np.linspace(0, 1, 100) all_tpr = [] - - for i, (train, test) in enumerate(cv): - probas = pipe_lr.fit(X_train2[train], - y_train[train]).predict_proba(X_train2[test]) + i=0 + for train_index, test_index in kfold.split(X_train, y_train): + probas = pipe_lr.fit(X_train2[train_index], + y_train[train_index]).predict_proba(X_train2[test_index]) - fpr, tpr, thresholds = roc_curve(y_train[test], + fpr, tpr, thresholds = roc_curve(y_train[test_index], probas[:, 1], pos_label=1) mean_tpr += interp(mean_fpr, fpr, tpr) mean_tpr[0] = 0.0 roc_auc = auc(fpr, tpr) + i=i+1 plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' - % (i+1, roc_auc)) + % (i, roc_auc)) plt.plot([0, 1], [0, 1], @@ -97,7 +115,7 @@ color=(0.6, 0.6, 0.6), label='random guessing') - mean_tpr /= len(cv) + mean_tpr /= kfold.get_n_splits(X_train) mean_tpr[-1] = 1.0 mean_auc = auc(mean_fpr, mean_tpr) plt.plot(mean_fpr, mean_tpr, 'k--', @@ -123,8 +141,6 @@ pipe_lr = pipe_lr.fit(X_train2, y_train) # y_pred2 = pipe_lr.predict(X_test[:, [4, 14]]) y_pred2 = pipe_lr.predict(X_test) - from sklearn.metrics import roc_auc_score, accuracy_score - print('ROC AUC: %.3f' % roc_auc_score(y_true=y_test, y_score=y_pred2)) print('Accuracy: %.3f' % accuracy_score(y_true=y_test, y_pred=y_pred2)) @@ -133,10 +149,10 @@ # for more than 2 classes for GridSearch # i.e. applies a binary scoring technique to multiclasses pos_label=range(48,58) - pre_scorer = make_scorer(score_func=precision_score, - pos_label=pos_label, - greater_is_better=True, - average='micro') +# pre_scorer = make_scorer(score_func=precision_score, +# pos_label=pos_label, +# greater_is_better=True, +# average='micro') from sklearn.svm import SVC y_train, X_train, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = pos_label , nChars=4000, columns=(9,17), random_state=0) @@ -149,15 +165,15 @@ {'clf__C': c_gamma_range, 'clf__gamma': c_gamma_range, 'clf__kernel': ['rbf'],}] - from sklearn.grid_search import GridSearchCV + from sklearn.model_selection import GridSearchCV gs = GridSearchCV(estimator=pipe_svc, param_grid=param_grid, - scoring=pre_scorer, + scoring='accuracy', cv=5, n_jobs=-1) - from sklearn.cross_validation import cross_val_score - scores = cross_val_score(gs, X_train, y_train, scoring=pre_scorer, cv=5) + + scores = cross_val_score(gs, X_train, y_train, scoring='accuracy', cv=5) print('\nSupport Vector Cross Validation accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores))) gs = gs.fit(X_train, y_train) diff --git a/p206_majority_vote_classifier.py b/p206_majority_vote_classifier.py index 221dc52..e404fa7 100644 --- a/p206_majority_vote_classifier.py +++ b/p206_majority_vote_classifier.py @@ -44,13 +44,12 @@ from sklearn.base import BaseEstimator from sklearn.base import ClassifierMixin from sklearn.preprocessing import LabelEncoder -from sklearn.externals import six from sklearn.base import clone from sklearn.pipeline import _name_estimators import numpy as np import ocr_utils -from sklearn.cross_validation import train_test_split -from sklearn.cross_validation import cross_val_score +from sklearn.model_selection import train_test_split +from sklearn.model_selection import cross_val_score from sklearn.linear_model import LogisticRegression from sklearn.tree import DecisionTreeClassifier from sklearn.neighbors import KNeighborsClassifier @@ -59,6 +58,7 @@ from sklearn.metrics import roc_curve from sklearn.metrics import auc import matplotlib.pyplot as plt +from sklearn.model_selection import GridSearchCV class MajorityVoteClassifier(BaseEstimator, ClassifierMixin): @@ -182,8 +182,8 @@ def get_params(self, deep=True): return super(MajorityVoteClassifier, self).get_params(deep=False) else: out = self.named_classifiers.copy() - for name, step in six.iteritems(self.named_classifiers): - for key, value in six.iteritems(step.get_params(deep=True)): + for name, step in self.named_classifiers.items(): + for key, value in step.get_params(deep=True).items(): out['%s__%s' % (name, key)] = value return out @@ -204,7 +204,7 @@ def get_params(self, deep=True): clf1 = LogisticRegression(penalty='l2', C=0.001, - random_state=0) + random_state=0, solver='lbfgs') clf2 = DecisionTreeClassifier(max_depth=1, criterion='entropy', @@ -216,6 +216,7 @@ def get_params(self, deep=True): pipe1 = Pipeline([['sc', StandardScaler()], ['clf', clf1]]) +kys = pipe1.get_params() pipe3 = Pipeline([['sc', StandardScaler()], ['clf', clf3]]) @@ -314,7 +315,7 @@ def get_params(self, deep=True): pprint.pprint(mv_clf.get_params()) print() -from sklearn.grid_search import GridSearchCV + params = {'decisiontreeclassifier__max_depth': [1, 2], 'pipeline-1__clf__C': [0.001, 0.1, 100.0]} @@ -325,9 +326,13 @@ def get_params(self, deep=True): scoring='roc_auc') grid.fit(X_train, y_train) -for params, mean_score, scores in grid.grid_scores_: +params=grid.cv_results_['params'] +mean_scores=grid.cv_results_['mean_test_score'] +scores = grid.cv_results_['std_test_score'] + +for i in range(len(params)): print("%0.6f+/-%0.6f %r" - % (mean_score, scores.std() / 2, sorted(params.items()))) + % (mean_scores[i], scores[i] / 2, sorted(params[i].items()))) print('\nBest parameters: %s' % sorted(grid.best_params_.items())) print('Best Accuracy: %.6f' % grid.best_score_) diff --git a/p221_bagging_bootstrap_samples.py b/p221_bagging_bootstrap_samples.py index 1f0a945..cc34df9 100644 --- a/p221_bagging_bootstrap_samples.py +++ b/p221_bagging_bootstrap_samples.py @@ -35,7 +35,7 @@ from sklearn.preprocessing import LabelEncoder import ocr_utils -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split import numpy as np import matplotlib.pyplot as plt if __name__ == '__main__': diff --git a/p229_adaboost.py b/p229_adaboost.py index cf97b71..f4852f0 100644 --- a/p229_adaboost.py +++ b/p229_adaboost.py @@ -41,7 +41,7 @@ from sklearn.ensemble import AdaBoostClassifier import ocr_utils -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split import numpy as np import matplotlib.pyplot as plt from sklearn.preprocessing import LabelEncoder diff --git a/p411_keras.py b/p411_keras.py index 823dd3a..d5216ba 100644 --- a/p411_keras.py +++ b/p411_keras.py @@ -50,25 +50,22 @@ def do_keras(X_train,X_test, y_train_ohe, y_train,y_test): model = Sequential() model.add(Dense(input_dim=X_train.shape[1], - output_dim=50, - init='uniform', + units=50, activation='tanh')) model.add(Dense(input_dim=50, - output_dim=50, - init='uniform', + units=50, activation='tanh')) model.add(Dense(input_dim=50, - output_dim=y_train_ohe.shape[1], - init='uniform', + units=y_train_ohe.shape[1], activation='softmax')) sgd = SGD(lr=0.001, decay=1e-7, momentum=.9) model.compile(loss='categorical_crossentropy', optimizer=sgd,metrics=["accuracy"]) model.fit(X_train, y_train_ohe, - nb_epoch=50, + epochs=50, batch_size=300, verbose=2, validation_split=0.1 diff --git a/p51_standard_scalar.py b/p51_standard_scalar.py index 1a8cca4..89e8014 100644 --- a/p51_standard_scalar.py +++ b/p51_standard_scalar.py @@ -38,6 +38,7 @@ import ocr_utils from sklearn.preprocessing import StandardScaler from sklearn.linear_model import Perceptron +from sklearn.model_selection import train_test_split ############################################################################# @@ -51,7 +52,7 @@ print('Class labels:', np.unique(y)) -from sklearn.cross_validation import train_test_split + ############################################################################# # standardize the features @@ -63,7 +64,7 @@ X_train_std = sc.transform(X_train) X_test_std = sc.transform(X_test) -ppn = Perceptron(n_iter=40, eta0=0.1, random_state=0) +ppn = Perceptron(max_iter=40, eta0=0.1, random_state=0) ppn.fit(X_train_std, y_train) y_pred = ppn.predict(X_test_std) diff --git a/p62_logistic_regression.py b/p62_logistic_regression.py index e92e491..6333395 100644 --- a/p62_logistic_regression.py +++ b/p62_logistic_regression.py @@ -56,7 +56,7 @@ import matplotlib.pyplot as plt from sklearn.preprocessing import StandardScaler from sklearn.linear_model import LogisticRegression -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split y, X, y_test, X_test, labels = ocr_utils.load_E13B(chars_to_train = (48,49,50) , columns=(9,17),nChars=500) @@ -108,7 +108,7 @@ def cost_0(z): sc.fit(X_train) X_train_std = sc.transform(X_train) X_test_std = sc.transform(X_test) -lr = LogisticRegression(C=1000.0, random_state=0) +lr = LogisticRegression(C=1000.0, random_state=0, solver='lbfgs',multi_class='auto') lr.fit(X_train_std, y_train) X_combined_std = np.vstack((X_train_std, X_test_std)) y_combined = np.hstack((y_train, y_test)) @@ -123,7 +123,7 @@ def cost_0(z): weights, params = [], [] for c in np.arange(0, 5): - lr = LogisticRegression(C=10**c, random_state=0) + lr = LogisticRegression(C=10**c, random_state=0, solver='lbfgs',multi_class='auto') lr.fit(X_train_std, y_train) weights.append(lr.coef_[0]) params.append(10**c) @@ -132,7 +132,7 @@ def cost_0(z): title = 'regression_path' weights, params = [], [] for c in np.arange(0, 5): - lr = LogisticRegression(C=10**c, random_state=0) + lr = LogisticRegression(C=10**c, random_state=0, solver='lbfgs',multi_class='auto') lr.fit(X_train_std, y_train) weights.append(lr.coef_[1]) params.append(10**c) diff --git a/p73_support_vector_machine.py b/p73_support_vector_machine.py index 4643414..27a03b3 100644 --- a/p73_support_vector_machine.py +++ b/p73_support_vector_machine.py @@ -51,6 +51,7 @@ svm = SVC(kernel='linear', C=1.0, random_state=0) svm.fit(X_train_std, y_train) + ocr_utils.plot_decision_regions(X=X_combined_std, y=y_combined, classifier=svm, diff --git a/q2_tensorflow_mnist.py b/q2_tensorflow_mnist.py index fa9ff46..a5493da 100644 --- a/q2_tensorflow_mnist.py +++ b/q2_tensorflow_mnist.py @@ -33,7 +33,8 @@ import numpy as np import pandas as pd import n1_2cnv1fc as nnetwork -import tensorflow as tf +from tensorflow.compat import v1 as tf +#import tf dtype = np.float32 if True: diff --git a/q3_removing_affine_distortion.py b/q3_removing_affine_distortion.py index 85d5f37..93b7855 100644 --- a/q3_removing_affine_distortion.py +++ b/q3_removing_affine_distortion.py @@ -93,7 +93,7 @@ def shear(X, skew): # run a Logistic Regression on the raw features with 20 rows, 20 columns from sklearn.linear_model import LogisticRegression -from sklearn.cross_validation import train_test_split +from sklearn.model_selection import train_test_split X_train , X_test, y_train, y_test = train_test_split(images_reshaped, ys, test_size=0.3, random_state=0) @@ -146,7 +146,7 @@ def shear(X, skew): ######################################################################### # run Linear Discriminant Analysis first then Logistic Regression -from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA +from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA n_components = 2 lda = LDA(n_components=n_components) diff --git a/q4_Theano_mlp.py b/q4_Theano_mlp.py index f0278c7..caed736 100644 --- a/q4_Theano_mlp.py +++ b/q4_Theano_mlp.py @@ -20,6 +20,7 @@ import time import numpy as np import theano +# must comment out downsampling in pool.py because pip install did not include downsample import theano.tensor as T import lasagne import ocr_utils diff --git a/q6_tensorflow_residual3x4.py b/q6_tensorflow_residual3x4.py index 951ff47..d38cb8f 100644 --- a/q6_tensorflow_residual3x4.py +++ b/q6_tensorflow_residual3x4.py @@ -29,7 +29,7 @@ import numpy as np import pandas as pd import n1_residual3x4 as nnetwork -import tensorflow as tf +from tensorflow.compat import v1 as tf dtype = np.float32 #with tf.device('/GPU:0'): #with tf.device('/cpu:0'): diff --git a/q8_tika.py b/q8_tika.py index b1c7f93..1cc29f7 100644 --- a/q8_tika.py +++ b/q8_tika.py @@ -28,6 +28,7 @@ #os.putenv( 'TIKA_CLIENT_ONLY','True') #- if set to True, then TIKA_SERVER_JAR is ignored, and relies on the value for TIKA_SERVER_ENDPOINT and treats Tika like a REST client. #os.putenv( 'TIKA_TRANSLATOR','org/apache/tika/language/translate/') #- set to the fully qualified class name (defaults to Lingo24) for the Tika translator implementation. #os.putenv( 'TIKA_SERVER_CLASSPATH','/home/richard/.m2/repository/org/apache/tika/tika-server/1.13/tika-server-1.13.jar') #- set to a string (delimited by ':' for each additional path) to prepend to the Tika server jar path. +#os.putenv('TESSDATA_PREFIX','/usr/share/tesseract-ocr/4.00/tessdata/') tika.initVM() from tika import parser parsed = parser.from_buffer("comme çi comme ça") diff --git a/q9_tensorflow_gpu_test.py b/q9_tensorflow_gpu_test.py index 84a255b..d5725ce 100644 --- a/q9_tensorflow_gpu_test.py +++ b/q9_tensorflow_gpu_test.py @@ -1,17 +1,22 @@ # Creates a graph. import tensorflow as tf +#from tensorflow.compat import v1 as tf +#sess = tf.InteractiveSession() +@tf.function +def d(a,b): + return tf.matmul(a, b) a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a') b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b') -c = tf.matmul(a, b) +#c = tf.matmul(a, b) # Creates a session with log_device_placement set to True. -sess = tf.Session(config=tf.ConfigProto(log_device_placement=True)) -# Runs the op. - -tens1 = tf.constant([ [[1,2],[2,3]], [[3,4],[5,6]] ]) -print (sess.run(tens1)[1,1,0]) -print (sess.run(c)) +# Runs the op. +# tens1 = tf.constant([ [[1,2],[2,3]], [[3,4],[5,6]] ]) +# print (sess.run(tens1)[1,1,0]) +# self._sess.run(tf.initialize_all_variables()) +for i in range(100000): + d(a,b) print ('\n########################### No Errors ####################################') \ No newline at end of file From fedce1fcdc5e924bac3f279ccad902fec52dfa88 Mon Sep 17 00:00:00 2001 From: richard lyman Date: Fri, 12 Feb 2021 07:36:36 -0800 Subject: [PATCH 2/2] Show versions of installation --- README.md | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b92b17c..7454fb0 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,19 @@ run_batch.sc is a bash script that runs all of the programs in the directory The script file will create many files in the folder, /tmp/plots. -Python 3.4 +Python 3.8 Anaconda3 Linux or Windows +cuda 11.2.1 +h5py 2.10.0 +Keras 2.4.3 +Lasagne 0.1 +matplotlib 3.3.2 +numpy 1.19.2 +pytesseract 0.3.7 +sklearn 0.0 +tensorflow 2.4.1 +tesseract 4.1.1 +Theano 1.0.5 +tika 1.24 +