機械学習Sample

手書き文字
データ準備
分類器
学習
学習結果
予測
分類器変更
pkl保存
my_img推論
BostonHouse
データ準備
WineQuality

構成・方式など
タスク
導入
Sample

用語 sampleなどはCentOS7で実施
$ python3
$ python3 xxxx.py

手書き文字
・手書き文字の準備（学習データと教師データをロード）
グレイスケールの手書き数字の画像データ1797個分を行列形式で準備
データセットは8x8の画像が1797枚
それぞれに0〜9のラベルがついている。
images 　　　　>>> from sklearn.datasets import load_digits # 8x8の0から9の手書き文字　　　　>>> images = load_digits() 　　　　>>> data = images.data # 学習データ　　　　>>> target = images.target # 教師データ（ラベル） ・トレーニング用とテスト用のデータを準備
全体のデータの2割を検証用に使用　　　　>>> from sklearn import model_selection 　　　　>>> X_train, X_test, y_train, y_test = 　　　　 model_selection.train_test_split(data, target, test_size=0.2, random_state=0) 　　　　>>> print(X_train.shape) # フォーマット　　　　 (1437, 64) 　　　　>>> print(y_train.shape) 　　　　 (1437,) 　　　　>>> print(X_test.shape) 　　　　 (360, 64) 　　　　>>> print(y_test.shape) 　　　　 (360,) 　　　　>>> print(X_train) # データ　　　　[[ 0. 0. 0. ... 16. 16. 6.] 　　　　 [ 0. 3. 12. ... 16. 2. 0.] 　　　　 [ 0. 1. 10. ... 0. 0. 0.] 　　　　 ... 　　　　 [ 0. 0. 5. ... 0. 0. 0.] 　　　　 [ 0. 0. 4. ... 0. 0. 0.] 　　　　 [ 0. 0. 6. ... 11. 0. 0.]] 　　　　>>> print(y_train) 　　　　[6 5 3 ... 7 7 8] 　　　　>>> print(len(y_test)) # データ数　　　　360 ・判別器（分類器、識別器）にSVMを使用、X_train、y_trainで学習させて、X_testのラベルを予測
gammaとCの2個のハイパーパラメータを指定
clf：classifier（分類器）　　　　>>> from sklearn import svm # SVMインポート　　　　>>> clf = svm.SVC(gamma=0.001, C=100.) # γ：大きいほど境界が複雑、C：誤分類の許容量 ・学習
fit：fitted 　　　　>>> clf.fit(X_train,y_train) ・学習結果（classifier.fit(train_X,train_y)）　　　　SVC(C=100.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0, 　　　　 decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf', 　　　　 max_iter=-1, probability=False, random_state=None, shrinking=True, 　　　　 tol=0.001, verbose=False) ・予測
pred：predicted（予測した）
ラスト20のチェック　　　　>>> pred_test = clf.predict(X_test[-20:]) # 予測ラベル　　　　>>> print(pred_test) 　　　　[5 1 6 4 5 0 9 4 1 1 7 0 8 9 0 5 4 3 8 8] 　　　　>>> print(y_test[-20:]) # 正解ラベル　　　　[5 1 6 4 5 0 9 4 1 1 7 0 8 9 0 5 4 3 8 8] 全体　　　　>>> from sklearn.metrics import accuracy_score 　　　　>>> pred_test = clf.predict(X_test) 　　　　>>> accuracy_test = accuracy_score(y_test,pred_test) 　　　　>>> print(accuracy_test) 　　　　0.9916666666666667 ・分類器を「svm.LinearSVC()」に　　　　>>> clf = svm.LinearSVC() 　　　　>>> clf.fit(X_train,y_train) 　　　　>>> pred_test = clf.predict(X_test) 　　　　>>> accuracy_test = accuracy_score(y_test,pred_test) 　　　　>>> print(accuracy_test) 　　　　0.9361111111111111 ・学習済みモデル（分類器）をPickle形式で保存　　　　>>> data = images.data 　　　　>>> target = images.target 　　　　>>> from sklearn import model_selection 　　　　>>> X_train, X_test, y_train, y_test = 　　　　 model_selection.train_test_split(data, target, test_size=0.2, random_state=0) 　　　　>>> from sklearn import svm 　　　　>>> clf = svm.SVC(gamma=0.001, C=100.) 　　　　>>> clf.fit(X_train,y_train) 　　　　SVC(C=100.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0, 　　　　 decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf', 　　　　 max_iter=-1, probability=False, random_state=None, shrinking=True, 　　　　 tol=0.001, verbose=False) 　　　　>>> import pickle 　　　　>>> save_file = 'digits.pkl' # 拡張子、pkl 　　　　>>> with open(save_file, 'wb') as f: 　　　　... pickle.dump(clf, f, -1) # -1、最も高いプロトコルバージョンで保存　　　　... print(clf) 　　　　... 　　　　SVC(C=100.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0, 　　　　 decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf', 　　　　 max_iter=-1, probability=False, random_state=None, shrinking=True, 　　　　 tol=0.001, verbose=False) 　　　　>>> ・自分で書いたmy_img（手書きの数字）文字をテスト（推論）
「PIL、numpy」を使用、「skimage、io」でもよい
xxxx.py
#!/usr/bin/env python3
#
from PIL import Image
import numpy as np
import pickle
# Run the pickled classifier (digits.pkl) on a self-drawn digit image (9.png).
#
# Alternative load without grayscale conversion (kept for reference, disabled):
#   my_img = np.array(Image.open('9.png'))
my_img = np.array(Image.open('9.png').convert('L'))  # 'L' mode -> loaded as a 2-D array
print("my_img =", type(my_img))
print("dtype =", my_img.dtype)
print("shape =", my_img.shape)  # if the image is larger, resize it beforehand
Image.fromarray(my_img).save('9_1.png')  # write back the converted image for inspection
# NOTE(review): disabled rescaling step; presumably maps 0-255 pixel values onto the
# value range of the training data and inverts black/white -- confirm before enabling.
#   my_img = 15 - my_img // 16
my_img = my_img.reshape([-1, 64])  # flatten into rows of 64 features for predict()
# digits.pkl was saved in pickle format beforehand.
# NOTE: only unpickle files you created yourself; pickle is unsafe on untrusted data.
with open('digits.pkl', 'rb') as file:
    clf = pickle.load(file)
res = clf.predict(my_img)
print("my_img = " + str(res[0]))
print("End")
実行結果　　　　my_img = ＜class 'numpy.ndarray'＞　　　　dtype = uint8 　　　　shape = (8, 8) 　　　　my_img = 2 　　　　End ・sklearn.metricsを使って検証　　　　>>> from sklearn.metrics import confusion_matrix 　　　　>>> predicted = clf.predict(X_test) 　　　　>>> confusion_matrix(y_test, predicted) 　　　　array([[27, 0, 0, 0, 0, 0, 0, 0, 0, 0], 　　　　 [ 0, 35, 0, 0, 0, 0, 0, 0, 0, 0], 　　　　 [ 0, 0, 36, 0, 0, 0, 0, 0, 0, 0], 　　　　 [ 0, 0, 0, 29, 0, 0, 0, 0, 0, 0], 　　　　 [ 0, 0, 0, 0, 30, 0, 0, 0, 0, 0], 　　　　 [ 0, 0, 0, 0, 0, 40, 0, 0, 0, 0], 　　　　 [ 0, 0, 0, 0, 0, 0, 44, 0, 0, 0], 　　　　 [ 0, 0, 0, 0, 0, 0, 0, 39, 0, 0], 　　　　 [ 0, 0, 0, 0, 0, 0, 0, 0, 39, 0], 　　　　 [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 41]])
boston_house_prices
・$ python3 （重回帰分析）　　　　>>> import pandas as pd 　　　　>>> import numpy as np 　　　　>>> import matplotlib.pyplot as plt 　　　　>>> from sklearn import datasets 　　　　>>> from sklearn import linear_model 　　　　>>> from sklearn import model_selection 　　　　>>> from sklearn.preprocessing import StandardScaler 　　　　>>> boston = datasets.load_boston() # ロード　　　　>>> data = boston.data 　　　　>>> target = boston.target 　　　　>>> print(data.shape) 　　　　(506, 13) 　　　　>>> print(target.shape) 　　　　(506,) ・トレーニング用とテスト用のデータを準備
全体のデータの2割をテスト証用、2割を検証用に　　　　>>> X_train, X_test, y_train, y_test = 　　　　 model_selection.train_test_split(data, target, test_size=0.2, random_state=114514) 　　　　>>> print(X_train.shape) # フォーマット　　　　 (404, 13) 　　　　>>> print(y_train.shape) 　　　　 (404,) 　　　　>>> print(X_test.shape) 　　　　 (102, 13) 　　　　>>> print(y_test.shape) 　　　　 (102,) 　　　　>>> print(X_train) # データ　　　　[[1.77800e-02 9.50000e+01 1.47000e+00 ... 1.70000e+01 3.84300e+02 　　　　 4.45000e+00] 　　　　 [2.29270e-01 0.00000e+00 6.91000e+00 ... 1.79000e+01 3.92740e+02 　　　　 1.88000e+01] 　　　　 [4.07710e-01 0.00000e+00 6.20000e+00 ... 1.74000e+01 3.95240e+02 　　　　 2.14600e+01] 　　　　 ... 　　　　 [4.22239e+00 0.00000e+00 1.81000e+01 ... 2.02000e+01 3.53040e+02 　　　　 1.46400e+01] 　　　　 [1.78667e+01 0.00000e+00 1.81000e+01 ... 2.02000e+01 3.93740e+02 　　　　 2.17800e+01] 　　　　 [3.75780e-01 0.00000e+00 1.05900e+01 ... 1.86000e+01 3.95240e+02 　　　　 2.39800e+01]] 　　　　>>> print(y_train) 　　　　[32.9 16.6 21.7 28.7 36. 22.6 16.7 16. 34.6 19.1 19.8 20.7 20.6 25.2 　　　　 27.1 50. 30.7 25. 18.7 26.5 27.9 23.9 17.5 23.1 22. 22.7 22.9 27.9 　　　　 50. 17.6 20.6 50. 45.4 17.5 35.2 20.1 21.8 6.3 18.8 20.9 9.6 21.4 　　　　 14.3 18.4 20.7 23.7 24.4 23.1 22.4 24.1 19. 18.4 20.4 7. 17.2 14. 　　　　 23.2 20.2 13.5 23.9 24.1 30.8 46.7 24.6 18.6 22.7 21.4 15.2 15.2 19.3 　　　　 8.1 24.3 34.9 14.4 23.1 23.6 28.7 26.4 13.1 13.9 23.1 50. 13.1 50. 　　　　 14.6 18.8 20.4 20.9 14.6 21.2 32.2 50. 26.2 5. 50. 13.9 23.1 12.6 　　　　 25. 10.8 21. 24.4 19.4 24.7 37.9 14.1 20.1 24.3 19. 19.1 19.4 11.8 　　　　 29.6 18.9 18.2 18.2 23.1 13.8 24.8 24.5 16.5 22.8 19.6 26.6 27.5 21.9 　　　　 20.8 5. 24.5 19.9 8.3 25.3 19.4 24.4 33.1 7. 24.8 31.5 15. 22.2 　　　　 13.1 22.6 34.9 17.1 21. 11.9 13.6 8.8 21.6 13.8 28. 15. 23.3 32.5 　　　　 21.4 30.1 23.7 41.3 18. 10.4 20.6 22.9 14.5 24.1 22. 50. 19.8 18.9 　　　　 29.6 32. 13.3 22.5 15.4 21.5 23.8 50. 13.5 22. 18.1 17. 50. 33.1 　　　　 21.7 18.2 10.2 22.2 24.5 34.7 33.2 23. 19.9 43.5 25. 23.6 42.8 16.3 　　　　 23.9 20.8 19.5 31.7 37.6 25. 
19.7 16.1 14.2 21.2 17.1 36.2 20.3 8.7 　　　　 14.4 9.5 15.7 23.2 32.4 22.6 23. 48.3 50. 23.3 20.1 21.7 25.1 22.8 　　　　 33. 19.1 17.8 50. 16.1 15.6 14.9 15.6 17.5 27.5 17.8 13. 21.7 9.7 　　　　 25. 25. 38.7 31.2 19.5 36.4 32. 28.4 22.6 18.7 48.8 15.1 12.5 21.7 　　　　 23.5 30.1 24. 35.4 33.8 18.3 12.1 35.4 23.3 14.1 28.2 15.4 20.3 28.1 　　　　 26.6 21.2 31.6 41.7 10.5 23.7 12.7 19.6 17.4 20.4 11.5 23. 20.5 16.5 　　　　 25. 11.9 22.2 15.3 22.9 17.2 44. 21.5 11.7 24.6 17.8 50. 19.8 37.3 　　　　 18.6 22.8 13.8 24. 21.4 22.5 23.3 18.3 8.4 29. 12.7 15.6 19.1 20. 　　　　 19.4 29. 23.9 23.8 20.6 42.3 27.5 19.2 16.8 23.4 21.7 20.1 22.2 16.1 　　　　 30.1 32.7 23.7 19.3 18.5 22.4 11.8 21.2 22. 22.2 14.9 20.2 14.5 22.1 　　　　 19.3 13.8 28.4 17.3 29.8 24.7 21.7 28.5 18.4 19.5 18.5 35.1 22. 20.8 　　　　 37.2 17.4 10.5 23.2 17.8 24.8 21.2 27.5 14.3 23.8 18.7 46. 8.8 22.5 　　　　 16.2 29.1 10.4 11.7 24.7 5.6 21.1 20.4 17.9 21.8 39.8 31.1 13.3 20.3 　　　　 14.1 22.3 8.3 13.2 13.4 18.5 43.1 27. 31.6 12.3 18.9 33.3 8.5 17.2 　　　　 22. 26.7 24.8 19.6 11.3 29.1 17.8 20.5 20.5 16.8 10.2 19.3] ・$ python3 （SGD）　　　　>>> import pandas as pd 　　　　>>> import numpy as np 　　　　>>> import matplotlib.pyplot as plt 　　　　>>> from sklearn import datasets 　　　　>>> from sklearn import linear_model 　　　　>>> from sklearn.preprocessing import StandardScaler 　　　　>>> from sklearn.preprocessing import MinMaxScaler 　　　　>>> boston = datasets.load_boston() # ロード　　　　>>> print(boston) 　　　　>>> print(boston.DESCR) 　　　　>>> print(boston.data) 　　　　>>> print(boston.feature_names) 　　　　>>> print(boston.target) 　　　　>>> boston_df=pd.DataFrame(boston.data) # DataFrameセット　　　　>>> boston_df.columns = boston.feature_names 　　　　>>> boston_df['PRICE'] = pd.DataFrame(boston.target) 　　　　>>> boston_df.to_csv('boston_df.csv', sep=',', index=True, encoding='utf-8') # csv出力　　　　>>> boston_df.to_csv('boston.csv', sep=',', index=False, encoding='utf-8') 　　　　>>> X = boston_df.drop("PRICE", axis=1) # Xセット　　　　>>> mscaler = MinMaxScaler() 　　　　>>> mscaler.fit(X) # Xを正規化（Normalization）　　
　　>>> X2 = mscaler.transform(X) 　　　　>>> X2 = pd.DataFrame(X2) 　　　　>>> X2.columns = boston.feature_names 　　　　>>> print("X2.describe() ->") 　　　　>>> print(X2.describe()) 　　　　>>> Y = boston_df.PRICE # Yセット　　　　>>> clf2_SGD = linear_model.SGDRegressor(max_iter=500) # SGDRegressorで分析　　　　>>> clf2_SGD.fit(X2, Y) # 事前に正規化済　　　　SGDRegressor(alpha=0.0001, average=False, early_stopping=False, epsilon=0.1, 　　　　 eta0=0.01, fit_intercept=True, l1_ratio=0.15, 　　　　 learning_rate='invscaling', loss='squared_loss', max_iter=500, 　　　　 n_iter_no_change=5, penalty='l2', power_t=0.25, random_state=None, 　　　　 shuffle=True, tol=0.001, validation_fraction=0.1, verbose=0, 　　　　 warm_start=False) 　　　　>>> print("### out put from X2 data and used SGD ###############") 　　　　>>> print(pd.DataFrame({"Name":X2.columns, "Coefficients":clf2_SGD.coef_}).sort_values(by='Coefficients') ) # 係数でソート　　　　 Name Coefficients 　　　　12 LSTAT -17.527713 　　　　10 PTRATIO -7.573122 　　　　7 DIS -7.196724 　　　　9 TAX -3.927853 　　　　0 CRIM -3.924537 　　　　4 NOX -3.311751 　　　　2 INDUS -0.303527 　　　　6 AGE 1.106992 　　　　1 ZN 2.933184 　　　　3 CHAS 3.072527 　　　　8 RAD 4.119741 　　　　11 B 5.686851 　　　　5 RM 23.888114 　　　　>>> print(clf2_SGD.intercept_) # 切片　　　　[16.97465888] 　　　　>>> print(clf2_SGD.score(X2,Y)) # R^2 　　　　0.7250366947774292 　　　　 # plot　output 　　　　>>> plt.figure(1) 　　　　>>> plt.title('Normalization of X-data and used SGD') 　　　　>>> plt.xlabel('RM (number of rooms)', fontsize=14) 　　　　>>> plt.ylabel('PRICE (target)', fontsize=14) 　　　　>>> plt.scatter(X2.RM, Y, c='blue', label='Raw data') 　　　　>>> plt.scatter(X2.RM, clf2_SGD.predict(X2), c='red', label='Multiple regression analysis') 　　　　>>> plt.legend(loc='lower right', fontsize=12) 　　　　>>> #plt.show() 　　　　>>> plt.savefig("exampleClf2_SGD.png")

Wine Quality Data Set
・$ python3 winequality-red.py 　　　　#!/usr/bin/env python3 　　　　# 　　　　import pandas as pd 　　　　import numpy as np 　　　　import matplotlib.pyplot as plt 　　　　from sklearn.linear_model import LinearRegression 　　　　pd.set_option("display.max.columns", 100) 　　　　pd.set_option("display.max.rows", 100) 　　　　wine = pd.read_csv("winequality-red.csv",sep=";") 　　　　print("wine.shape->") 　　　　print(wine.shape) 　　　　print("wine.head->") 　　　　print(wine.head()) 　　　　print("wine.tail->") 　　　　print(wine.tail()) 　　　　wine_eccept_quality = wine.drop("quality", axis=1) 　　　　X = wine_eccept_quality.values 　　　　Y = wine['quality'].values 　　　　clf = LinearRegression(fit_intercept=True, normalize=False, copy_X=True, n_jobs=1) 　　　　clf.fit(X,Y) 　　　　print("pd.DataFrame->") 　　　　print(pd.DataFrame({"Name":wine_eccept_quality.columns, "Coefficients":clf.coef_}) 　　　　.sort_values(by='Coefficients') ) 　　　　#print("clf.coef_->", clf.coef_) 　　　　print("clf.intercept_->", clf.intercept_) 　　　　print("clf.score->", clf.score(X,Y)) 　　　　print("End") ・winequality-red.py実行結果　　　　wine.shape-> 　　　　(1599, 12) 　　　　wine.head-> 　　　　 fixed acidity volatile acidity citric acid residual sugar chlorides \ 　　　　0 7.4 0.70 0.00 1.9 0.076 　　　　1 7.8 0.88 0.00 2.6 0.098 　　　　2 7.8 0.76 0.04 2.3 0.092 　　　　3 11.2 0.28 0.56 1.9 0.075 　　　　4 7.4 0.70 0.00 1.9 0.076 　　　　 free sulfur dioxide total sulfur dioxide density pH sulphates \ 　　　　0 11.0 34.0 0.9978 3.51 0.56 　　　　1 25.0 67.0 0.9968 3.20 0.68 　　　　2 15.0 54.0 0.9970 3.26 0.65 　　　　3 17.0 60.0 0.9980 3.16 0.58 　　　　4 11.0 34.0 0.9978 3.51 0.56 　　　　 alcohol quality 　　　　0 9.4 5 　　　　1 9.8 5 　　　　2 9.8 5 　　　　3 9.8 6 　　　　4 9.4 5 　　　　wine.tail-> 　　　　 fixed acidity volatile acidity citric acid residual sugar chlorides \ 　　　　1594 6.2 0.600 0.08 2.0 0.090 　　　　1595 5.9 0.550 0.10 2.2 0.062 　　　　1596 6.3 0.510 0.13 2.3 0.076 　　　　1597 5.9 0.645 0.12 2.0 0.075 　　　　1598 6.0 0.310 0.47 3.6 0.067 　　　　 free sulfur dioxide total sulfur dioxide density pH sulphates \ 　　　　1594 32.0 44.0 0.99490 3.45 0.58 　　　　1595 39.0 
51.0 0.99512 3.52 0.76 　　　　1596 29.0 40.0 0.99574 3.42 0.75 　　　　1597 32.0 44.0 0.99547 3.57 0.71 　　　　1598 18.0 42.0 0.99549 3.39 0.66 　　　　 alcohol quality 　　　　1594 10.5 5 　　　　1595 11.2 6 　　　　1596 11.0 6 　　　　1597 10.2 5 　　　　1598 11.0 6 　　　　pd.DataFrame-> 　　　　 Name Coefficients 　　　　7 density -17.881164 　　　　4 chlorides -1.874225 　　　　1 volatile acidity -1.083590 　　　　8 pH -0.413653 　　　　2 citric acid -0.182564 　　　　6 total sulfur dioxide -0.003265 　　　　5 free sulfur dioxide 0.004361 　　　　3 residual sugar 0.016331 　　　　0 fixed acidity 0.024991 　　　　10 alcohol 0.276198 　　　　9 sulphates 0.916334 　　　　clf.intercept_-> 21.965208449451552 　　　　clf.score-> 0.36055170303868855 　　　　End ・ワインの品質スコアの回帰式　　　　[quality] = -17.881164 * [density] + -1.874225 * [chlorides] + 　　　　 -1.083590 * [volatile acidity] + -0.413653 * [pH] + 　　　　 -0.182564 * [citric acid] + 0.016331 * [residual sugar] + 　　　　 0.004361 * [free sulfur dioxide] + -0.003265 * [total sulfur dioxide] + 　　　　 0.024991 * [fixed acidity] + 0.276198 * [alcohol] + 　　　　 0.916334 * [sulphates] + 21.965208449451552
・$ python3 winequality-red_Norm.py （各変数を正規化して重回帰分析）　　　　#!/usr/bin/env python3 　　　　# 　　　　import pandas as pd 　　　　import numpy as np 　　　　import matplotlib.pyplot as plt 　　　　from sklearn.linear_model import LinearRegression 　　　　from sklearn.preprocessing import MinMaxScaler 　　　　pd.set_option("display.max.columns", 100) 　　　　pd.set_option("display.max.rows", 100) 　　　　wine = pd.read_csv("winequality-red.csv",sep=";") 　　　　#print("wine->") 　　　　#print(wine) 　　　　wine2 = wine.apply(lambda x:(x - np.mean(x)) / (np.max(x) - np.min(x))) 　　　　wine2.head() 　　　　wine2_eccept_quality = wine2.drop("quality", axis=1) 　　　　X = wine2_eccept_quality.values 　　　　Y = wine2['quality'].values 　　　　clf = LinearRegression(fit_intercept=True, normalize=True, copy_X=True, n_jobs=1) 　　　　clf.fit(X,Y) 　　　　print("pd.DataFrame->") 　　　　print(pd.DataFrame({"Name":wine2_eccept_quality.columns, 　　　　 "Coefficients":np.abs(clf.coef_)}).sort_values(by='Coefficients') ) 　　　　print("clf.coef_->", clf.coef_) 　　　　print("clf.intercept_->", clf.intercept_) 　　　　print("clf.score->", clf.score(X,Y)) 　　　　print("End") ・winequality-red_Norm.py実行結果　　　　pd.DataFrame-> 　　　　 Name Coefficients 　　　　2 citric acid 0.036513 　　　　3 residual sugar 0.047687 　　　　7 density 0.048708 　　　　0 fixed acidity 0.056479 　　　　5 free sulfur dioxide 0.061931 　　　　8 pH 0.105068 　　　　6 total sulfur dioxide 0.184775 　　　　4 chlorides 0.224532 　　　　9 sulphates 0.306056 　　　　1 volatile acidity 0.316408 　　　　10 alcohol 0.359057 　　　　clf.coef_-> [ 0.05647865 -0.31640836 -0.03651279 0.04768731 -0.22453217 0.06193093 　　　　 -0.18477521 -0.04870829 -0.1050679 0.30605569 0.35905701] 　　　　clf.intercept_-> 1.9140742913275685e-16 　　　　clf.score-> 0.3605517030386882 　　　　End ・ワインの品質への各変数の影響度を、偏回帰係数の大小で比較
「alcohol（アルコール度数）」が品質に大きな影響を与えている。


All Rights Reserved. Copyright (C) ITCL