# Classification walkthrough based on:
# https://stackabuse.com/classification-in-python-with-scikit-learn-and-pandas/
# NOTE(review): every model below is fit AND scored on the full dataset;
# a proper train/test split is still needed for honest validation.
from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# "%matplotlib inline" is IPython magic and a syntax error in a plain .py
# file; invoke it programmatically only when actually running under IPython.
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    pass  # plain Python interpreter: default matplotlib backend is fine

import warnings
warnings.filterwarnings('ignore')  # deliberate: silence pandas/sklearn chatter
# A basic framework for machine-learning classification prediction:
# 1. Read the data — this example has 50 sensors and 220,320 rows. Clean the
#    data, fill gaps, and convert types.
# 2. Feed the data into several classic algorithms to obtain a prediction
#    model and a fit score.
# 3. The real work lies in improving data quality and verifying practical
#    usefulness. The historical data must at least be split in two so the
#    reported match rate actually reflects reality.
# Load the pump sensor data; first row is the header.
df = pd.read_csv('23-pump-sensor.csv', sep=',', header=0)

# sensor_15 is entirely NaN — its value_counts() came back empty:
#   Series([], Name: count, dtype: int64)
# so drop it together with the first two bookkeeping columns
# (row index and timestamp).
df = df.drop([df.columns[0], df.columns[1], 'sensor_15'], axis=1)

# Target distribution (df['machine_status'].value_counts()):
#   NORMAL      205836
#   RECOVERING   14477
#   BROKEN           7
# Collapse to binary: 0 = NORMAL, 1 = RECOVERING/BROKEN (any fault state).
# Assign the result back instead of the deprecated chained inplace=True call,
# which triggers pandas' chained-assignment warning and stops working in 3.x.
df['machine_status'] = df['machine_status'].replace(
    ['NORMAL', 'RECOVERING', 'BROKEN'], [0, 1, 1]
)

# Impute missing sensor readings with the per-column mean.
# (All columns are numeric at this point: sensors are floats and
# machine_status was just mapped to 0/1.)
dfmean = df.mean(axis=0)
df = df.fillna(dfmean)
df.head()  # observed: 5 rows x 52 columns (51 sensors + machine_status)
import sklearn as sk
from sklearn.linear_model import LogisticRegression

# Features = all sensor columns; target = last column (machine_status).
y = df.iloc[:, -1]
X = df.iloc[:, :-1]

# multi_class='ovr' is deprecated in scikit-learn and redundant here: the
# target is binary, and the default ('auto') resolves to the same one-vs-rest
# behavior for binary problems, so results are unchanged.
# NOTE(review): the score (~0.9959) is measured on the training data itself,
# so it is optimistic — a held-out split is needed for a real estimate.
LR = LogisticRegression(random_state=0, solver='lbfgs').fit(X, y)
round(LR.score(X, y), 4)      # observed: 0.9959

LR.predict(X.iloc[-1:, :])    # observed: array([0]) — last row predicted NORMAL
X.iloc[-1:, :]                # last feature row (1 row x 51 columns)
y.iloc[-1]                    # observed: 0 (true label is NORMAL)
from sklearn import svm

# Linear SVM on the raw (unscaled) features.
# NOTE(review): LinearSVC is sensitive to feature scale; a StandardScaler in
# a Pipeline would likely help convergence — confirm before trusting scores.
SVM = svm.LinearSVC()
SVM.fit(X, y)
SVM.predict(X.iloc[-1:, :])
round(SVM.score(X, y), 4)   # observed: 0.9965 (again on training data)
from sklearn.ensemble import RandomForestClassifier

# Shallow random forest (100 trees, depth 2) as a third baseline.
RF = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
RF.fit(X, y)
RF.predict(X.iloc[-1:, :])   # observed: array([0])
round(RF.score(X, y), 4)     # observed: 0.9946 (training data)
# The pasted notebook repeated the predict call; the bare "array([0])" output
# line that followed it would raise NameError if executed as code, so both
# are kept here only as a record:
RF.predict(X.iloc[-1:, :])   # observed: array([0])
from sklearn.neural_network import MLPClassifier

# Small MLP (hidden layers of 5 and 2 units) with the lbfgs solver.
# NOTE(review): neural nets also expect scaled inputs; with raw sensor values
# and a tiny network, the training-set score below is hard to interpret.
NN = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
NN.fit(X, y)
NN.predict(X.iloc[-1:, :])
round(NN.score(X, y), 4)   # observed: 0.9938 (training data)