In [83]:
# https://stackabuse.com/classification-in-python-with-scikit-learn-and-pandas/
# need to split dataset for proper fitting and validation
from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
In [84]:
# Basic framework for a machine-learning classification task:
# 1. Read the data — this example has ~50 sensors and 220,320 rows; then clean it,
#    fill missing values, and convert types.
# 2. Feed the data into a few classic algorithms to obtain a fitted model and a
#    goodness-of-fit score.
# 3. The real work is improving data quality and verifying practical usefulness;
#    historical data should at least be split in two (train/test) so the score
#    realistically reflects the match rate.
df = pd.read_csv('23-pump-sensor.csv', sep=',', header=0)
In [85]:
#pd.unique(df['sensor_15'])
# Inspect sensor_15: the empty value_counts() result (see output below) shows the
# column contains no non-null values, which is why it is dropped in the next cell.
df['sensor_15'].value_counts()
Out[85]:
Series([], Name: count, dtype: int64)
In [86]:
# Drop the non-feature columns — the first two (presumably a row index and a
# timestamp; TODO confirm against the CSV header) and the all-NaN sensor_15.
df = df.drop(columns=[df.columns[0], df.columns[1], 'sensor_15'])
In [87]:
#pd.unique(df['machine_status'])
# Class distribution of the target: NORMAL dominates (~205k rows) while
# RECOVERING (~14k) and BROKEN (7) are rare — a heavily imbalanced target,
# so raw accuracy scores below should be read with that in mind.
df['machine_status'].value_counts()
Out[87]:
machine_status
NORMAL        205836
RECOVERING     14477
BROKEN             7
Name: count, dtype: int64
In [88]:
# Encode the target as binary: NORMAL -> 0, RECOVERING/BROKEN -> 1.
# Assign the result back instead of calling replace(..., inplace=True) on a
# column selection: chained in-place replace is deprecated and silently does
# nothing under pandas Copy-on-Write (default in pandas 3.0).
df['machine_status'] = df['machine_status'].replace(
    ['NORMAL', 'RECOVERING', 'BROKEN'], [0, 1, 1]
)
In [89]:
# Per-column averages (axis=0 is the default); used in the next cell to impute
# the missing sensor readings.
dfmean = df.mean()
In [90]:
# Impute remaining NaNs with each column's mean computed above (means ignore
# NaNs by default, so they are well-defined for partially missing columns).
df = df.fillna(dfmean)
In [91]:
# Sanity-check the cleaned frame: 52 columns remain (51 sensors + machine_status),
# with no NaNs after the imputation above.
df.head()
Out[91]:
sensor_00 sensor_01 sensor_02 sensor_03 sensor_04 sensor_05 sensor_06 sensor_07 sensor_08 sensor_09 ... sensor_43 sensor_44 sensor_45 sensor_46 sensor_47 sensor_48 sensor_49 sensor_50 sensor_51 machine_status
0 2.465394 47.09201 53.2118 46.310760 634.3750 76.45975 13.41146 16.13136 15.56713 15.05353 ... 41.92708 39.641200 65.68287 50.92593 38.194440 157.9861 67.70834 243.0556 201.3889 0
1 2.465394 47.09201 53.2118 46.310760 634.3750 76.45975 13.41146 16.13136 15.56713 15.05353 ... 41.92708 39.641200 65.68287 50.92593 38.194440 157.9861 67.70834 243.0556 201.3889 0
2 2.444734 47.35243 53.2118 46.397570 638.8889 73.54598 13.32465 16.03733 15.61777 15.01013 ... 41.66666 39.351852 65.39352 51.21528 38.194443 155.9606 67.12963 241.3194 203.7037 0
3 2.460474 47.09201 53.1684 46.397568 628.1250 76.98898 13.31742 16.24711 15.69734 15.08247 ... 40.88541 39.062500 64.81481 51.21528 38.194440 155.9606 66.84028 240.4514 203.1250 0
4 2.445718 47.13541 53.2118 46.397568 636.4583 76.58897 13.35359 16.21094 15.69734 15.08247 ... 41.40625 38.773150 65.10416 51.79398 38.773150 158.2755 66.55093 242.1875 201.3889 0

5 rows × 52 columns

In [92]:
import sklearn as sk
from sklearn.linear_model import LogisticRegression
In [93]:
# Separate features from target: machine_status is the last column of df.
X, y = df.iloc[:, :-1], df.iloc[:, -1]
In [94]:
# Logistic-regression baseline.
# multi_class='ovr' was deprecated in scikit-learn 1.5 and removed in 1.7; for
# this binary target the default setting is mathematically equivalent, so it is
# dropped.
# NOTE(review): the score is computed on the TRAINING data, so it is optimistic —
# use a train/test split (see the note at the top of the notebook) for a real
# estimate, especially given the heavy class imbalance.
LR = LogisticRegression(random_state=0, solver='lbfgs').fit(X, y)
round(LR.score(X, y), 4)
Out[94]:
0.9959
In [95]:
# Predict on the last row; returns 0 (NORMAL), matching the true label shown
# in the y.iloc[-1] cell below.
LR.predict(X.iloc[-1:, :])
Out[95]:
array([0])
In [96]:
# Show the feature values of the last sample (row 220319) that was just predicted.
X.iloc[-1:, :]
Out[96]:
sensor_00 sensor_01 sensor_02 sensor_03 sensor_04 sensor_05 sensor_06 sensor_07 sensor_08 sensor_09 ... sensor_42 sensor_43 sensor_44 sensor_45 sensor_46 sensor_47 sensor_48 sensor_49 sensor_50 sensor_51
220319 2.396528 47.69965 50.520832 43.142361 639.8148 65.45634 15.11863 16.6522 15.65393 15.01013 ... 30.208332 41.40625 62.78935 46.2963 48.90046 40.21991 227.4306 150.463 183.04926 234.0856

1 rows × 51 columns

In [53]:
# True label of the last sample — 0 (NORMAL), agreeing with the prediction above.
y.iloc[-1]
Out[53]:
0
In [97]:
from sklearn import svm

# Linear support-vector classifier baseline.
# random_state=0 pins the stochastic dual solver so the result is reproducible,
# consistent with the other models in this notebook.
# (The original cell also called SVM.predict(...) mid-cell; its result was
# discarded — only a cell's last expression is displayed — so it is removed.)
# NOTE(review): training-set accuracy again — optimistic without a held-out split.
SVM = svm.LinearSVC(random_state=0)
SVM.fit(X, y)
round(SVM.score(X, y), 4)
Out[97]:
0.9965
In [98]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest baseline; max_depth=2 keeps the trees very shallow, trading
# accuracy for speed — worth tuning for a serious model.
# (The dead mid-cell RF.predict(...) call, whose result was discarded, is
# removed — the same prediction is displayed in a later cell.)
# NOTE(review): training-set accuracy — optimistic without a held-out split.
RF = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
RF.fit(X, y)
round(RF.score(X, y), 4)
Out[98]:
0.9946
In [56]:
# Predict the last sample with the random forest: 0 (NORMAL), in agreement
# with the true label.
RF.predict(X.iloc[-1:,:])
Out[56]:
array([0])
In [99]:
from sklearn.neural_network import MLPClassifier

# Small multilayer-perceptron baseline (hidden layers of 5 and 2 units).
# (The dead mid-cell NN.predict(...) call, whose result was discarded, is removed.)
# NOTE(review): MLPs are sensitive to feature scale — the raw sensor ranges vary
# widely (see df.head() above); consider a StandardScaler in a Pipeline.
# NOTE(review): training-set accuracy — optimistic without a held-out split.
NN = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
NN.fit(X, y)
round(NN.score(X, y), 4)
Out[99]:
0.9938