12. Model Selection, Grid Search (Hyperparameter Tuning), and Model Evaluation Metrics
1. Review
AdaBoost derivation: https://www.cnblogs.com/liuwu265/p/4692347.html
k-means: essentially a clustering algorithm, but it can also be used for dimensionality reduction
Model selection: cross-validation
2. Model selection
The basic approach is called k-fold cross-validation.
KFold: k-fold cross-validation
RepeatedKFold: k-fold cross-validation repeated several times
LeaveOneOut: hold out a single sample for validation and train on the rest
LeavePOut (LPO): hold out a specified number p of samples for validation
ShuffleSplit: shuffle the data, then split it
StratifiedKFold: splits that preserve the class proportions (plain KFold does not stratify by class)
Cross-validation, the different variants in code:
from sklearn.model_selection import KFold,cross_val_score
import numpy as np
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
X,y = datasets.load_iris(True)
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3,random_state = 1008)
KFold: k-fold cross-validation
kFold = KFold(n_splits=5,shuffle=False)  # 5 folds, taken in order (no shuffling)
for train,test in kFold.split(X_train,y_train):
    print('train indices:',train)
    print('validation indices:',test)
train indices: [ 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38
39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56
57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92
93 94 95 96 97 98 99 100 101 102 103 104]
validation indices: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20]
train indices: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
18 19 20 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56
57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92
93 94 95 96 97 98 99 100 101 102 103 104]
validation indices: [21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41]
train indices: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
36 37 38 39 40 41 63 64 65 66 67 68 69 70 71 72 73 74
75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92
93 94 95 96 97 98 99 100 101 102 103 104]
validation indices: [42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62]
train indices: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
54 55 56 57 58 59 60 61 62 84 85 86 87 88 89 90 91 92
93 94 95 96 97 98 99 100 101 102 103 104]
validation indices: [63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83]
train indices: [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
72 73 74 75 76 77 78 79 80 81 82 83]
validation indices: [ 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101
102 103 104]
import warnings
warnings.filterwarnings('ignore')
cross_val_score(LogisticRegression(),X_train,y_train,cv = KFold())
array([1. , 0.9047619 , 0.95238095, 1. , 0.9047619 ])
lr = LogisticRegression()
kFold = KFold(n_splits=5)
scores = []
for train,validation in kFold.split(X_train,y_train):
    lr.fit(X_train[train],y_train[train])  # select the rows belonging to this fold's train indices
    scores.append(np.round(lr.score(X_train[validation],y_train[validation]),8))
print(scores)
[1.0, 0.9047619, 0.95238095, 1.0, 0.9047619]
Repeated k-fold cross-validation
from sklearn.model_selection import RepeatedKFold
rKFold = RepeatedKFold(n_repeats=3)
for train,validation in rKFold.split(X_train,y_train):
    print('----------------')
----------------
(the separator is printed 15 times in total: 5 folds × 3 repeats)
lr = LogisticRegression()
rkFold = RepeatedKFold(n_splits=5,n_repeats=3)
scores = []
for train,validation in rkFold.split(X_train,y_train):
    lr.fit(X_train[train],y_train[train])  # select the rows belonging to this fold's train indices
    scores.append(np.round(lr.score(X_train[validation],y_train[validation]),8))
print(scores)
print(len(scores))
[1.0, 1.0, 0.95238095, 1.0, 0.85714286, 1.0, 1.0, 0.95238095, 0.9047619, 0.95238095, 0.9047619, 1.0, 0.95238095, 1.0, 0.95238095]
15
Leave-one-out
from sklearn.model_selection import LeaveOneOut
lo = LeaveOneOut()
i = 1
for train,test in lo.split(X_train,y_train):
    print(i)
    i += 1
1
2
3
...
105
(105 splits in total: one per training sample, as expected for leave-one-out)
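LeavePOut was listed in the overview above but has no example of its own. A minimal sketch (my addition, reusing the X_train defined earlier; p=2 is an arbitrary choice here):
from sklearn.model_selection import LeavePOut
lpo = LeavePOut(p=2)              # every combination of 2 samples is held out once for validation
print(lpo.get_n_splits(X_train))  # C(105, 2) = 5460 splits, so LPO gets expensive very quickly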
Shuffled cross-validation: ShuffleSplit
from sklearn.model_selection import ShuffleSplit
ss = ShuffleSplit(n_splits=5,test_size = 0.2)
ss.split(X_train,y_train)
from sklearn.neighbors import KNeighborsClassifier
cross_val_score(estimator=KNeighborsClassifier(),X = X_train,y = y_train,cv = ss)
array([0.9047619 , 1. , 0.9047619 , 0.95238095, 1. ])
cross_val_score(estimator=KNeighborsClassifier(),X = X_train,y = y_train,cv = RepeatedKFold())
array([0.95238095, 0.95238095, 0.80952381, 0.9047619 , 0.95238095,
1. , 0.95238095, 0.95238095, 0.85714286, 1. ,
0.95238095, 1. , 0.9047619 , 0.95238095, 0.80952381,
0.95238095, 0.95238095, 0.95238095, 1. , 0.9047619 ,
0.9047619 , 1. , 0.9047619 , 0.95238095, 0.95238095,
1. , 0.95238095, 1. , 0.9047619 , 0.95238095,
0.95238095, 0.95238095, 0.95238095, 1. , 0.95238095,
0.95238095, 0.9047619 , 0.95238095, 0.95238095, 0.85714286,
0.9047619 , 1. , 0.95238095, 0.95238095, 0.95238095,
0.9047619 , 0.95238095, 1. , 0.95238095, 0.95238095])
Stratified cross-validation
# stratified k-fold cross-validation
from sklearn.model_selection import StratifiedKFold
X = np.random.randint(0,10,size = (8,2))
y = np.array([0,1,0,1,1,1,0,0])
display(X,y)
array([[7, 2],
[3, 0],
[3, 1],
[8, 6],
[3, 1],
[7, 5],
[5, 6],
[9, 6]])
array([0, 1, 0, 1, 1, 1, 0, 0])
kFold = KFold(n_splits=4,shuffle=True)
for train,test in kFold.split(X,y):
    print(y[train],y[test])
[0 0 1 1 0 0] [1 1]
[0 1 0 1 1 0] [1 0]
[0 1 1 1 0 0] [0 1]
[1 0 1 1 1 0] [0 0]
sKFold = StratifiedKFold(n_splits=4,shuffle=True)
for train,test in sKFold.split(X,y):
    print(y[train],y[test])
[0 1 1 1 0 0] [0 1]
[0 1 1 1 0 0] [0 1]
[0 1 0 1 1 0] [1 0]
[0 1 0 1 1 0] [1 0]
sKFold = StratifiedKFold(n_splits=4,shuffle=True)
for train,test in sKFold.split(X,y):
    print(y[train],y[test])
[1 0 1 1 0 0] [0 1]
[0 1 0 1 1 0] [1 0]
[0 1 1 1 0 0] [1 0]
[0 1 0 1 1 0] [1 0]
lr = LogisticRegression()
skFold = StratifiedKFold(n_splits=5)
scores = []
for train,validation in skFold.split(X_train,y_train):
    lr.fit(X_train[train],y_train[train])  # select the rows belonging to this fold's train indices
    scores.append(np.round(lr.score(X_train[validation],y_train[validation]),8))
print(scores)
print(len(scores))
[1.0, 0.85714286, 1.0, 1.0, 0.95238095]
5
3. Grid search: hyperparameter tuning
(1) GridSearchCV (tunes a model's hyperparameters) runs cross-validation internally
sklearn.metrics contains the evaluation metrics; accuracy_score, for instance, computes the accuracy.
LogisticRegression has a penalty parameter whose value is either 'l1' or 'l2'. Not every solver supports 'l1' (only liblinear and saga do), which is why some grid combinations below fail and the warnings are silenced.
* Grid search: pick the best hyperparameters for a model
* "Grid" means every combination of the candidate parameter values is tried, and the best one is kept
* All possible combinations are evaluated (see the sketch below)
[Figure: grid of parameter combinations]
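As a sketch of what "all combinations" means (the itertools enumeration is my illustration, not part of the original notes; the grid is the same one used in the code below):
from itertools import product

hp = {'penalty': ['l2', 'l1'],
      'tol': [0.001, 0.005, 0.0001, 0.00001],
      'C': [0.1, 0.5, 1.0, 5, 10, 1000],
      'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}
combos = list(product(*hp.values()))
print(len(combos))  # 2 * 4 * 6 * 5 = 240 candidate parameter settings
# GridSearchCV cross-validates the model once for each of these settings and keeps the best one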
GridSearchCV code:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from sklearn.linear_model import LogisticRegression
# GridSearchCV performs cross-validation internally
from sklearn.model_selection import GridSearchCV
from sklearn import datasets
from sklearn.metrics import accuracy_score  # accuracy
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
X,y = datasets.load_wine(True)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2)
%%time
# penalty: the regularization term; it keeps the coefficients from growing too large (large coefficients tend to overfit)
# tol: tolerance, the precision used as the stopping criterion
# C: inverse regularization strength (larger C = weaker regularization), the counterpart of alpha in ridge regression (where larger alpha = stronger regularization)
# number of parameter combinations: 2 * 4 * 6 * 5 = 240
hp = {'penalty':['l2','l1'],'tol':[0.001,0.005,0.0001,0.00001],'C':[0.1,0.5,1.0,5,10,1000],'solver':['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}
lr = LogisticRegression()
# try the parameter combinations to find the best one
clf = GridSearchCV(lr,param_grid=hp,scoring='accuracy',cv = StratifiedKFold(n_splits=5,shuffle=True))
clf.fit(X_train,y_train)
Wall time: 25.2 s
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=None, shuffle=True),
error_score=nan,
estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
fit_intercept=True,
intercept_scaling=1, l1_ratio=None,
max_iter=100, multi_class='auto',
n_jobs=None, penalty='l2',
random_state=None, solver='lbfgs',
tol=0.0001, verbose=0,
warm_start=False),
iid='deprecated', n_jobs=None,
param_grid={'C': [0.1, 0.5, 1.0, 5, 10, 1000],
'penalty': ['l2', 'l1'],
'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag',
'saga'],
'tol': [0.001, 0.005, 0.0001, 1e-05]},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring='accuracy', verbose=0)
# best parameter combination found
clf.best_params_
{'C': 0.5, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 0.001}
clf.best_score_
0.9721674876847292
# estimator == model == algorithm
# in machine learning and data mining, every algorithm is an estimator
clf.best_estimator_
LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='auto', n_jobs=None, penalty='l1',
random_state=None, solver='liblinear', tol=0.001, verbose=0,
warm_start=False)
Prediction can be done directly with the fitted GridSearchCV (it uses the refitted best estimator):
clf.predict(X_test)
array([2, 1, 2, 1, 1, 1, 1, 2, 2, 1, 0, 2, 2, 1, 1, 0, 1, 1, 2, 0, 1, 1,
0, 1, 1, 2, 0, 2, 0, 0, 0, 1, 0, 2, 2, 1])
# the score on the held-out test data is a bit lower than the CV score, which indicates some degree of overfitting
clf.score(X_test,y_test)
0.9166666666666666
# with the default parameters the test accuracy is a bit lower
lr = LogisticRegression()
lr.fit(X_train,y_train)
lr.score(X_test,y_test)
0.8888888888888888
(2) RandomizedSearchCV
Parameters being tuned here: the regularization strength C and the type of penalty (regularization).
The penalty shrinks the coefficients; it is the regularization term and protects against overfitting.
GridSearchCV evaluates a fixed set of candidate values; RandomizedSearchCV instead samples candidate values at random (for example from a distribution) and keeps the best setting it finds.
The number of sampled candidates is controlled by n_iter.
[Figure: sampling the C coefficient]
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
iris = load_iris()
logistic = LogisticRegression(solver='saga', tol=1e-2, max_iter=200,
random_state=0)
distributions = dict(C=uniform(loc=0, scale=4),
penalty=['l2', 'l1'])
clf = RandomizedSearchCV(logistic, distributions, random_state=0)
search = clf.fit(iris.data, iris.target)
search.best_params_  # best parameters found by the random search
RandomizedSearchCV code:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from scipy.stats import uniform
# this does not generate concrete numbers yet
# it is a uniform distribution over [0, 4] that will be sampled from
u = uniform(loc = 0,scale=4)
u
X,y = datasets.load_wine(True)
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2)
# solver: the optimization algorithm (gradient descent, stochastic gradient descent, and other variants)
lr = LogisticRegression(tol = 1e-4,max_iter=200)
# parameters being tuned: C and the type of penalty (regularization)
hp = dict(C = uniform(loc = 0,scale = 4),penalty = ['l1','l2'],
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'])
# candidates are sampled at random, unlike GridSearchCV where the candidate values are fixed
clf = RandomizedSearchCV(lr,hp,n_iter=100)  # n_iter = number of randomly sampled parameter settings to try
clf.fit(X_train,y_train)
display(clf.best_params_,clf.best_score_,clf.score(X_test,y_test))
{'C': 3.0556422672340506, 'penalty': 'l2', 'solver': 'newton-cg'}
0.9719211822660098
0.9444444444444444
(3) Tuning regression hyperparameters
Used in the same way as for classification.
import numpy as np
from sklearn.linear_model import Ridge
from sklearn import datasets
from sklearn.model_selection import train_test_split,GridSearchCV
X,y = datasets.load_boston(True)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2)
%%time
hp = dict(alpha = np.arange(0.1,5,0.2),
solver = ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'])
ridge = Ridge()
clf = GridSearchCV(estimator=ridge,param_grid=hp,scoring='r2')
clf.fit(X_train,y_train)
Wall time: 12.2 s
GridSearchCV(cv=None, error_score=nan,
estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
max_iter=None, normalize=False, random_state=None,
solver='auto', tol=0.001),
iid='deprecated', n_jobs=None,
param_grid={'alpha': array([0.1, 0.3, 0.5, 0.7, 0.9, 1.1, 1.3, 1.5, 1.7, 1.9, 2.1, 2.3, 2.5,
2.7, 2.9, 3.1, 3.3, 3.5, 3.7, 3.9, 4.1, 4.3, 4.5, 4.7, 4.9]),
'solver': ['auto', 'svd', 'cholesky', 'lsqr',
'sparse_cg', 'sag', 'saga']},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring='r2', verbose=0)
clf.best_score_
0.7175459507572871
clf.best_params_
{'alpha': 0.1, 'solver': 'auto'}
clf.best_estimator_
Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,
normalize=False, random_state=None, solver='auto', tol=0.001)
ridge = Ridge(alpha = 0.1)
ridge.fit(X_train,y_train)
ridge.score(X_test,y_test)
0.7150363841784935
from sklearn.metrics import r2_score
y_ = ridge.predict(X_test)
r2_score(y_test,y_)
0.7150363841784935
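# manual check: R² = 1 - SS_res / SS_tot, matching r2_score above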
1 - ((y_test - y_)**2).sum()/(((y_test - y_test.mean())**2).sum())
0.7150363841784935
4. SAG / SGD algorithms
[Figure: SAG]
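The original notes only include a figure here, so as a rough summary (my wording, not the author's): SGD updates the weights using the gradient of one sample (or a small batch) at a time, while SAG (stochastic average gradient) keeps the last gradient computed for each sample and steps in the direction of their average; solver='sag' in LogisticRegression and Ridge uses the latter. A minimal sketch of plain SGD with scikit-learn's SGDClassifier, assuming the older sklearn version used in these notes where the logistic loss is spelled loss='log' (newer versions call it 'log_loss'):
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn import datasets
from sklearn.model_selection import train_test_split

X, y = datasets.load_wine(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# SGD is sensitive to feature scale, hence the StandardScaler in front
model = make_pipeline(StandardScaler(),
                      SGDClassifier(loss='log', max_iter=1000, tol=1e-3))
model.fit(X_train, y_train)
print(model.score(X_test, y_test))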
5. Model evaluation
Model evaluation compares the predicted values against the true values.
[Figure: model evaluation]
[Figure: three evaluation criteria]
In some situations, accuracy alone is not enough to judge how good a model is.
Example: a model that screens for cancer, where only a small fraction of the population is actually ill. A model that simply labels everyone as healthy still reaches roughly 99% accuracy, yet it is useless.
In most such problems, what we really want the model to find is the minority class.
Bank lending: most applicants have good credit, so the algorithm's real job is to find the few with bad credit. A tiny sketch below makes the accuracy problem concrete.
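A tiny illustration of this point (the 1%-positive numbers below are invented purely for the example):
import numpy as np
from sklearn.metrics import accuracy_score, recall_score

y_true = np.array([1]*10 + [0]*990)    # 1% of 1000 people are actually ill
y_pred = np.zeros(1000, dtype=int)     # a "model" that labels everyone healthy
print(accuracy_score(y_true, y_pred))  # 0.99 -- looks excellent
print(recall_score(y_true, y_pred))    # 0.0  -- it finds none of the ill people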
Hence the following ways of evaluating a model:
1. Evaluation metrics for classification:
(1) Binary classification:
For a binary problem, each prediction falls into one of four cells:
TP (true positive): actually positive, predicted positive
FN (false negative): actually positive, predicted negative
FP (false positive): actually negative, predicted positive
TN (true negative): actually negative, predicted negative
Accuracy:
accuracy = (TP + TN) / (TP + TN + FP + FN)
Precision:
precision = TP / (TP + FP)
By default, precision_score treats the task as a binary problem.
Recall:
recall = TP / (TP + FN)
The higher the recall, the better the model is at finding the positive samples.
When the classes are imbalanced, recall is usually more informative than accuracy.
F-measure:
F1 = 2 * precision * recall / (precision + recall)
The F1 score is the harmonic mean of precision and recall.
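A quick check of that relationship with sklearn (the toy labels are invented for the illustration; class 1 is the positive class by default):
from sklearn.metrics import precision_score, recall_score, f1_score

y_true = [1, 1, 1, 1, 2, 2]
y_pred = [1, 1, 2, 2, 2, 2]
p = precision_score(y_true, y_pred)           # TP=2, FP=0 -> 1.0
r = recall_score(y_true, y_pred)              # TP=2, FN=2 -> 0.5
print(f1_score(y_true, y_pred), 2*p*r/(p+r))  # both print 0.666...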
Binary-classification code: see the combined code block further below (it is grouped together with the multi-class code).
(2) Multi-class problems (model evaluation)
Precision and recall can still be used.
Confusion matrix
Calling precision_score on a multi-class problem with the default settings raises an error: the default average='binary' only applies to two-class problems.
Precision can be computed in two ways (micro and macro averaging):
macro: compute the metric for each class separately, then average the per-class values.
micro: pool all classes together; for precision and recall this amounts to the total number of correct predictions divided by the total number of samples.
Recall can likewise be computed with micro or macro averaging:
Of course, multi-class precision and recall can also be computed by hand; see the code below.
Code for the binary and multi-class examples:
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import recall_score,precision_score,accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
from sklearn.metrics import confusion_matrix
import pandas as pd
Binary classification: computing precision and recall
X,y = datasets.load_iris(True)
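# drop class 0 so that only classes 1 and 2 remain (a binary problem)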
X2 = X[y!=0]
y2 = y[y!=0]
y2
X_train,X_test,y_train,y_test = train_test_split(X2,y2,test_size = 0.6)
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train,y_train)
y_pred = knn.predict(X_test)
y_pred
array([1, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 2, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 1, 2, 1, 2, 2, 1,
1, 2, 2, 1, 2, 1, 2, 2, 2, 2, 1, 1, 2, 1, 1, 1])
confusion_matrix(y_test,y_pred)
array([[27, 0],
[10, 23]], dtype=int64)
# recall
recall_score(y_test,y_pred)
1.0
# manual check from the confusion matrix: TP / (TP + FN) for class 1
27/(27 + 0)
1.0
# precision
precision_score(y_test,y_pred)
0.7297297297297297
# manual check: TP / (TP + FP) for class 1
27/(27 + 10)
0.7297297297297297
# accuracy
accuracy_score(y_test,y_pred)
0.8333333333333334
# manual check: correct predictions / total samples
(27 + 23)/60
0.8333333333333334
Multi-class classification: computing precision and recall
X,y = datasets.load_wine(True)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.6)
lr = LogisticRegression(C = 0.01)
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)
# confusion matrix
confusion_matrix(y_test,y_pred)
array([[27, 4, 1],
[ 2, 41, 1],
[ 1, 1, 29]], dtype=int64)
pd.crosstab(index = y_test,columns=y_pred,margins=True,rownames=['actual'],colnames=['predicted'])
Multi-class precision
accuracy_score(y_test,y_pred)
0.9065420560747663
# multi-class precision
# by default precision_score assumes a binary problem, so an average strategy must be specified
precision_score(y_test,y_pred,average='micro')
0.9065420560747663
# micro average: total number of correct predictions divided by the total number of samples
(27+41+29)/107
0.9065420560747663
precision_score(y_test,y_pred,average='macro')
0.9089294062646096
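# macro average: mean of the per-class precisions (one per column of the confusion matrix)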
(27/30 + 41/46 + 29/31)/3
0.9089294062646096
Multi-class recall
recall_score(y_test,y_pred,average='micro')
0.9065420560747663
(27+41+29)/107
0.9065420560747663
recall_score(y_test,y_pred,average='macro')
0.9036840175953079
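# macro average: mean of the per-class recalls (one per row of the confusion matrix)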
(27/32+41/44+29/31)/3
0.9036840175953079