12. Model Selection, Grid Search (Parameter Tuning), and Model Evaluation Metrics

I. Review

* AdaBoost derivation: https://www.cnblogs.com/liuwu265/p/4692347.html
* k-means: essentially a clustering algorithm, but it can also be used for dimensionality reduction.
* Model selection: cross-validation.

II. Model Selection

The basic approach is called k-fold cross-validation. (figures: k-fold split diagrams)

* KFold: k-fold cross-validation.
* RepeatedKFold: k-fold cross-validation repeated several times.
* LeaveOneOut: hold out a single sample for validation and train on the rest.
* LeavePOut (LPO): specify how many samples to hold out for validation (a short sketch appears after the ShuffleSplit example below).
* ShuffleSplit: shuffle the data, then split.
* StratifiedKFold: splits that preserve the class proportions (plain KFold does not split by class ratio; this one does).

Cross-validation, code for the different strategies:

from sklearn.model_selection import KFold, cross_val_score
import numpy as np
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

X, y = datasets.load_iris(True)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1008)

KFold (k-fold cross-validation)

kFold = KFold(n_splits=5, shuffle=False)   # split into 5 folds; shuffle=False keeps the original order
for train, test in kFold.split(X_train, y_train):
    print('training indices:', train)      # indices into X_train
    print('validation indices:', test)     # indices into X_train

# Output (abridged): each of the 5 iterations prints 84 training indices and 21 validation indices;
# the validation indices rotate through 0-20, 21-41, 42-62, 63-83 and 84-104 in order.

import warnings
warnings.filterwarnings('ignore')

cross_val_score(LogisticRegression(), X_train, y_train, cv=KFold())
# array([1.        , 0.9047619 , 0.95238095, 1.        , 0.9047619 ])

The same scores, reproduced with an explicit loop over the folds:

lr = LogisticRegression()
kFold = KFold(n_splits=5)
scores = []
for train, validation in kFold.split(X_train, y_train):
    lr.fit(X_train[train], y_train[train])   # use the fold's train indices to slice the training data
    scores.append(np.round(lr.score(X_train[validation], y_train[validation]), 8))
print(scores)
# [1.0, 0.9047619, 0.95238095, 1.0, 0.9047619]
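Not part of the original notes: DecisionTreeClassifier is imported above but never used. As a minimal sketch, cross_val_score accepts any estimator, so the tree can be scored with the same call (reusing X_train and y_train from above):

# hedged sketch, not from the original notebook
tree_scores = cross_val_score(DecisionTreeClassifier(), X_train, y_train, cv=KFold(n_splits=5))
print(tree_scores, tree_scores.mean())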
RepeatedKFold (repeated k-fold cross-validation)

from sklearn.model_selection import RepeatedKFold

rKFold = RepeatedKFold(n_repeats=3)
for train, validation in rKFold.split(X_train, y_train):
    print('----------------')
# prints the separator 15 times: the default 5 folds, repeated 3 times

lr = LogisticRegression()
rkFold = RepeatedKFold(n_splits=5, n_repeats=3)
scores = []
for train, validation in rkFold.split(X_train, y_train):
    lr.fit(X_train[train], y_train[train])   # use the fold's train indices to slice the training data
    scores.append(np.round(lr.score(X_train[validation], y_train[validation]), 8))
print(scores)
print(len(scores))
# [1.0, 1.0, 0.95238095, 1.0, 0.85714286, 1.0, 1.0, 0.95238095, 0.9047619, 0.95238095,
#  0.9047619, 1.0, 0.95238095, 1.0, 0.95238095]
# 15

LeaveOneOut (leave-one-out cross-validation)

from sklearn.model_selection import LeaveOneOut

lo = LeaveOneOut()
i = 1
for train, test in lo.split(X_train, y_train):
    print(i)
    i += 1
# counts from 1 to 105: one split per sample in X_train

ShuffleSplit (cross-validation on shuffled data)

from sklearn.model_selection import ShuffleSplit
from sklearn.neighbors import KNeighborsClassifier

ss = ShuffleSplit(n_splits=5, test_size=0.2)
ss.split(X_train, y_train)

cross_val_score(estimator=KNeighborsClassifier(), X=X_train, y=y_train, cv=ss)
# array([0.9047619 , 1.        , 0.9047619 , 0.95238095, 1.        ])

cross_val_score(estimator=KNeighborsClassifier(), X=X_train, y=y_train, cv=RepeatedKFold())
# returns 50 scores (the default 5 splits repeated 10 times), ranging from about 0.81 to 1.0
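The overview above also lists leave-p-out (LeavePOut), which the original code never demonstrates. A minimal sketch, reusing X_train from above; because every combination of p samples becomes a validation set, the number of splits grows very quickly:

from sklearn.model_selection import LeavePOut

lpo = LeavePOut(p=2)               # every pair of samples becomes a validation set
print(lpo.get_n_splits(X_train))   # C(105, 2) = 5460 splits for the 105 training samples
# this cost is why LeavePOut is only practical on very small datasets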
StratifiedKFold (stratified k-fold cross-validation)

from sklearn.model_selection import StratifiedKFold

X = np.random.randint(0, 10, size=(8, 2))
y = np.array([0, 1, 0, 1, 1, 1, 0, 0])
display(X, y)
# array([[7, 2],
#        [3, 0],
#        [3, 1],
#        [8, 6],
#        [3, 1],
#        [7, 5],
#        [5, 6],
#        [9, 6]])
# array([0, 1, 0, 1, 1, 1, 0, 0])

kFold = KFold(n_splits=4, shuffle=True)
for train, test in kFold.split(X, y):
    print(y[train], y[test])
# [0 0 1 1 0 0] [1 1]
# [0 1 0 1 1 0] [1 0]
# [0 1 1 1 0 0] [0 1]
# [1 0 1 1 1 0] [0 0]
# plain KFold ignores the labels, so a validation fold can contain only one class (e.g. [1 1] or [0 0])

sKFold = StratifiedKFold(n_splits=4, shuffle=True)
for train, test in sKFold.split(X, y):
    print(y[train], y[test])
# [0 1 1 1 0 0] [0 1]
# [0 1 1 1 0 0] [0 1]
# [0 1 0 1 1 0] [1 0]
# [0 1 0 1 1 0] [1 0]
# every validation fold keeps the class proportions: one sample of each class

lr = LogisticRegression()
skFold = StratifiedKFold(n_splits=5)
scores = []
for train, validation in skFold.split(X_train, y_train):
    lr.fit(X_train[train], y_train[train])   # use the fold's train indices to slice the training data
    scores.append(np.round(lr.score(X_train[validation], y_train[validation]), 8))
print(scores)
print(len(scores))
# [1.0, 0.85714286, 1.0, 1.0, 0.95238095]
# 5

III. Grid Search (Parameter Tuning)

(1) GridSearchCV

GridSearchCV tunes the model's parameters and performs cross-validation internally. sklearn.metrics holds the evaluation metrics; accuracy_score there computes the accuracy. LogisticRegression has a penalty parameter (the regularization term), whose value is either 'l1' or 'l2'.

* Grid search selects the model's optimal parameters.
* The "grid" is every combination of the candidate parameter values; the best combination is kept.
* All possible combinations are evaluated.

(figure: grid of parameter combinations)

GridSearchCV code:

import warnings
warnings.filterwarnings('ignore')
import numpy as np
from sklearn.linear_model import LogisticRegression
# GridSearchCV performs cross-validation internally!
from sklearn.model_selection import GridSearchCV
from sklearn import datasets
from sklearn.metrics import accuracy_score   # accuracy
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

X, y = datasets.load_wine(True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

%%time
# penalty: the regularization term; it keeps the coefficients from growing too large (large coefficients overfit)
# tol: tolerance of the stopping criterion, i.e. the required precision
# C: coefficient on the penalty term (larger C means weaker regularization), the counterpart of
#    ridge regression's alpha (where larger alpha means stronger regularization)
# number of parameter combinations: 2 * 4 * 6 * 5 = 240
hp = {'penalty': ['l2', 'l1'],
      'tol': [0.001, 0.005, 0.0001, 0.00001],
      'C': [0.1, 0.5, 1.0, 5, 10, 1000],
      'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}
lr = LogisticRegression()
# every combination is tried in order to find the best one
clf = GridSearchCV(lr, param_grid=hp, scoring='accuracy', cv=StratifiedKFold(n_splits=5, shuffle=True))
clf.fit(X_train, y_train)
# Wall time: 25.2 s
# (the fitted GridSearchCV repr is printed: estimator=LogisticRegression(), the param_grid above,
#  scoring='accuracy', cv=StratifiedKFold(n_splits=5, shuffle=True), refit=True)

# check the best parameter combination
clf.best_params_
# {'C': 0.5, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 0.001}

clf.best_score_
# 0.9721674876847292

# estimator == model == algorithm
# in machine learning and data mining, every algorithm is an estimator!
clf.best_estimator_
# LogisticRegression(C=0.5, penalty='l1', solver='liblinear', tol=0.001, max_iter=100, ...)

Predicting directly with the fitted GridSearchCV:

clf.predict(X_test)
# array([2, 1, 2, 1, 1, 1, 1, 2, 2, 1, 0, 2, 2, 1, 1, 0, 1, 1, 2, 0, 1, 1, 0, 1, 1, 2, 0, 2, 0, 0, 0, 1, 0, 2, 2, 1])

# the score on the real test data is somewhat lower, which indicates a degree of overfitting
clf.score(X_test, y_test)
# 0.9166666666666666

# with the default parameters the accuracy is slightly lower still
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr.score(X_test, y_test)
# 0.8888888888888888
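Not part of the original notes: besides best_params_ and best_score_, the fitted GridSearchCV object exposes cv_results_, which records the cross-validated score of every parameter combination. A minimal sketch for inspecting it, assuming clf is the fitted GridSearchCV from the wine example above:

import pandas as pd

results = pd.DataFrame(clf.cv_results_)                       # one row per parameter combination
cols = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
print(results[cols].sort_values('rank_test_score').head())    # best-ranked combinations first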
(2) RandomizedSearchCV

Parameters to optimize: the penalty strength and the type of regularization. The penalty term shrinks the coefficients; it is the regularization term and it prevents overfitting. GridSearchCV searches a fixed set of candidate values, whereas RandomizedSearchCV draws candidate values at random from given distributions and keeps the best combination it finds. The number of sampled candidates is controlled by n_iter.

(figure: choosing the coefficients)

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

iris = load_iris()
logistic = LogisticRegression(solver='saga', tol=1e-2, max_iter=200, random_state=0)
distributions = dict(C=uniform(loc=0, scale=4), penalty=['l2', 'l1'])
clf = RandomizedSearchCV(logistic, distributions, random_state=0)
search = clf.fit(iris.data, iris.target)
search.best_params_   # inspect the best parameters found

RandomizedSearchCV code:

import warnings
warnings.filterwarnings('ignore')
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from scipy.stats import uniform

# no concrete values have been generated yet:
# u only describes a uniform distribution over [0, 4]
u = uniform(loc=0, scale=4)
u

X, y = datasets.load_wine(True)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# solver: the optimizer, e.g. gradient descent, stochastic gradient descent, ...
lr = LogisticRegression(tol=1e-4, max_iter=200)
# parameters to optimize: the penalty strength and the type of regularization
hp = dict(C=uniform(loc=0, scale=4),
          penalty=['l1', 'l2'],
          solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'])
# unlike GridSearchCV (fixed candidates), the values of C are drawn at random and the best ones are kept
clf = RandomizedSearchCV(lr, hp, n_iter=100)
clf.fit(X_train, y_train)
display(clf.best_params_, clf.best_score_, clf.score(X_test, y_test))
# {'C': 3.0556422672340506, 'penalty': 'l2', 'solver': 'newton-cg'}
# 0.9719211822660098
# 0.9444444444444444

(3) Tuning a regression model

Used in the same way as for classification.

import numpy as np
from sklearn.linear_model import Ridge
from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV

X, y = datasets.load_boston(True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

%%time
hp = dict(alpha=np.arange(0.1, 5, 0.2),
          solver=['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'])
ridge = Ridge()
clf = GridSearchCV(estimator=ridge, param_grid=hp, scoring='r2')
clf.fit(X_train, y_train)
# Wall time: 12.2 s
# (the fitted GridSearchCV repr is printed: estimator=Ridge(), the 25 alpha values from 0.1 to 4.9,
#  the 7 solvers, scoring='r2', refit=True)

clf.best_score_
# 0.7175459507572871

clf.best_params_
# {'alpha': 0.1, 'solver': 'auto'}

clf.best_estimator_
# Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
#       random_state=None, solver='auto', tol=0.001)

ridge = Ridge(alpha=0.1)
ridge.fit(X_train, y_train)
ridge.score(X_test, y_test)
# 0.7150363841784935

from sklearn.metrics import r2_score
y_ = ridge.predict(X_test)
r2_score(y_test, y_)
# 0.7150363841784935

# the same R^2 computed by hand
1 - ((y_test - y_)**2).sum() / ((y_test - y_test.mean())**2).sum()
# 0.7150363841784935
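Not part of the original notes: RandomizedSearchCV works for regressors in exactly the same way. A minimal sketch for the Ridge model above, sampling alpha from a continuous distribution instead of the fixed grid (reusing the Boston X_train and y_train):

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# 20 candidate alpha values drawn from [0.1, 5.1) instead of a fixed grid
reg = RandomizedSearchCV(Ridge(), {'alpha': uniform(loc=0.1, scale=5)}, n_iter=20, scoring='r2')
reg.fit(X_train, y_train)
print(reg.best_params_, reg.best_score_)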
IV. SAG / SGD Algorithms

(figure: SAG)

V. Model Evaluation

Model evaluation compares the predicted values against the true values. (figures: model evaluation; the three evaluation criteria)

In some situations accuracy alone is not enough to judge a model. Example: a model that diagnoses cancer, where only a small fraction of the population is actually ill. If the model simply reports 99% of people as healthy it can still be useless, because its real job is to find the sick minority. In most such problems the model is there to find the rare cases. Bank loans: most applicants have good credit, so the algorithm is mainly there to find the people whose credit is bad.

Hence the following evaluation metrics.

1. Evaluation metrics for classification

(1) Binary classification

(figures: how each metric is computed from the predicted vs. true classes)

Accuracy: the fraction of all samples predicted correctly, accuracy = (TP + TN) / (TP + TN + FP + FN).

Precision: precision = TP / (TP + FP). By default, precision is computed for the binary case.

Recall: recall = TP / (TP + FN). The higher the recall, the stronger the model's ability to find the positive samples. When the classes are imbalanced, recall is the more useful metric.

F-measure: F1 = 2 * precision * recall / (precision + recall), the harmonic mean of precision and recall.

Binary classification code: see the combined code below.

(2) Multiclass classification

Precision and recall are still used, based on the confusion matrix. (figures: confusion matrix)

Calling the precision function in the binary way on a multiclass problem raises an error: the default only applies to binary problems. Precision can therefore be averaged in two ways, 'micro' and 'macro'; the macro average computes the metric for each class separately and then averages the per-class values. Recall can be averaged in the same two ways. Both can of course also be computed by hand for the multiclass case, as the code below shows.

Code (binary and multiclass):

import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import recall_score, precision_score, accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
from sklearn.metrics import confusion_matrix
import pandas as pd

Binary classification, precision and recall:

X, y = datasets.load_iris(True)
X2 = X[y != 0]   # drop class 0 to turn iris into a binary problem (classes 1 and 2)
y2 = y[y != 0]
y2

X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size=0.6)
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
y_pred
# array of 60 predictions, each either 1 or 2

confusion_matrix(y_test, y_pred)
# array([[27,  0],
#        [10, 23]], dtype=int64)

# recall (class 1 is treated as the positive class, pos_label=1 by default)
recall_score(y_test, y_pred)
# 1.0
27 / (27 + 0)
# 1.0

# precision
precision_score(y_test, y_pred)
# 0.7297297297297297
27 / (27 + 10)
# 0.7297297297297297

accuracy_score(y_test, y_pred)
# 0.8333333333333334
(27 + 23) / 60
# 0.8333333333333334

Multiclass, precision and recall:

X, y = datasets.load_wine(True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.6)
lr = LogisticRegression(C=0.01)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# confusion matrix
confusion_matrix(y_test, y_pred)
# array([[27,  4,  1],
#        [ 2, 41,  1],
#        [ 1,  1, 29]], dtype=int64)

pd.crosstab(index=y_test, columns=y_pred, margins=True, rownames=['true'], colnames=['predicted'])

Multiclass precision:

accuracy_score(y_test, y_pred)
# 0.9065420560747663

# multiclass precision
# by default precision is computed for the binary case, so an average has to be chosen
precision_score(y_test, y_pred, average='micro')
# 0.9065420560747663
# micro: all correctly classified samples divided by the total number of samples
(27 + 41 + 29) / 107
# 0.9065420560747663

precision_score(y_test, y_pred, average='macro')
# 0.9089294062646096
# macro: precision of each class (column-wise), then averaged
(27/30 + 41/46 + 29/31) / 3
# 0.9089294062646096

Multiclass recall:

recall_score(y_test, y_pred, average='micro')
# 0.9065420560747663
(27 + 41 + 29) / 107
# 0.9065420560747663

recall_score(y_test, y_pred, average='macro')
# 0.9036840175953079
# macro: recall of each class (row-wise), then averaged
(27/32 + 41/44 + 29/31) / 3
# 0.9036840175953079
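Not shown in the original notes: the F-measure described above can be computed with sklearn.metrics.f1_score. A short sketch using the multiclass wine predictions (y_test, y_pred) from the code above:

from sklearn.metrics import f1_score

# micro-averaged F1 equals accuracy on a single-label multiclass problem
print(f1_score(y_test, y_pred, average='micro'))
# macro-averaged F1 is the unweighted mean of the per-class F1 scores
print(f1_score(y_test, y_pred, average='macro'))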