使用 sklearn 的 Pipeline 搭建机器学习的流程。还可以尝试别的模型。以上只是粗略的大体框架,还有很多细节,请大家多指教! 我的 CSDN 博客地址:https://michael.blog.csdn.net/ 。长按或扫码关注我的公众号(Michael阿明),一起加油、一起学习进步!
本文例子为 [Kesci] 新人赛 · 员工满意度预测
参考:[Hands On ML] 2. 一个完整的机器学习项目(加州房价预测)

1. 导入工具包
import numpy as np import pandas as pd %matplotlib inline import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split from sklearn.model_selection import StratifiedShuffleSplit from sklearn.impute import SimpleImputer from sklearn.preprocessing import LabelEncoder from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import LabelBinarizer from sklearn.base import BaseEstimator, TransformerMixin from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler from sklearn.pipeline import FeatureUnion from sklearn.model_selection import GridSearchCV from sklearn.model_selection import cross_val_score
2. 读取数据
# Load the labelled training data and the test data of the
# employee-satisfaction competition from their CSV files.
data = pd.read_csv(
    "../competition/Employee_Satisfaction/train.csv"
)
test = pd.read_csv(
    "../competition/Employee_Satisfaction/test.csv"
)

# Notebook display: show the column names of the training frame.
data.columns
Index(['id', 'last_evaluation', 'number_project', 'average_monthly_hours', 'time_spend_company', 'Work_accident', 'package', 'promotion_last_5years', 'division', 'salary', 'satisfaction_level'], dtype='object')
# Separate the feature matrix from the regression target: every column
# except 'satisfaction_level' is a feature, 'satisfaction_level' is the label.
X = data.drop(columns=['satisfaction_level'])
y = data['satisfaction_level']
3. 数字特征、文字特征分离
def num_cat_splitor(X):
    """Split the column names of a DataFrame into numeric and text groups.

    A column is treated as text when its dtype is ``object``; every other
    column is treated as numeric.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose columns should be classified.

    Returns
    -------
    num_cols : list
        Names of the non-object columns, in the order they appear in ``X``.
    object_cols : list
        Names of the object-dtype columns, in the order they appear in ``X``.
    """
    is_object = (X.dtypes == 'object')
    # For the competition data: ['package', 'division', 'salary']
    object_cols = list(is_object[is_object].index)
    # Keep the DataFrame's own column order instead of going through a set:
    # set iteration order for strings changes between interpreter runs, which
    # would make position-based steps (feature importances, top-k feature
    # selection) irreproducible.
    num_cols = list(is_object[~is_object].index)
    return num_cols, object_cols


num_cols, object_cols = num_cat_splitor(X)
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that extracts a fixed set of columns from a DataFrame.

    Parameters
    ----------
    attribute_names : list
        Column labels to select in ``transform``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Return ``self`` unchanged; this step learns nothing from the data."""
        return self

    def transform(self, X):
        """Return the values of the configured columns of ``X``."""
        selected = X[self.attribute_names]
        return selected.values
4. 数据处理Pipeline
# Numeric branch: pick the numeric columns, fill missing values with the
# column median, then standardise.
num_pipeline = Pipeline(
    steps=[
        ('selector', DataFrameSelector(num_cols)),
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ]
)
# Categorical branch: pick the text columns and one-hot encode them into a
# dense array.
#
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` keyword to
# `sparse_output` and 1.4 removed the old name, so `sparse=False` raises
# TypeError on current releases. Try the new keyword first and fall back to
# the old one for installations that predate the rename.
try:
    cat_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    cat_encoder = OneHotEncoder(sparse=False)

cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(object_cols)),
    ('cat_encoder', cat_encoder),
])
# Run the numeric and categorical branches side by side and concatenate
# their outputs into one feature matrix.
full_pipeline = FeatureUnion(
    [
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ]
)

# Fit the preprocessing on the training features and transform them.
X_prepared = full_pipeline.fit_transform(X)
5. 尝试不同的模型
from sklearn.ensemble import RandomForestRegressor

# Baseline: 3-fold cross-validated random forest with default settings.
forest_reg = RandomForestRegressor()
forest_scores = cross_val_score(
    forest_reg,
    X_prepared,
    y,
    cv=3,
    scoring='neg_mean_squared_error',
)

# The scorer returns negated MSE, so flip the sign before taking the root.
forest_rmse_scores = np.sqrt(-forest_scores)

# Per-fold RMSE, then its mean and standard deviation, one per line.
print(
    forest_rmse_scores,
    forest_rmse_scores.mean(),
    forest_rmse_scores.std(),
    sep='\n',
)
6. 参数搜索
# Two sub-grids: the first leaves `bootstrap` at its default, the second is
# a smaller sweep with `bootstrap` set to False.
param_grid = [
    {
        'n_estimators': [3, 10, 30, 50, 80],
        'max_features': [2, 4, 6, 8],
    },
    {
        'bootstrap': [False],
        'n_estimators': [3, 10],
        'max_features': [2, 3, 4],
    },
]

forest_reg = RandomForestRegressor()
grid_search = GridSearchCV(
    estimator=forest_reg,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(X_prepared, y)
# Notebook display: the winning hyper-parameters and the refitted estimator.
grid_search.best_params_
grid_search.best_estimator_

# Print the RMSE of every candidate next to its parameter combination.
# Scores are negated MSE, hence the sign flip before the square root.
cv_result = grid_search.cv_results_
for params, mean_score in zip(cv_result['params'],
                              cv_result['mean_test_score']):
    print(np.sqrt(-mean_score), params)
0.2129252723367584 {'max_features': 2, 'n_estimators': 3} 0.19276874697889504 {'max_features': 2, 'n_estimators': 10} 0.1865548358477794 {'max_features': 2, 'n_estimators': 30} .......
7. 特征重要性筛选
# Per-feature importance scores of the best forest found by the grid search;
# used below to decide which columns of the prepared matrix to keep.
feature_importances = grid_search.best_estimator_.feature_importances_
# Default number of top features to keep.
k = 3


def indices_of_top_k(arr, k):
    """Return the indices of the ``k`` largest values of ``arr``, ascending.

    Parameters
    ----------
    arr : array-like
        One-dimensional scores (e.g. feature importances).
    k : int
        How many of the largest entries to select.

    Returns
    -------
    numpy.ndarray
        Sorted integer indices of the ``k`` largest entries; empty when
        ``k`` is 0.

    Raises
    ------
    ValueError
        If ``k`` is negative, or (raised by ``numpy.argpartition``) if ``k``
        exceeds the number of entries.
    """
    if k < 0:
        raise ValueError("k must be non-negative, got %r" % (k,))
    if k == 0:
        # `[-0:]` is the same slice as `[0:]`, so without this guard k == 0
        # would select every index instead of none.
        return np.array([], dtype=np.intp)
    # argpartition moves the k largest entries into the last k slots without
    # fully sorting; sort the resulting indices so column order is stable.
    return np.sort(np.argpartition(np.array(arr), -k)[-k:])


class TopFeatureSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only the ``k`` most important columns.

    Parameters
    ----------
    feature_importances : array-like
        One importance score per column of the matrix passed to
        ``transform``.
    k : int
        Number of columns to keep.
    """

    def __init__(self, feature_importances, k):
        self.feature_importances = feature_importances
        self.k = k

    def fit(self, X, y=None):
        """Compute the column indices to keep and return ``self``."""
        self.feature_indices_ = indices_of_top_k(self.feature_importances,
                                                 self.k)
        return self

    def transform(self, X):
        """Return the selected columns of the 2-D array ``X``."""
        return X[:, self.feature_indices_]
8. 最终完整Pipeline
# End-to-end model: preprocessing, top-k feature selection, random forest.
# The step names are referenced by the parameter grid, so they must stay
# exactly as written.
prepare_select_and_predict_pipeline = Pipeline(
    steps=[
        ('preparation', full_pipeline),
        ('feature_selection', TopFeatureSelector(feature_importances, k)),
        ('forst_reg', RandomForestRegressor()),
    ]
)
# Search jointly over the imputation strategy, the number of features kept
# and the forest size / max_features. Keys follow the
# <step>__<sub-step>__<parameter> naming of the nested pipeline.
param_grid = [
    {
        'preparation__num_pipeline__imputer__strategy': [
            'mean',
            'median',
            'most_frequent',
        ],
        'feature_selection__k': list(range(5, len(feature_importances) + 1)),
        'forst_reg__n_estimators': [200, 250, 300, 310, 330],
        'forst_reg__max_features': [2, 4, 6, 8],
    }
]

grid_search_prep = GridSearchCV(
    estimator=prepare_select_and_predict_pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
    verbose=2,
)
# Run the full search on the raw training frame; the pipeline performs its
# own preprocessing inside every fold.
grid_search_prep.fit(X, y)
grid_search_prep.best_params_

# Best pipeline found by the search, used for the final predictions.
final_model = grid_search_prep.best_estimator_

# Predict the test set and write the submission file (id + prediction).
y_pred_test = final_model.predict(test)
result = pd.DataFrame(
    {
        'id': test['id'],
        'satisfaction_level': y_pred_test,
    }
)
result.to_csv('rf_ML_pipeline.csv', index=False)
本网页所有视频内容由 imoviebox边看边下-网页视频下载, iurlBox网页地址收藏管理器 下载并得到。
ImovieBox网页视频下载器 下载地址: ImovieBox网页视频下载器-最新版本下载
本文章由: imapbox邮箱云存储,邮箱网盘,ImageBox 图片批量下载器,网页图片批量下载专家,网页图片批量下载器,获取到文章图片,imoviebox网页视频批量下载器,下载视频内容,为您提供.
阅读和此文章类似的: 全球云计算