Notes from https://www.kaggle.com/learn/intermediate-machine-learning
Next post: [Kaggle] Intermediate Machine Learning (Pipelines + Cross-Validation)

Covered in this post:
- Baseline submission: MAE 20998.83780
- Handling missing values: improved submission, MAE 16619.07644
- How to handle categorical variables
- What to do when the training and validation sets contain different category values
- How to count the number of distinct values in each categorical feature

1. Introduction
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the data
X_full = pd.read_csv('../input/train.csv', index_col='Id')
X_test_full = pd.read_csv('../input/test.csv', index_col='Id')

# Obtain target and predictors
y = X_full.SalePrice
features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF',
            'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = X_full[features].copy()
X_test = X_test_full[features].copy()

# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8,
                                                      test_size=0.2, random_state=0)

from sklearn.ensemble import RandomForestRegressor

# Define five random forest models with different hyperparameters
model_1 = RandomForestRegressor(n_estimators=50, random_state=0)
model_2 = RandomForestRegressor(n_estimators=100, random_state=0)
model_3 = RandomForestRegressor(n_estimators=100, criterion='mae', random_state=0)  # 'mae' is renamed 'absolute_error' in newer scikit-learn
model_4 = RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=0)
model_5 = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=0)

models = [model_1, model_2, model_3, model_4, model_5]

from sklearn.metrics import mean_absolute_error

# Function for comparing different models
def score_model(model, X_t=X_train, X_v=X_valid, y_t=y_train, y_v=y_valid):
    model.fit(X_t, y_t)
    preds = model.predict(X_v)
    return mean_absolute_error(y_v, preds)

# Find the model with the lowest validation MAE
for i in range(len(models)):
    mae = score_model(models[i])
    print("Model %d MAE: %d" % (i + 1, mae))

best_model = models[2]
my_model = best_model

# Refit the best model on the full training data
my_model.fit(X, y)

# Generate test predictions
preds_test = my_model.predict(X_test)

# Save predictions in the format used for competition scoring
output = pd.DataFrame({'Id': X_test.index, 'SalePrice': preds_test})
output.to_csv('submission.csv', index=False)
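The loop above prints one MAE per model, and the index of the best one is then hard-coded as `models[2]`. An equivalent, slightly more idiomatic way to pick it programmatically (a minimal sketch reusing the same `score_model` helper):

# Score every model and keep the one with the lowest validation MAE
maes = [score_model(m) for m in models]
best_index = maes.index(min(maes))
best_model = models[best_index]
print("Best model: %d (MAE %.0f)" % (best_index + 1, min(maes)))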
2. Missing Values
# Get the names of columns with missing values
cols_with_missing = [col for col in X_train.columns
                     if X_train[col].isnull().any()]

# Approach 1: drop those columns in the training and validation data
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_valid = X_valid.drop(cols_with_missing, axis=1)
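The course notebook compares each approach with a small helper that fits one fixed random forest on the candidate features and reports the validation MAE, so differences in the score reflect only the preprocessing. A minimal sketch, assuming the `RandomForestRegressor` and `mean_absolute_error` imports from Section 1:

def score_dataset(X_train, X_valid, y_train, y_valid):
    # Fixed model so that MAE differences come from the preprocessing alone
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

print("MAE (drop columns):", score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid))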
from sklearn.impute import SimpleImputer

help(SimpleImputer)  # prints the docstring excerpted below

# Approach 2: imputation
imp = SimpleImputer()                     # fills with the column mean by default
# imp = SimpleImputer(strategy="median")  # or fill with the column median
imputed_X_train = pd.DataFrame(imp.fit_transform(X_train))  # fit on train, then fill
imputed_X_valid = pd.DataFrame(imp.transform(X_valid))      # fill valid with train statistics

# Imputation removed the column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns
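To see what the default mean strategy actually does, here is a self-contained toy example (illustrative data, not from the competition):

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

toy = pd.DataFrame({'a': [1.0, np.nan, 3.0],
                    'b': [4.0, 5.0, np.nan]})
print(SimpleImputer().fit_transform(toy))
# The NaN in 'a' becomes mean(1, 3) = 2.0; the NaN in 'b' becomes mean(4, 5) = 4.5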
SimpleImputer

For reference, an excerpt of the docstring:

class SimpleImputer(_BaseImputer)
    SimpleImputer(missing_values=nan, strategy='mean', fill_value=None,
                  verbose=0, copy=True, add_indicator=False)

    Imputation transformer for completing missing values.

    Read more in the :ref:`User Guide <impute>`.

    Parameters
    ----------
    missing_values : number, string, np.nan (default) or None
        The placeholder for the missing values. All occurrences of
        `missing_values` will be imputed.

    strategy : string, default='mean'
        The imputation strategy.

        - If "mean", then replace missing values using the mean along
          each column. Can only be used with numeric data.
        - If "median", then replace missing values using the median along
          each column. Can only be used with numeric data.
        - If "most_frequent", then replace missing using the most frequent
          value along each column. Can be used with strings or numeric data.
        - If "constant", then replace missing values with fill_value. Can be
          used with strings or numeric data.
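The course tutorial also shows an extension to imputation: impute as above, but additionally record which entries were missing, since the fact that a value was absent can itself be predictive. A minimal sketch (the `_was_missing` column suffix is just a naming convention):

# Make copies to avoid changing the original data
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

# Add a boolean indicator column for each column that had missing values
for col in cols_with_missing:
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
    X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()

# Then impute the extended data exactly as before
imputed_X_train_plus = pd.DataFrame(imp.fit_transform(X_train_plus))
imputed_X_valid_plus = pd.DataFrame(imp.transform(X_valid_plus))
imputed_X_train_plus.columns = X_train_plus.columns
imputed_X_valid_plus.columns = X_valid_plus.columns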
3. Categorical Variables
# Get the list of categorical (non-numeric) variables
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)

print("Categorical variables:")
print(object_cols)
Categorical variables:
['Type', 'Method', 'Regionname']   # the categorical feature names
# Approach 1: drop the categorical variables
drop_X_train = X_train.select_dtypes(exclude=['object'])
drop_X_valid = X_valid.select_dtypes(exclude=['object'])
from sklearn.preprocessing import LabelEncoder

# Approach 2: label encoding
# Make copies to avoid changing the original data
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# Apply the label encoder to each column with categorical data
label_encoder = LabelEncoder()
for col in object_cols:
    label_X_train[col] = label_encoder.fit_transform(X_train[col])
    label_X_valid[col] = label_encoder.transform(X_valid[col])
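Note that `LabelEncoder` is documented for encoding the *target*, not feature columns; newer scikit-learn (and the current revision of this course) uses `OrdinalEncoder` for features, which handles all columns in one call. A sketch of the equivalent code, assuming scikit-learn >= 0.20:

from sklearn.preprocessing import OrdinalEncoder

label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# Encode every categorical column at once; like LabelEncoder, transform()
# raises on category values that were not seen during fit
ordinal_encoder = OrdinalEncoder()
label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])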
from sklearn.preprocessing import OneHotEncoder

# Approach 3: apply the one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))

# One-hot encoding removed the index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Remove the categorical columns (they will be replaced by the one-hot columns)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Concatenate the numeric columns with the one-hot encoded categorical columns
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)
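Two version caveats, both assumptions about your scikit-learn install rather than anything in the course: from 1.2 the `sparse` constructor argument is spelled `sparse_output`, and from 1.0 the encoder can produce readable column names instead of the default integer labels (do this before the `pd.concat` step above):

# scikit-learn >= 1.2 renames the constructor argument:
# OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# scikit-learn >= 1.0: give the encoded columns readable names before concatenating
OH_cols_train.columns = OH_encoder.get_feature_names_out(object_cols)
OH_cols_valid.columns = OH_encoder.get_feature_names_out(object_cols)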
When the training and validation sets contain different category values, an encoder fitted on the training data will fail on unseen values at transform time, so those columns must be identified first:

# All categorical columns
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

# Columns that can be safely label encoded: the validation set contains
# exactly the same category values as the training set (a subset would already
# be enough for transform() not to fail, so this check is stricter than needed)
good_label_cols = [col for col in object_cols if
                   set(X_train[col]) == set(X_valid[col])]

# Problematic columns that will be dropped from the dataset
bad_label_cols = list(set(object_cols) - set(good_label_cols))
from sklearn.preprocessing import LabelEncoder

# Drop the categorical columns that will not be encoded
label_X_train = X_train.drop(bad_label_cols, axis=1)
label_X_valid = X_valid.drop(bad_label_cols, axis=1)

# Apply the label encoder to the remaining (safe) columns
labEncoder = LabelEncoder()
for feature in set(good_label_cols):
    label_X_train[feature] = labEncoder.fit_transform(label_X_train[feature])
    label_X_valid[feature] = labEncoder.transform(label_X_valid[feature])
# Get the number of unique entries in each column with categorical data
object_nunique = list(map(lambda col: X_train[col].nunique(), object_cols))
d = dict(zip(object_cols, object_nunique))

# Print the number of unique entries by column, in ascending order
sorted(d.items(), key=lambda x: x[1])
[('Street', 2),        # 'Street' takes 2 distinct values
 ('Utilities', 2),
 ('CentralAir', 2),
 ...
 ('Exterior2nd', 16),
 ('Neighborhood', 25)]

Columns with many distinct values (high cardinality) are poorly suited to one-hot encoding, since they blow up the size of the dataset; label-encode them or drop them instead.
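To make the blow-up concrete with a back-of-the-envelope calculation (the row count of 10,000 is illustrative, not from the dataset):

rows = 10_000        # illustrative dataset size
cardinality = 25     # e.g. 'Neighborhood' above
# One-hot replaces 1 column with `cardinality` columns, so the net number
# of added entries for this single column is:
added = rows * (cardinality - 1)
print(added)         # 240000 extra entries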
# Columns that will be one-hot encoded: fewer than 10 distinct values
low_cardinality_cols = [col for col in object_cols if X_train[col].nunique() < 10]

# The remaining (high-cardinality) columns will be dropped from the dataset
high_cardinality_cols = list(set(object_cols) - set(low_cardinality_cols))
from sklearn.preprocessing import OneHotEncoder

# One-hot encoder; ignore category values unseen during fit
ohEnc = OneHotEncoder(handle_unknown='ignore', sparse=False)

# One-hot encode only the features with fewer than 10 distinct values
OH_X_train = pd.DataFrame(ohEnc.fit_transform(X_train[low_cardinality_cols]))
OH_X_valid = pd.DataFrame(ohEnc.transform(X_valid[low_cardinality_cols]))

# Encoding dropped the index; restore it
OH_X_train.index = X_train.index
OH_X_valid.index = X_valid.index

# Numeric features (drop all categorical columns from the original data)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Concatenate the numeric features with the one-hot encoded (low-cardinality)
# categorical features; the high-cardinality columns are dropped entirely
OH_X_train = pd.concat([OH_X_train, num_X_train], axis=1)
OH_X_valid = pd.concat([OH_X_valid, num_X_valid], axis=1)
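The three treatments of categorical data can now be compared with the `score_dataset` helper sketched in Section 2 (a sketch; each feature set pairs with the same train/validation split):

print("MAE (drop):   ", score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))
print("MAE (label):  ", score_dataset(label_X_train, label_X_valid, y_train, y_valid))
print("MAE (one-hot):", score_dataset(OH_X_train, OH_X_valid, y_train, y_valid))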