Python Machine Learning: Advanced Scikit-learn in Depth

Why Advanced Scikit-learn Matters

Scikit-learn is one of the most popular machine learning libraries in Python, providing a complete machine learning toolchain. Mastering its advanced features, including pipelines, feature engineering, model selection, and hyperparameter tuning, is essential for building high-quality machine learning models. These skills help developers build more robust, maintainable, and efficient machine learning systems.
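
Before diving in, here is a minimal sketch of why pipelines are central to that robustness (synthetic data; the names `pipe` and the parameter choices are illustrative): bundling preprocessing with the model means cross-validation re-fits the scaler on each training fold, so no information from the validation fold leaks into preprocessing.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Synthetic data purely for illustration
X, y = make_classification(n_samples=300, n_features=10, random_state=0)

# Scaling and the model travel together, so each CV fold re-fits the scaler
# on its own training portion only
pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
scores = cross_val_score(pipe, X, y, cv=5)
print(f"Leakage-free CV accuracy: {scores.mean():.4f}")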

Machine Learning Pipelines in Depth

1. Basic Pipeline Usage

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import classification_report, mean_squared_error
import warnings
warnings.filterwarnings('ignore')

def pipeline_basics_demo():
    """Demonstrate basic machine learning pipelines."""
    print("=== Scikit-learn Pipeline Basics ===")
    
    # Create sample data
    X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, 
                              n_redundant=5, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    print(f"Training set shape: {X_train.shape}")
    print(f"Test set shape: {X_test.shape}")
    
    # 1. Basic pipeline
    print(f"\n1. Basic pipeline example:")
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
    ])
    
    # Train and predict
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    
    # Evaluate
    accuracy = pipeline.score(X_test, y_test)
    print(f"   Pipeline accuracy: {accuracy:.4f}")
    
    # 2. Simplified syntax with make_pipeline
    print(f"\n2. Simplified pipeline syntax:")
    simple_pipeline = make_pipeline(StandardScaler(), LogisticRegression(random_state=42))
    simple_pipeline.fit(X_train, y_train)
    simple_accuracy = simple_pipeline.score(X_test, y_test)
    print(f"   Simplified pipeline accuracy: {simple_accuracy:.4f}")
    
    # 3. Accessing pipeline steps
    print(f"\n3. Accessing pipeline steps:")
    print(f"   Pipeline steps: {pipeline.named_steps.keys()}")
    print(f"   Scaler means: {pipeline.named_steps['scaler'].mean_[:5]}")  # means of the first 5 features
    print(f"   Classifier feature importances: {pipeline.named_steps['classifier'].feature_importances_[:5]}")  # first 5 features
    
    return pipeline

pipeline = pipeline_basics_demo()

2. Advanced Pipeline Techniques

def advanced_pipeline_demo():
    """Demonstrate advanced pipeline techniques."""
    print("\n=== Advanced Pipeline Techniques ===")
    
    # Create regression data
    X_reg, y_reg = make_regression(n_samples=500, n_features=10, noise=0.1, random_state=42)
    X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)
    
    # 1. Chaining multiple preprocessors
    print(f"1. Pipeline with multiple preprocessors:")
    preprocessing_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('robust_scaler', RobustScaler())  # Note: chaining two scalers is rarely useful; shown only to illustrate the mechanics
    ])
    
    # 2. Column-wise preprocessing with ColumnTransformer
    from sklearn.preprocessing import FunctionTransformer  # demonstrated in the sketch after this demo
    from sklearn.compose import ColumnTransformer
    
    # Create data with mixed feature types
    np.random.seed(42)
    X_mixed = np.column_stack([
        np.random.randn(500, 5),  # numerical features
        np.random.randint(0, 10, (500, 3))  # categorical-like integer features
    ])
    
    # Apply different preprocessing to different columns
    column_transformer = ColumnTransformer([
        ('num', StandardScaler(), [0, 1, 2, 3, 4]),  # standardize the first 5 columns
        ('cat', MinMaxScaler(), [5, 6, 7])  # min-max scale the last 3 columns
    ])
    
    mixed_pipeline = Pipeline([
        ('preprocessor', column_transformer),
        ('regressor', LinearRegression())
    ])
    
    mixed_pipeline.fit(X_mixed, y_reg)
    mixed_score = mixed_pipeline.score(X_mixed, y_reg)
    print(f"   Mixed pipeline R² score: {mixed_score:.4f}")
    
    # 3. Grid search over pipeline parameters
    from sklearn.model_selection import GridSearchCV
    
    print(f"\n3. Grid search over pipeline parameters:")
    # Classification data for the search; the split from the previous demo
    # is not visible inside this function, so recreate it here
    X, y = make_classification(n_samples=1000, n_features=20, n_informative=15,
                              n_redundant=5, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    param_grid = {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__max_depth': [None, 10, 20],
        'scaler': [StandardScaler(), MinMaxScaler()]
    }
    
    # Base pipeline
    search_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier(random_state=42))
    ])
    
    # Grid search
    grid_search = GridSearchCV(search_pipeline, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    print(f"   Best parameters: {grid_search.best_params_}")
    print(f"   Best cross-validation score: {grid_search.best_score_:.4f}")
    print(f"   Test set score: {grid_search.score(X_test, y_test):.4f}")
    
    return grid_search

grid_search = advanced_pipeline_demo()
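
advanced_pipeline_demo imports FunctionTransformer but never calls it. As a minimal sketch of what it offers (the skewed synthetic data and the choice of np.log1p are illustrative assumptions), it wraps an arbitrary function as a pipeline-compatible transformer:

import numpy as np
from sklearn.preprocessing import FunctionTransformer

# Wrap np.log1p as a transformer step; assumes non-negative, right-skewed features
log_transformer = FunctionTransformer(np.log1p, validate=True)

X_skewed = np.random.exponential(scale=2.0, size=(100, 4))  # illustrative skewed data
X_logged = log_transformer.fit_transform(X_skewed)
print(f"Max before: {X_skewed.max():.2f}, max after log1p: {X_logged.max():.2f}")

Because FunctionTransformer is a standard transformer, log_transformer can be dropped into any Pipeline or ColumnTransformer step shown above.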

Feature Engineering in Depth

1. Feature Selection

def feature_selection_demo():
    """Demonstrate feature selection techniques."""
    print("\n=== Feature Selection Techniques ===")
    
    # Create sample data
    X, y = make_classification(n_samples=1000, n_features=30, n_informative=10, 
                              n_redundant=10, n_repeated=10, random_state=42)
    
    print(f"Original number of features: {X.shape[1]}")
    
    # 1. Univariate feature selection
    print(f"\n1. Univariate feature selection:")
    from sklearn.feature_selection import SelectKBest, SelectPercentile, f_classif
    
    # SelectKBest - keep the K best features
    selector_kbest = SelectKBest(f_classif, k=15)
    X_selected_kbest = selector_kbest.fit_transform(X, y)
    print(f"   Features kept by SelectKBest: {X_selected_kbest.shape[1]}")
    print(f"   Selected feature indices: {selector_kbest.get_support(indices=True)}")
    
    # SelectPercentile - keep the best percentage of features
    selector_percentile = SelectPercentile(f_classif, percentile=50)
    X_selected_percentile = selector_percentile.fit_transform(X, y)
    print(f"   Features kept by SelectPercentile: {X_selected_percentile.shape[1]}")
    
    # 2. Recursive feature elimination
    print(f"\n2. Recursive feature elimination:")
    from sklearn.feature_selection import RFE, RFECV
    
    # RFE - recursive feature elimination
    estimator = RandomForestClassifier(n_estimators=50, random_state=42)
    rfe = RFE(estimator, n_features_to_select=15)
    X_selected_rfe = rfe.fit_transform(X, y)
    print(f"   Features kept by RFE: {X_selected_rfe.shape[1]}")
    print(f"   RFE feature ranking: {rfe.ranking_[:10]}")  # ranking of the first 10 features
    
    # RFECV - recursive feature elimination with cross-validation
    rfecv = RFECV(estimator, step=1, cv=3, scoring='accuracy')
    rfecv.fit(X, y)
    print(f"   Optimal number of features (RFECV): {rfecv.n_features_}")
    # grid_scores_ was removed in scikit-learn 1.2; read scores from cv_results_ instead
    # (with step=1 and min_features_to_select=1, entry i corresponds to i+1 features)
    print(f"   RFECV cross-validation score: {rfecv.cv_results_['mean_test_score'][rfecv.n_features_ - 1]:.4f}")
    
    # 3. Model-based feature selection
    print(f"\n3. Model-based feature selection:")
    from sklearn.feature_selection import SelectFromModel
    
    # Use a random forest to select features
    rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))
    X_selected_model = rf_selector.fit_transform(X, y)
    print(f"   Features kept by model-based selection: {X_selected_model.shape[1]}")
    print(f"   Feature importance threshold: {rf_selector.threshold_:.4f}")
    
    # 4. Comparing the effect of feature selection
    print(f"\n4. Effect of feature selection:")
    from sklearn.ensemble import GradientBoostingClassifier
    
    # Original features
    gb_original = GradientBoostingClassifier(random_state=42)
    scores_original = cross_val_score(gb_original, X, y, cv=3)
    print(f"   CV score with original features: {scores_original.mean():.4f} (+/- {scores_original.std() * 2:.4f})")
    
    # Selected features
    gb_selected = GradientBoostingClassifier(random_state=42)
    scores_selected = cross_val_score(gb_selected, X_selected_kbest, y, cv=3)
    print(f"   CV score with selected features: {scores_selected.mean():.4f} (+/- {scores_selected.std() * 2:.4f})")
    
    return selector_kbest, rfe, rf_selector

selectors = feature_selection_demo()
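
One caveat about the comparison above: X_selected_kbest was produced by fitting the selector on the full dataset, so the cross-validation folds have already influenced the selection. A leakage-free sketch (same synthetic data; k=15 is an illustrative choice) puts the selector inside a pipeline so it is re-fit per fold:

from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

X, y = make_classification(n_samples=1000, n_features=30, n_informative=10,
                           n_redundant=10, n_repeated=10, random_state=42)

select_pipe = Pipeline([
    ('select', SelectKBest(f_classif, k=15)),  # fit on training folds only
    ('model', GradientBoostingClassifier(random_state=42)),
])
scores = cross_val_score(select_pipe, X, y, cv=3)
print(f"CV accuracy with in-pipeline selection: {scores.mean():.4f}")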

2. Feature Transformation

def feature_transformation_demo():
    """Demonstrate feature transformation techniques."""
    print("\n=== Feature Transformation Techniques ===")
    
    # Create sample data
    np.random.seed(42)
    X = np.random.randn(200, 3)
    y = X[:, 0] + 2 * X[:, 1] + 0.5 * X[:, 2] + np.random.randn(200) * 0.1
    
    print(f"Original data shape: {X.shape}")
    
    # 1. Polynomial features
    print(f"\n1. Polynomial features:")
    from sklearn.preprocessing import PolynomialFeatures
    
    # Degree-2 polynomial features
    poly = PolynomialFeatures(degree=2, include_bias=False)
    X_poly = poly.fit_transform(X)
    print(f"   Original number of features: {X.shape[1]}")
    print(f"   Number of polynomial features: {X_poly.shape[1]}")
    print(f"   Feature names: {poly.get_feature_names_out()}")
    
    # 2. Feature scaling
    print(f"\n2. Feature scaling:")
    from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, Normalizer
    
    scalers = {
        'StandardScaler': StandardScaler(),
        'MinMaxScaler': MinMaxScaler(),
        'RobustScaler': RobustScaler(),
        'Normalizer': Normalizer()
    }
    
    for name, scaler in scalers.items():
        X_scaled = scaler.fit_transform(X)
        print(f"   {name}:")
        print(f"     Mean: {X_scaled.mean(axis=0).round(3)}")
        print(f"     Std: {X_scaled.std(axis=0).round(3)}")
        print(f"     Min: {X_scaled.min(axis=0).round(3)}")
        print(f"     Max: {X_scaled.max(axis=0).round(3)}")
    
    # 3. Feature encoding
    print(f"\n3. Feature encoding:")
    from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
    
    # Create categorical data
    categories = np.array(['A', 'B', 'C', 'A', 'B'] * 40)  # 200 samples
    
    # Label encoding
    label_encoder = LabelEncoder()
    categories_labeled = label_encoder.fit_transform(categories)
    print(f"   Label encoding result: {categories_labeled[:10]}")
    print(f"   Category mapping: {dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))}")
    
    # One-hot encoding
    onehot_encoder = OneHotEncoder(sparse_output=False)
    categories_onehot = onehot_encoder.fit_transform(categories.reshape(-1, 1))
    print(f"   One-hot encoding shape: {categories_onehot.shape}")
    print(f"   First 5 one-hot rows: \n{categories_onehot[:5]}")
    
    # 4. Feature decomposition
    print(f"\n4. Feature decomposition:")
    from sklearn.decomposition import PCA, FastICA
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    
    # Create classification data for LDA; with 3 classes, n_informative must
    # satisfy n_classes * n_clusters_per_class <= 2**n_informative
    X_class, y_class = make_classification(n_samples=200, n_features=10, n_informative=5,
                                           n_classes=3, random_state=42)
    
    # PCA
    pca = PCA(n_components=3)
    X_pca = pca.fit_transform(X)
    print(f"   PCA explained variance ratio: {pca.explained_variance_ratio_}")
    print(f"   PCA cumulative explained variance: {pca.explained_variance_ratio_.cumsum()}")
    
    # ICA
    ica = FastICA(n_components=3, random_state=42)
    X_ica = ica.fit_transform(X)
    print(f"   ICA mixing matrix shape: {ica.mixing_.shape}")
    
    # LDA (at most n_classes - 1 components)
    lda = LinearDiscriminantAnalysis(n_components=2)
    X_lda = lda.fit_transform(X_class, y_class)
    print(f"   LDA explained variance ratio: {lda.explained_variance_ratio_}")
    
    return poly, pca, ica, lda

transformers = feature_transformation_demo()
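
These transforms are most useful chained inside a pipeline. A short sketch (the data, degree, and the Ridge regularization strength are illustrative assumptions, not taken from the demo above) combining polynomial expansion, scaling, and a regularized linear model:

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

rng = np.random.RandomState(42)
X = rng.randn(200, 3)
y = X[:, 0] + 2 * X[:, 1] ** 2 + 0.5 * X[:, 2] + rng.randn(200) * 0.1  # nonlinear target

poly_model = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False),  # adds squared and interaction terms
    StandardScaler(),
    Ridge(alpha=1.0),
)
scores = cross_val_score(poly_model, X, y, cv=5, scoring='r2')
print(f"CV R² with polynomial features: {scores.mean():.4f}")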

Model Selection and Hyperparameter Tuning

1. Grid Search and Random Search

def hyperparameter_tuning_demo():
    """Demonstrate hyperparameter tuning techniques."""
    print("\n=== Hyperparameter Tuning Techniques ===")
    
    # Create sample data
    X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, 
                              n_redundant=5, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # 1. Grid search
    print(f"1. Grid search:")
    from sklearn.model_selection import GridSearchCV
    
    # Define the parameter grid
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    
    # Grid search
    rf = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(rf, param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)
    
    print(f"   Best parameters: {grid_search.best_params_}")
    print(f"   Best cross-validation score: {grid_search.best_score_:.4f}")
    print(f"   Test set score: {grid_search.score(X_test, y_test):.4f}")
    
    # 2. Random search
    print(f"\n2. Random search:")
    from sklearn.model_selection import RandomizedSearchCV
    from scipy.stats import randint, uniform
    
    # Define parameter distributions
    param_distributions = {
        'n_estimators': randint(50, 300),
        'max_depth': [None] + list(range(5, 31)),
        'min_samples_split': randint(2, 20),
        'min_samples_leaf': randint(1, 10),
        'max_features': ['sqrt', 'log2', None]
    }
    
    # Random search
    random_search = RandomizedSearchCV(rf, param_distributions, n_iter=50, cv=3, 
                                       scoring='accuracy', random_state=42, n_jobs=-1)
    random_search.fit(X_train, y_train)
    
    print(f"   Best parameters: {random_search.best_params_}")
    print(f"   Best cross-validation score: {random_search.best_score_:.4f}")
    print(f"   Test set score: {random_search.score(X_test, y_test):.4f}")
    
    # 3. Bayesian optimization (with optuna, if available)
    print(f"\n3. Bayesian optimization:")
    try:
        import optuna
        
        def objective(trial):
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 50, 300),
                'max_depth': trial.suggest_int('max_depth', 5, 30),
                'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
                'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
                'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
            }
            
            model = RandomForestClassifier(**params, random_state=42)
            scores = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy')
            return scores.mean()
        
        study = optuna.create_study(direction='maximize')
        study.optimize(objective, n_trials=50)
        
        print(f"   Best parameters: {study.best_params}")
        print(f"   Best score: {study.best_value:.4f}")
        
        # Train a model with the best parameters
        best_model = RandomForestClassifier(**study.best_params, random_state=42)
        best_model.fit(X_train, y_train)
        best_score = best_model.score(X_test, y_test)
        print(f"   Test set score: {best_score:.4f}")
        
    except ImportError:
        print("   Optuna is not installed; skipping the Bayesian optimization demo")
    
    return grid_search, random_search

search_results = hyperparameter_tuning_demo()
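
All three searches above report their final score on the same held-out test set, which slightly favors whichever search is inspected last. A sketch of nested cross-validation (with the grid deliberately shrunk to keep the example fast) wraps the search itself in an outer CV loop for a less biased performance estimate:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score

X, y = make_classification(n_samples=500, n_features=20, random_state=42)

inner_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid={'n_estimators': [50, 100], 'max_depth': [None, 10]},
    cv=3,
)
# Outer loop: each fold re-runs the entire grid search on its training portion
nested_scores = cross_val_score(inner_search, X, y, cv=3)
print(f"Nested CV accuracy: {nested_scores.mean():.4f}")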

2. Cross-Validation and Performance Evaluation

def cross_validation_demo():
    """Demonstrate cross-validation techniques."""
    print("\n=== Cross-Validation Techniques ===")
    
    # Create sample data
    X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, 
                              n_redundant=5, random_state=42)
    
    # 1. Basic cross-validation
    print(f"1. Basic cross-validation:")
    from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
    
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    
    # K-fold cross-validation
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    kfold_scores = cross_val_score(rf, X, y, cv=kfold, scoring='accuracy')
    print(f"   K-fold CV score: {kfold_scores.mean():.4f} (+/- {kfold_scores.std() * 2:.4f})")
    
    # Stratified K-fold cross-validation
    skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    skfold_scores = cross_val_score(rf, X, y, cv=skfold, scoring='accuracy')
    print(f"   Stratified K-fold CV score: {skfold_scores.mean():.4f} (+/- {skfold_scores.std() * 2:.4f})")
    
    # 2. Time series cross-validation
    print(f"\n2. Time series cross-validation:")
    from sklearn.model_selection import TimeSeriesSplit
    
    # Create time series data; the target is continuous, so a regressor is required
    np.random.seed(42)
    n_samples = 100
    X_ts = np.random.randn(n_samples, 10)
    y_ts = np.random.randn(n_samples)
    
    rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
    tscv = TimeSeriesSplit(n_splits=5)
    ts_scores = cross_val_score(rf_reg, X_ts, y_ts, cv=tscv, scoring='neg_mean_squared_error')
    print(f"   Time series CV MSE: {-ts_scores.mean():.4f} (+/- {ts_scores.std() * 2:.4f})")
    
    # 3. Learning curves
    print(f"\n3. Learning curves:")
    from sklearn.model_selection import learning_curve
    
    train_sizes, train_scores, val_scores = learning_curve(
        rf, X, y, cv=3, n_jobs=-1,
        train_sizes=np.linspace(0.1, 1.0, 10),
        scoring='accuracy'
    )
    
    train_mean = train_scores.mean(axis=1)
    train_std = train_scores.std(axis=1)
    val_mean = val_scores.mean(axis=1)
    val_std = val_scores.std(axis=1)
    
    print(f"   Training score: {train_mean[-1]:.4f} (+/- {train_std[-1] * 2:.4f})")
    print(f"   Validation score: {val_mean[-1]:.4f} (+/- {val_std[-1] * 2:.4f})")
    
    # 4. Validation curves
    print(f"\n4. Validation curves:")
    from sklearn.model_selection import validation_curve
    
    param_range = [10, 50, 100, 200, 300]
    train_scores_val, val_scores_val = validation_curve(
        rf, X, y, param_name='n_estimators', param_range=param_range,
        cv=3, scoring='accuracy', n_jobs=-1
    )
    
    train_mean_val = train_scores_val.mean(axis=1)
    val_mean_val = val_scores_val.mean(axis=1)
    
    print(f"   Parameter range: {param_range}")
    print(f"   Best parameter value: {param_range[np.argmax(val_mean_val)]}")
    print(f"   Best validation score: {np.max(val_mean_val):.4f}")
    
    return kfold_scores, skfold_scores, ts_scores

cv_results = cross_validation_demo()
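
As a small follow-up sketch (the metric choices are illustrative), cross_validate generalizes cross_val_score by evaluating several metrics in one pass and also reporting fit times:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate

X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# scoring accepts a list of metric names; results come back as a dict of arrays
results = cross_validate(clf, X, y, cv=5, scoring=['accuracy', 'f1_macro'])
print(f"Accuracy: {results['test_accuracy'].mean():.4f}")
print(f"Macro F1: {results['test_f1_macro'].mean():.4f}")
print(f"Mean fit time: {results['fit_time'].mean():.2f}s")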

A Practical Case Study

1. A Complete Machine Learning Project

def complete_ml_project_demo():
    """Demonstrate a complete machine learning project."""
    print("\n=== Complete Machine Learning Project ===")
    
    # Create sample data
    X, y = make_classification(n_samples=2000, n_features=30, n_informative=20, 
                              n_redundant=10, n_classes=3, random_state=42)
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    print(f"Training set size: {X_train.shape}")
    print(f"Test set size: {X_test.shape}")
    print(f"Class distribution: {np.bincount(y_train)}")
    
    # 1. Build the preprocessing pipeline
    from sklearn.feature_selection import SelectKBest, f_classif
    from sklearn.preprocessing import StandardScaler
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC
    from sklearn.model_selection import GridSearchCV
    
    # Feature selection pipeline
    feature_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('feature_selection', SelectKBest(f_classif, k=20))
    ])
    
    # 2. Compare several models
    models = {
        'Random Forest': RandomForestClassifier(random_state=42),
        'Gradient Boosting': GradientBoostingClassifier(random_state=42),
        'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
        'SVM': SVC(random_state=42, probability=True)
    }
    
    results = {}
    for name, model in models.items():
        # Build the full pipeline
        full_pipeline = Pipeline([
            ('preprocessing', feature_pipeline),
            ('classifier', model)
        ])
        
        # Cross-validation
        scores = cross_val_score(full_pipeline, X_train, y_train, cv=3, scoring='accuracy')
        results[name] = {
            'mean_score': scores.mean(),
            'std_score': scores.std(),
            'model': full_pipeline
        }
        print(f"   {name}: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")
    
    # 3. Evaluate the best model in detail
    best_model_name = max(results.keys(), key=lambda x: results[x]['mean_score'])
    best_model = results[best_model_name]['model']
    
    print(f"\nBest model: {best_model_name}")
    
    # Train the best model
    best_model.fit(X_train, y_train)
    
    # Detailed evaluation
    y_pred = best_model.predict(X_test)
    y_pred_proba = best_model.predict_proba(X_test)  # class probabilities, e.g. for ROC analysis
    
    print(f"\nClassification report:")
    print(classification_report(y_test, y_pred))
    
    # 4. Feature importance analysis
    if hasattr(best_model.named_steps['classifier'], 'feature_importances_'):
        feature_importances = best_model.named_steps['classifier'].feature_importances_
        selected_features = best_model.named_steps['preprocessing'].named_steps['feature_selection'].get_support(indices=True)
        
        print(f"\nFeature importances (top 10):")
        importance_indices = np.argsort(feature_importances)[::-1][:10]
        for i, idx in enumerate(importance_indices):
            original_feature_idx = selected_features[idx]
            print(f"   {i+1}. Feature {original_feature_idx}: {feature_importances[idx]:.4f}")
    
    return best_model, results

final_model, all_results = complete_ml_project_demo()

Summary

Key takeaways for advanced Scikit-learn:

  1. Machine learning pipelines: Pipeline, make_pipeline, and ColumnTransformer
  2. Feature engineering: feature selection, feature transformation, feature encoding, and feature decomposition
  3. Model selection: grid search, random search, and Bayesian optimization
  4. Cross-validation: K-fold, stratified K-fold, and time series cross-validation
  5. Performance evaluation: learning curves, validation curves, and classification reports
  6. Practical application: an end-to-end project workflow, multi-model comparison, and feature importance analysis

Mastering these advanced Scikit-learn skills lets you build more robust and efficient machine learning models with better performance and maintainability.

