Python实用技巧-机器学习入门详解

2018-12-11 | 阅读：次

机器学习入门的重要性

机器学习是人工智能的核心技术，通过算法让计算机从数据中学习模式，做出预测和决策。scikit-learn作为Python最流行的机器学习库，提供了丰富的算法和工具，是学习机器学习的理想起点。

scikit-learn基础

1. 基本分类

from sklearn.datasets import load_iris, make_classification
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np
import pandas as pd
from datetime import datetime

def basic_classification():
    """Train and evaluate a random forest classifier on the iris dataset.

    Loads the iris data, makes a stratified 70/30 train/test split, fits a
    100-tree ``RandomForestClassifier`` and prints the dataset summary, the
    test accuracy, the per-class classification report and the feature
    importances.

    Returns:
        None. All results are written to standard output.
    """

    print("=== 基本分类示例 ===")

    # Load the iris dataset.
    iris = load_iris()
    X, y = iris.data, iris.target

    print(f"数据集形状: {X.shape}")
    print(f"特征名称: {iris.feature_names}")
    print(f"目标类别: {iris.target_names}")

    # Stratify on y so every class keeps the same share in both splits.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    print(f"训练集大小: {X_train.shape}")
    print(f"测试集大小: {X_test.shape}")

    # Fit the random forest; random_state pins the result between runs.
    rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_clf.fit(X_train, y_train)

    # Predict on the held-out split and score it.
    y_pred = rf_clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # "\n" (a real newline) separates the sections; the previous "\\n"
    # printed a literal backslash followed by "n".
    print(f"\n随机森林准确率: {accuracy:.4f}")
    print("\n分类报告:")
    print(classification_report(y_test, y_pred, target_names=iris.target_names))

    # Feature importances, listed in the dataset's feature order.
    print("\n特征重要性:")
    for feature, importance in zip(iris.feature_names, rf_clf.feature_importances_):
        print(f"{feature}: {importance:.4f}")

# Run the classification demo; its report is printed to standard output.
basic_classification()

2. 多种分类算法对比

def compare_classifiers():
    """Compare three classifiers on a synthetic 3-class dataset.

    Generates 1000 samples with 20 features via ``make_classification``,
    splits them 70/30, then fits a random forest, a logistic regression and
    an SVC. For each model it prints the test accuracy and the 5-fold
    cross-validation score on the training split, and finally names the
    model with the highest test accuracy.

    Returns:
        None. All results are written to standard output.
    """

    print("=== 多种分类算法对比 ===")

    # Build the synthetic dataset.
    X, y = make_classification(
        n_samples=1000, n_features=20, n_informative=15,
        n_redundant=5, n_classes=3, random_state=42
    )

    # Hold out 30% of the samples for testing.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    # Candidate models, keyed by the display name used in the report.
    classifiers = {
        '随机森林': RandomForestClassifier(n_estimators=100, random_state=42),
        '逻辑回归': LogisticRegression(random_state=42, max_iter=1000),
        '支持向量机': SVC(random_state=42)
    }

    results = {}

    for name, clf in classifiers.items():
        # Fit on the training split and score on the held-out split.
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        # 5-fold cross-validation restricted to the training split.
        cv_scores = cross_val_score(clf, X_train, y_train, cv=5)
        cv_mean = cv_scores.mean()
        cv_std = cv_scores.std()

        results[name] = {
            'accuracy': accuracy,
            'cv_mean': cv_mean,
            'cv_std': cv_std
        }

        # "\n" (a real newline) separates the models; the previous "\\n"
        # printed a literal backslash followed by "n".
        print(f"\n{name}:")
        print(f"  测试准确率: {accuracy:.4f}")
        # The spread is reported as two standard deviations.
        print(f"  交叉验证: {cv_mean:.4f} (+/- {cv_std * 2:.4f})")

    # Pick the model with the highest test accuracy.
    # NOTE(review): ranking on the test split reuses it for model selection;
    # ranking on 'cv_mean' would keep the test split untouched.
    best_model = max(results.items(), key=lambda x: x[1]['accuracy'])
    print(f"\n最佳模型: {best_model[0]} (准确率: {best_model[1]['accuracy']:.4f})")

# Run the classifier comparison; its report is printed to standard output.
compare_classifiers()

回归算法

1. 基本回归

from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

def basic_regression():
    """Fit and evaluate an ordinary linear regression on synthetic data.

    Generates 1000 samples with 10 features via ``make_regression``, splits
    them 70/30, fits ``LinearRegression`` and prints MSE, RMSE, MAE and R²
    on the test split, followed by the fitted coefficients and intercept.

    Returns:
        None. All results are written to standard output.
    """

    print("=== 基本回归示例 ===")

    # Build the synthetic regression dataset.
    X, y = make_regression(
        n_samples=1000, n_features=10, noise=0.1, random_state=42
    )

    # Hold out 30% of the samples for testing.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    # Fit the linear model and predict on the held-out split.
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)

    # Evaluation metrics; RMSE is the square root of MSE.
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"均方误差 (MSE): {mse:.4f}")
    print(f"均方根误差 (RMSE): {rmse:.4f}")
    print(f"平均绝对误差 (MAE): {mae:.4f}")
    print(f"决定系数 (R²): {r2:.4f}")

    # "\n" (a real newline) separates the sections; the previous "\\n"
    # printed a literal backslash followed by "n".
    print("\n特征系数:")
    for i, coef in enumerate(lr.coef_):
        print(f"特征 {i}: {coef:.4f}")
    print(f"截距: {lr.intercept_:.4f}")

# Run the regression demo; its report is printed to standard output.
basic_regression()

实际应用案例

1. 房价预测

def house_price_prediction():
    """Predict synthetic house prices with a random forest regressor.

    Generates 1000 synthetic houses (area, rooms, floor, construction year),
    derives a noisy price from those features, fits a 100-tree
    ``RandomForestRegressor`` on a 70/30 split and prints the data preview,
    RMSE, R², feature importances and the prediction for one sample house.

    Returns:
        None. All results are written to standard output.
    """

    print("=== 房价预测案例 ===")

    # A local RandomState seeded with 42 yields the same draws as
    # np.random.seed(42) followed by np.random.* calls, without reseeding
    # the process-wide generator as a side effect.
    rng = np.random.RandomState(42)
    n_samples = 1000

    # Features: area, number of rooms, floor, construction year.
    # The draw order below must stay fixed to keep the data reproducible.
    area = rng.normal(120, 30, n_samples)
    rooms = rng.randint(1, 6, n_samples)
    floor = rng.randint(1, 21, n_samples)
    year = rng.randint(1990, 2020, n_samples)
    noise = rng.normal(0, 50000, n_samples)

    # Target: price computed from the features plus Gaussian noise.
    price = (area * 100 + rooms * 20000 + floor * 1000 +
             (2020 - year) * 500 + noise)
    # Clamp prices from below at 100000.
    # NOTE(review): the noise-free formula averages about 90250, so this
    # floor clips a large share of the samples to exactly 100000 -- confirm
    # that the floor (or the area coefficient) is what was intended.
    price = np.maximum(price, 100000)

    # Assemble the DataFrame.
    df = pd.DataFrame({
        'area': area,
        'rooms': rooms,
        'floor': floor,
        'year': year,
        'price': price
    })

    # "\n" (a real newline) separates the sections below; the previous "\\n"
    # printed a literal backslash followed by "n".
    print("数据概览:")
    print(df.head())
    print(f"\n数据形状: {df.shape}")

    # Feature matrix and target.
    X = df[['area', 'rooms', 'floor', 'year']]
    y = df['price']

    # Hold out 30% of the samples for testing.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    # Fit the model and predict on the held-out split.
    rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_reg.fit(X_train, y_train)
    y_pred = rf_reg.predict(X_test)

    # Evaluation metrics; RMSE is the square root of MSE.
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print("\n模型性能:")
    print(f"均方根误差: {rmse:.2f}")
    print(f"决定系数: {r2:.4f}")

    # Feature importances, listed in column order.
    print("\n特征重要性:")
    for feature, importance in zip(X.columns, rf_reg.feature_importances_):
        print(f"{feature}: {importance:.4f}")

    # Sample prediction: 150 m², 3 rooms, 10th floor, built in 2015.
    # The model was fitted on a DataFrame, so the sample carries the same
    # column names; a bare list makes scikit-learn warn about feature names.
    sample_house = pd.DataFrame([[150, 3, 10, 2015]], columns=X.columns)
    predicted_price = rf_reg.predict(sample_house)[0]
    print("\n预测示例:")
    print(f"150平米，3室，10楼，2015年建造 -> 预测价格: {predicted_price:.2f}元")

# Run the house-price demo; its report is printed to standard output.
house_price_prediction()

总结

掌握 scikit-learn 是学习数据科学的重要基础，要点如下：

基础算法：理解分类、回归等基本算法
模型评估：掌握各种评估指标和交叉验证
特征工程：学会特征选择、预处理和缩放
实际应用：在房价预测、客户分类等场景中的应用
最佳实践：遵循机器学习的最佳实践
持续学习：不断探索新的算法和技术

通过系统学习这些概念，你将能够构建出有效的机器学习模型，解决实际的数据科学问题。

转载请注明：周志洋的博客 » Python实用技巧-机器学习入门