Why Web Scraping Fundamentals Matter
Web scraping is a key technique for acquiring data from the web, widely used for data collection, content monitoring, price comparison, and similar scenarios. Mastering the fundamentals, including HTML parsing, data extraction, and handling anti-scraping measures, is essential for data science and automation work.
BeautifulSoup Basics
1. Basic HTML Parsing
from bs4 import BeautifulSoup
import requests
from datetime import datetime

def beautifulsoup_basics():
    """Basic BeautifulSoup operations"""
    print("=== Basic BeautifulSoup operations ===")
    # Sample HTML content
    html_content = '''
    <html>
    <head>
        <title>Sample Page</title>
        <meta charset="utf-8">
    </head>
    <body>
        <h1>Welcome to Python Web Scraping</h1>
        <div class="content">
            <p>This is a sample paragraph.</p>
            <p>This is another paragraph.</p>
        </div>
        <ul>
            <li><a href="https://python.org">Python website</a></li>
            <li><a href="https://docs.python.org">Python docs</a></li>
        </ul>
    </body>
    </html>
    '''
    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')
    # Get the title
    title = soup.find('title').text
    print(f"Page title: {title}")
    # Get all paragraphs
    paragraphs = soup.find_all('p')
    print("Paragraphs:")
    for i, p in enumerate(paragraphs, 1):
        print(f"  Paragraph {i}: {p.text}")
    # Get all links
    links = soup.find_all('a', href=True)
    print("Links:")
    for link in links:
        print(f"  {link.text}: {link['href']}")
    # Get an element with a specific class
    content_div = soup.find('div', class_='content')
    if content_div:
        print(f"Content area: {content_div.text.strip()}")

beautifulsoup_basics()
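Beyond find and find_all, the parsed tree can also be navigated directly through element attributes. A brief sketch of a few commonly used accessors, on a new made-up snippet:

from bs4 import BeautifulSoup

html = '<div class="content"><p>First</p><p id="second">Second</p></div>'
soup = BeautifulSoup(html, 'html.parser')

p = soup.find('p')
print(p.get_text())        # "First": the element's text content
print(p.name)              # "p": the tag name
print(p.parent['class'])   # ['content']: attributes behave like a dict
sibling = p.find_next_sibling('p')
print(sibling['id'])       # "second"
print(sibling.attrs)       # {'id': 'second'}: all attributes at once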
2. Advanced Selectors
def advanced_selectors():
    """Working with advanced selectors"""
    print("=== Working with advanced selectors ===")
    # Sample HTML with more structure
    html_content = '''
    <html>
    <body>
        <div class="news-container">
            <article class="news-item" data-id="1">
                <h2 class="title">Python 3.8 Released</h2>
                <p class="summary">The new version brings many improvements</p>
                <span class="date">2018-11-30</span>
                <div class="tags">
                    <span class="tag">Python</span>
                    <span class="tag">Programming</span>
                </div>
            </article>
            <article class="news-item" data-id="2">
                <h2 class="title">Machine Learning Trends</h2>
                <p class="summary">AI technology is evolving rapidly</p>
                <span class="date">2018-11-29</span>
                <div class="tags">
                    <span class="tag">AI</span>
                    <span class="tag">Machine Learning</span>
                </div>
            </article>
        </div>
    </body>
    </html>
    '''
    soup = BeautifulSoup(html_content, 'html.parser')
    # CSS selectors
    news_items = soup.select('.news-item')
    print(f"Found {len(news_items)} news items")
    # Iterate over the news items
    for item in news_items:
        title = item.select_one('.title').text
        summary = item.select_one('.summary').text
        date = item.select_one('.date').text
        tags = [tag.text for tag in item.select('.tag')]
        print(f"\nTitle: {title}")
        print(f"Summary: {summary}")
        print(f"Date: {date}")
        print(f"Tags: {', '.join(tags)}")
    # Attribute selectors
    items_with_id = soup.select('[data-id]')
    print(f"\nElements with a data-id: {len(items_with_id)}")
    # Combined selectors; ':-soup-contains' is the soupsieve replacement
    # for the deprecated ':contains' pseudo-class
    python_news = soup.select('.news-item .tag:-soup-contains("Python")')
    print(f"Python-related tags: {len(python_news)}")

advanced_selectors()
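The ':-soup-contains' pseudo-class used above requires a reasonably recent soupsieve; on older installs the same filter can be expressed with find_all and a string argument instead. A hedged equivalent on a minimal made-up snippet:

from bs4 import BeautifulSoup

html = '<div><span class="tag">Python</span><span class="tag">AI</span></div>'
soup = BeautifulSoup(html, 'html.parser')

# Exact-text match without CSS pseudo-classes
python_tags = soup.find_all('span', class_='tag', string='Python')
print(len(python_tags))  # 1

# Substring match via a predicate
python_tags = soup.find_all(
    'span', class_='tag',
    string=lambda s: s is not None and 'Python' in s,
)
print(len(python_tags))  # 1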
Using the requests Library
1. Basic HTTP Requests
def requests_basics():
    """Basic requests usage (simulated so the example runs offline)"""
    print("=== Basic requests usage ===")
    # Stand-in for requests.get / requests.post
    def simulate_request(url, method='GET', **kwargs):
        """Simulate an HTTP request"""
        print(f"Sending {method} request to: {url}")
        # Fake a response
        if 'python' in url.lower():
            return {
                'status_code': 200,
                'text': '''
                <html>
                <head><title>Python Website</title></head>
                <body>
                    <h1>Welcome to Python</h1>
                    <p>Python is a powerful programming language</p>
                </body>
                </html>
                ''',
                'headers': {'Content-Type': 'text/html; charset=utf-8'}
            }
        else:
            return {
                'status_code': 404,
                'text': '<html><body><h1>Page not found</h1></body></html>',
                'headers': {'Content-Type': 'text/html'}
            }
    # Basic GET request
    response = simulate_request('https://python.org')
    print(f"Status code: {response['status_code']}")
    print(f"Content type: {response['headers']['Content-Type']}")
    # Parse the response body
    if response['status_code'] == 200:
        soup = BeautifulSoup(response['text'], 'html.parser')
        title = soup.find('title').text
        print(f"Page title: {title}")
    # Request with parameters
    def simulate_search_request(query):
        """Simulate a search request"""
        print(f"Searching for: {query}")
        return {
            'status_code': 200,
            'text': f'<html><body><h1>Search results: {query}</h1></body></html>'
        }
    search_response = simulate_search_request('Python tutorial')
    print(f"Search status: {search_response['status_code']}")

requests_basics()
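The block above fakes responses so the example runs offline. Against a live site, the same flow maps onto the real requests API; a minimal sketch, assuming httpbin.org (a public echo service) as a harmless test target:

import requests
from bs4 import BeautifulSoup

# A real GET request with query parameters, headers, and a timeout
response = requests.get(
    'https://httpbin.org/html',
    params={'q': 'Python tutorial'},           # query string parameters
    headers={'User-Agent': 'my-scraper/0.1'},  # identify your client
    timeout=10,                                # never scrape without a timeout
)
response.raise_for_status()  # raise on 4xx/5xx instead of checking manually
print(f"Status code: {response.status_code}")
print(f"Content type: {response.headers.get('Content-Type')}")

soup = BeautifulSoup(response.text, 'html.parser')
title = soup.find('title')
if title:
    print(f"Page title: {title.text}")

# A Session reuses the TCP connection and keeps cookies/headers across calls
with requests.Session() as session:
    session.headers.update({'User-Agent': 'my-scraper/0.1'})
    r = session.get('https://httpbin.org/cookies', timeout=10)
    print(r.status_code)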
Data Extraction and Processing
1. Structured Data Extraction
def structured_data_extraction():
    """Structured data extraction"""
    print("=== Structured data extraction ===")
    # Sample e-commerce HTML
    ecommerce_html = '''
    <html>
    <body>
        <div class="products">
            <div class="product" data-id="1">
                <h3 class="product-name">iPhone 12</h3>
                <span class="price">¥6999</span>
                <div class="rating">4.5</div>
                <div class="reviews">128 reviews</div>
            </div>
            <div class="product" data-id="2">
                <h3 class="product-name">Samsung Galaxy S21</h3>
                <span class="price">¥5999</span>
                <div class="rating">4.3</div>
                <div class="reviews">95 reviews</div>
            </div>
            <div class="product" data-id="3">
                <h3 class="product-name">Huawei P40</h3>
                <span class="price">¥4999</span>
                <div class="rating">4.7</div>
                <div class="reviews">203 reviews</div>
            </div>
        </div>
    </body>
    </html>
    '''
    soup = BeautifulSoup(ecommerce_html, 'html.parser')
    # Extract the product records
    products = []
    for product_div in soup.select('.product'):
        product = {
            'id': product_div.get('data-id'),
            'name': product_div.select_one('.product-name').text,
            'price': product_div.select_one('.price').text,
            'rating': float(product_div.select_one('.rating').text),
            'reviews': product_div.select_one('.reviews').text
        }
        products.append(product)
    print("Extracted products:")
    for product in products:
        print(f"  {product['name']}: {product['price']} (rating: {product['rating']})")
    # Analysis: sort by rating
    sorted_products = sorted(products, key=lambda x: x['rating'], reverse=True)
    print("\nSorted by rating:")
    for product in sorted_products:
        print(f"  {product['name']}: {product['rating']}")
    # Price analysis
    prices = []
    for product in products:
        price_str = product['price'].replace('¥', '').replace(',', '')
        try:
            price = float(price_str)
            prices.append(price)
        except ValueError:
            pass
    if prices:
        avg_price = sum(prices) / len(prices)
        print(f"\nAverage price: ¥{avg_price:.2f}")
        print(f"Highest price: ¥{max(prices)}")
        print(f"Lowest price: ¥{min(prices)}")

structured_data_extraction()
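Once the records are collected as a list of dictionaries, persisting them takes only the standard library. A minimal sketch (file names and the truncated sample data are illustrative):

import csv
import json

# Assumed shape: the same list-of-dicts 'products' built above
products = [
    {'id': '1', 'name': 'iPhone 12', 'price': '¥6999', 'rating': 4.5},
    {'id': '2', 'name': 'Samsung Galaxy S21', 'price': '¥5999', 'rating': 4.3},
]

# JSON keeps the structure as-is; ensure_ascii=False preserves '¥'
with open('products.json', 'w', encoding='utf-8') as f:
    json.dump(products, f, ensure_ascii=False, indent=2)

# CSV needs the column names up front
with open('products.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=['id', 'name', 'price', 'rating'])
    writer.writeheader()
    writer.writerows(products)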
A Practical Example
1. A News Scraping System
def news_scraping_system():
    """News scraping system"""
    print("=== News scraping system ===")
    # Stand-in for fetching from real news sites
    def simulate_news_fetch(url):
        """Simulate fetching a news page"""
        print(f"Fetching news: {url}")
        # Fake a different HTML structure per site
        if 'tech' in url:
            return '''
            <html>
            <body>
                <div class="article">
                    <h1>New Features in Python 3.8</h1>
                    <div class="content">Python 3.8 brings the walrus operator and more...</div>
                    <div class="meta">
                        <span class="author">Tech Editor</span>
                        <span class="date">2018-11-30</span>
                    </div>
                </div>
            </body>
            </html>
            '''
        elif 'sports' in url:
            return '''
            <html>
            <body>
                <div class="article">
                    <h1>World Cup Final About to Begin</h1>
                    <div class="content">The World Cup final takes place tonight...</div>
                    <div class="meta">
                        <span class="author">Sports Reporter</span>
                        <span class="date">2018-11-30</span>
                    </div>
                </div>
            </body>
            </html>
            '''
        else:
            return '''
            <html>
            <body>
                <div class="article">
                    <h1>Business News</h1>
                    <div class="content">The stock market performed well today...</div>
                    <div class="meta">
                        <span class="author">Finance Reporter</span>
                        <span class="date">2018-11-30</span>
                    </div>
                </div>
            </body>
            </html>
            '''
    def extract_news_data(html_content):
        """Extract the news fields"""
        soup = BeautifulSoup(html_content, 'html.parser')
        article = soup.find('div', class_='article')
        if not article:
            return None
        return {
            'title': article.find('h1').text.strip(),
            'content': article.find('div', class_='content').text.strip(),
            'author': article.find('span', class_='author').text.strip(),
            'date': article.find('span', class_='date').text.strip(),
            'scraped_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }
    # Simulate several news sources
    news_urls = [
        'https://tech.example.com/news1',
        'https://sports.example.com/news1',
        'https://finance.example.com/news1'
    ]
    # Scrape the news
    all_news = []
    for url in news_urls:
        html_content = simulate_news_fetch(url)
        news_data = extract_news_data(html_content)
        if news_data:
            all_news.append(news_data)
    # Show the results
    print(f"Successfully scraped {len(all_news)} articles:")
    for news in all_news:
        print(f"\nTitle: {news['title']}")
        print(f"Author: {news['author']}")
        print(f"Date: {news['date']}")
        print(f"Scraped at: {news['scraped_at']}")
        print(f"Content: {news['content'][:100]}...")

news_scraping_system()
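To turn the simulation into a real scraper, simulate_news_fetch would be swapped for an actual HTTP fetch with basic politeness: a delay between requests, and error handling so one bad page does not abort the whole run. A minimal sketch, reusing the extract_news_data parser above:

import time
import requests

def fetch_news(url, delay=1.0):
    """Fetch one page politely; return the HTML or None on failure."""
    try:
        response = requests.get(
            url,
            headers={'User-Agent': 'news-scraper/0.1'},
            timeout=10,
        )
        response.raise_for_status()
        return response.text
    except requests.RequestException as exc:
        print(f"Failed to fetch {url}: {exc}")
        return None
    finally:
        time.sleep(delay)  # rate-limit regardless of success

# Usage: same loop shape as above, but over real URLs
# for url in news_urls:
#     html_content = fetch_news(url)
#     if html_content:
#         news_data = extract_news_data(html_content)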
Summary
Mastering Python web scraping fundamentals is key to acquiring data:
- BeautifulSoup: understand HTML parsing, selector usage, and data extraction
- The requests library: master HTTP requests, session management, and request headers
- Data extraction: learn structured extraction, text processing, and basic analysis
- Practical applications: news scraping, price monitoring, and similar scenarios
- Anti-scraping measures: know the common countermeasures and how to respond
- Best practices: follow scraping etiquette and applicable laws and regulations (see the robots.txt sketch after this list)
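As a concrete illustration of those last two points, this hedged sketch checks robots.txt with the standard library's urllib.robotparser and throttles its requests (the URLs and user-agent string are placeholders):

import time
from urllib.robotparser import RobotFileParser

import requests

AGENT = 'my-scraper/0.1'  # placeholder user agent

# Check robots.txt before crawling
robots = RobotFileParser()
robots.set_url('https://example.com/robots.txt')
robots.read()

urls = ['https://example.com/page1', 'https://example.com/page2']
for url in urls:
    if not robots.can_fetch(AGENT, url):
        print(f"Disallowed by robots.txt, skipping: {url}")
        continue
    response = requests.get(url, headers={'User-Agent': AGENT}, timeout=10)
    print(f"{url}: {response.status_code}")
    time.sleep(2)  # throttle: at most one request every two seconds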
With a systematic grasp of these concepts, you will be able to build efficient, reliable web scrapers that supply data for analysis and application development.