Why Web Scraping Fundamentals Matter
Web scraping is a key technique for acquiring data from the web, widely used for data collection, content monitoring, price comparison, and similar scenarios. Mastering the fundamentals, including HTML parsing, data extraction, and handling anti-scraping measures, is essential for data science and automation work.
BeautifulSoup Basics
1. Basic HTML Parsing
from bs4 import BeautifulSoup
import requests
from datetime import datetime

def beautifulsoup_basics():
    """Basic BeautifulSoup operations"""
    print("=== Basic BeautifulSoup operations ===")
    # Sample HTML content
    html_content = '''
    <html>
    <head>
        <title>Sample Page</title>
        <meta charset="utf-8">
    </head>
    <body>
        <h1>Welcome to Python Web Scraping</h1>
        <div class="content">
            <p>This is a sample paragraph.</p>
            <p>This is another paragraph.</p>
        </div>
        <ul>
            <li><a href="https://python.org">Python website</a></li>
            <li><a href="https://docs.python.org">Python docs</a></li>
        </ul>
    </body>
    </html>
    '''
    # Parse the HTML
    soup = BeautifulSoup(html_content, 'html.parser')
    # Get the title
    title = soup.find('title').text
    print(f"Page title: {title}")
    # Get all paragraphs
    paragraphs = soup.find_all('p')
    print("Paragraphs:")
    for i, p in enumerate(paragraphs, 1):
        print(f"  Paragraph {i}: {p.text}")
    # Get all links
    links = soup.find_all('a', href=True)
    print("Links:")
    for link in links:
        print(f"  {link.text}: {link['href']}")
    # Get an element with a specific class
    content_div = soup.find('div', class_='content')
    if content_div:
        print(f"Content area: {content_div.text.strip()}")

beautifulsoup_basics()
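Beyond find and find_all, the parsed tree can also be navigated directly through element attributes. A brief sketch of a few commonly used accessors, on a new made-up snippet:

from bs4 import BeautifulSoup

html = '<div class="content"><p>First</p><p id="second">Second</p></div>'
soup = BeautifulSoup(html, 'html.parser')

p = soup.find('p')
print(p.get_text())        # "First": the element's text content
print(p.name)              # "p": the tag name
print(p.parent['class'])   # ['content']: attributes behave like a dict
sibling = p.find_next_sibling('p')
print(sibling['id'])       # "second"
print(sibling.attrs)       # {'id': 'second'}: all attributes at once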
2. Advanced Selectors
def advanced_selectors():
    """Working with advanced selectors"""
    print("=== Working with advanced selectors ===")
    # Sample HTML with more structure
    html_content = '''
    <html>
    <body>
        <div class="news-container">
            <article class="news-item" data-id="1">
                <h2 class="title">Python 3.8 Released</h2>
                <p class="summary">The new version brings many improvements</p>
                <span class="date">2018-11-30</span>
                <div class="tags">
                    <span class="tag">Python</span>
                    <span class="tag">Programming</span>
                </div>
            </article>
            <article class="news-item" data-id="2">
                <h2 class="title">Machine Learning Trends</h2>
                <p class="summary">AI technology is evolving rapidly</p>
                <span class="date">2018-11-29</span>
                <div class="tags">
                    <span class="tag">AI</span>
                    <span class="tag">Machine Learning</span>
                </div>
            </article>
        </div>
    </body>
    </html>
    '''
    soup = BeautifulSoup(html_content, 'html.parser')
    # CSS selectors
    news_items = soup.select('.news-item')
    print(f"Found {len(news_items)} news items")
    # Iterate over the news items
    for item in news_items:
        title = item.select_one('.title').text
        summary = item.select_one('.summary').text
        date = item.select_one('.date').text
        tags = [tag.text for tag in item.select('.tag')]
        print(f"\nTitle: {title}")
        print(f"Summary: {summary}")
        print(f"Date: {date}")
        print(f"Tags: {', '.join(tags)}")
    # Attribute selectors
    items_with_id = soup.select('[data-id]')
    print(f"\nElements with a data-id: {len(items_with_id)}")
    # Combined selectors; ':-soup-contains' is the soupsieve replacement
    # for the deprecated ':contains' pseudo-class
    python_news = soup.select('.news-item .tag:-soup-contains("Python")')
    print(f"Python-related tags: {len(python_news)}")

advanced_selectors()
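The ':-soup-contains' pseudo-class used above requires a reasonably recent soupsieve; on older installs the same filter can be expressed with find_all and a string argument instead. A hedged equivalent on a minimal made-up snippet:

from bs4 import BeautifulSoup

html = '<div><span class="tag">Python</span><span class="tag">AI</span></div>'
soup = BeautifulSoup(html, 'html.parser')

# Exact-text match without CSS pseudo-classes
python_tags = soup.find_all('span', class_='tag', string='Python')
print(len(python_tags))  # 1

# Substring match via a predicate
python_tags = soup.find_all(
    'span', class_='tag',
    string=lambda s: s is not None and 'Python' in s,
)
print(len(python_tags))  # 1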
Using the requests Library
1. Basic HTTP Requests
def requests_basics():
    """Basic requests usage (simulated so the example runs offline)"""
    print("=== Basic requests usage ===")
    # Stand-in for requests.get / requests.post
    def simulate_request(url, method='GET', **kwargs):
        """Simulate an HTTP request"""
        print(f"Sending {method} request to: {url}")
        # Fake a response
        if 'python' in url.lower():
            return {
                'status_code': 200,
                'text': '''
                <html>
                <head><title>Python Website</title></head>
                <body>
                    <h1>Welcome to Python</h1>
                    <p>Python is a powerful programming language</p>
                </body>
                </html>
                ''',
                'headers': {'Content-Type': 'text/html; charset=utf-8'}
            }
        else:
            return {
                'status_code': 404,
                'text': '<html><body><h1>Page not found</h1></body></html>',
                'headers': {'Content-Type': 'text/html'}
            }
    # Basic GET request
    response = simulate_request('https://python.org')
    print(f"Status code: {response['status_code']}")
    print(f"Content type: {response['headers']['Content-Type']}")
    # Parse the response body
    if response['status_code'] == 200:
        soup = BeautifulSoup(response['text'], 'html.parser')
        title = soup.find('title').text
        print(f"Page title: {title}")
    # Request with parameters
    def simulate_search_request(query):
        """Simulate a search request"""
        print(f"Searching for: {query}")
        return {
            'status_code': 200,
            'text': f'<html><body><h1>Search results: {query}</h1></body></html>'
        }
    search_response = simulate_search_request('Python tutorial')
    print(f"Search status: {search_response['status_code']}")

requests_basics()
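The block above fakes responses so the example runs offline. Against a live site, the same flow maps onto the real requests API; a minimal sketch, assuming httpbin.org (a public echo service) as a harmless test target:

import requests
from bs4 import BeautifulSoup

# A real GET request with query parameters, headers, and a timeout
response = requests.get(
    'https://httpbin.org/html',
    params={'q': 'Python tutorial'},           # query string parameters
    headers={'User-Agent': 'my-scraper/0.1'},  # identify your client
    timeout=10,                                # never scrape without a timeout
)
response.raise_for_status()  # raise on 4xx/5xx instead of checking manually
print(f"Status code: {response.status_code}")
print(f"Content type: {response.headers.get('Content-Type')}")

soup = BeautifulSoup(response.text, 'html.parser')
title = soup.find('title')
if title:
    print(f"Page title: {title.text}")

# A Session reuses the TCP connection and keeps cookies/headers across calls
with requests.Session() as session:
    session.headers.update({'User-Agent': 'my-scraper/0.1'})
    r = session.get('https://httpbin.org/cookies', timeout=10)
    print(r.status_code)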
Data Extraction and Processing
1. Structured Data Extraction
def structured_data_extraction():
    """Structured data extraction"""
    print("=== Structured data extraction ===")
    # Sample e-commerce HTML
    ecommerce_html = '''
    <html>
    <body>
        <div class="products">
            <div class="product" data-id="1">
                <h3 class="product-name">iPhone 12</h3>
                <span class="price">¥6999</span>
                <div class="rating">4.5</div>
                <div class="reviews">128 reviews</div>
            </div>
            <div class="product" data-id="2">
                <h3 class="product-name">Samsung Galaxy S21</h3>
                <span class="price">¥5999</span>
                <div class="rating">4.3</div>
                <div class="reviews">95 reviews</div>
            </div>
            <div class="product" data-id="3">
                <h3 class="product-name">Huawei P40</h3>
                <span class="price">¥4999</span>
                <div class="rating">4.7</div>
                <div class="reviews">203 reviews</div>
            </div>
        </div>
    </body>
    </html>
    '''
    soup = BeautifulSoup(ecommerce_html, 'html.parser')
    # Extract the product records
    products = []
    for product_div in soup.select('.product'):
        product = {
            'id': product_div.get('data-id'),
            'name': product_div.select_one('.product-name').text,
            'price': product_div.select_one('.price').text,
            'rating': float(product_div.select_one('.rating').text),
            'reviews': product_div.select_one('.reviews').text
        }
        products.append(product)
    print("Extracted products:")
    for product in products:
        print(f"  {product['name']}: {product['price']} (rating: {product['rating']})")
    # Analysis: sort by rating
    sorted_products = sorted(products, key=lambda x: x['rating'], reverse=True)
    print("\nSorted by rating:")
    for product in sorted_products:
        print(f"  {product['name']}: {product['rating']}")
    # Price analysis
    prices = []
    for product in products:
        price_str = product['price'].replace('¥', '').replace(',', '')
        try:
            price = float(price_str)
            prices.append(price)
        except ValueError:
            pass
    if prices:
        avg_price = sum(prices) / len(prices)
        print(f"\nAverage price: ¥{avg_price:.2f}")
        print(f"Highest price: ¥{max(prices)}")
        print(f"Lowest price: ¥{min(prices)}")

structured_data_extraction()
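Once the records are collected as a list of dictionaries, persisting them takes only the standard library. A minimal sketch (file names and the truncated sample data are illustrative):

import csv
import json

# Assumed shape: the same list-of-dicts 'products' built above
products = [
    {'id': '1', 'name': 'iPhone 12', 'price': '¥6999', 'rating': 4.5},
    {'id': '2', 'name': 'Samsung Galaxy S21', 'price': '¥5999', 'rating': 4.3},
]

# JSON keeps the structure as-is; ensure_ascii=False preserves '¥'
with open('products.json', 'w', encoding='utf-8') as f:
    json.dump(products, f, ensure_ascii=False, indent=2)

# CSV needs the column names up front
with open('products.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=['id', 'name', 'price', 'rating'])
    writer.writeheader()
    writer.writerows(products)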
A Practical Example
1. A News Scraping System
def news_scraping_system():
    """News scraping system"""
    print("=== News scraping system ===")
    # Stand-in for fetching from real news sites
    def simulate_news_fetch(url):
        """Simulate fetching a news page"""
        print(f"Fetching news: {url}")
        # Fake a different HTML structure per site
        if 'tech' in url:
            return '''
            <html>
            <body>
                <div class="article">
                    <h1>New Features in Python 3.8</h1>
                    <div class="content">Python 3.8 brings the walrus operator and more...</div>
                    <div class="meta">
                        <span class="author">Tech Editor</span>
                        <span class="date">2018-11-30</span>
                    </div>
                </div>
            </body>
            </html>
            '''
        elif 'sports' in url:
            return '''
            <html>
            <body>
                <div class="article">
                    <h1>World Cup Final About to Begin</h1>
                    <div class="content">The World Cup final takes place tonight...</div>
                    <div class="meta">
                        <span class="author">Sports Reporter</span>
                        <span class="date">2018-11-30</span>
                    </div>
                </div>
            </body>
            </html>
            '''
        else:
            return '''
            <html>
            <body>
                <div class="article">
                    <h1>Business News</h1>
                    <div class="content">The stock market performed well today...</div>
                    <div class="meta">
                        <span class="author">Finance Reporter</span>
                        <span class="date">2018-11-30</span>
                    </div>
                </div>
            </body>
            </html>
            '''
    def extract_news_data(html_content):
        """Extract the news fields"""
        soup = BeautifulSoup(html_content, 'html.parser')
        article = soup.find('div', class_='article')
        if not article:
            return None
        return {
            'title': article.find('h1').text.strip(),
            'content': article.find('div', class_='content').text.strip(),
            'author': article.find('span', class_='author').text.strip(),
            'date': article.find('span', class_='date').text.strip(),
            'scraped_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }
    # Simulate several news sources
    news_urls = [
        'https://tech.example.com/news1',
        'https://sports.example.com/news1',
        'https://finance.example.com/news1'
    ]
    # Scrape the news
    all_news = []
    for url in news_urls:
        html_content = simulate_news_fetch(url)
        news_data = extract_news_data(html_content)
        if news_data:
            all_news.append(news_data)
    # Show the results
    print(f"Successfully scraped {len(all_news)} articles:")
    for news in all_news:
        print(f"\nTitle: {news['title']}")
        print(f"Author: {news['author']}")
        print(f"Date: {news['date']}")
        print(f"Scraped at: {news['scraped_at']}")
        print(f"Content: {news['content'][:100]}...")

news_scraping_system()
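To turn the simulation into a real scraper, simulate_news_fetch would be swapped for an actual HTTP fetch with basic politeness: a delay between requests, and error handling so one bad page does not abort the whole run. A minimal sketch, reusing the extract_news_data parser above:

import time
import requests

def fetch_news(url, delay=1.0):
    """Fetch one page politely; return the HTML or None on failure."""
    try:
        response = requests.get(
            url,
            headers={'User-Agent': 'news-scraper/0.1'},
            timeout=10,
        )
        response.raise_for_status()
        return response.text
    except requests.RequestException as exc:
        print(f"Failed to fetch {url}: {exc}")
        return None
    finally:
        time.sleep(delay)  # rate-limit regardless of success

# Usage: same loop shape as above, but over real URLs
# for url in news_urls:
#     html_content = fetch_news(url)
#     if html_content:
#         news_data = extract_news_data(html_content)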
Summary
Mastering Python web scraping fundamentals is key to acquiring data:
- BeautifulSoup: understand HTML parsing, selector usage, and data extraction
- The requests library: master HTTP requests, session management, and request headers
- Data extraction: learn structured extraction, text processing, and basic analysis
- Practical applications: news scraping, price monitoring, and similar scenarios
- Anti-scraping measures: know the common countermeasures and how to respond
- Best practices: follow scraping etiquette and applicable laws and regulations (see the robots.txt sketch after this list)
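As a concrete illustration of those last two points, this hedged sketch checks robots.txt with the standard library's urllib.robotparser and throttles its requests (the URLs and user-agent string are placeholders):

import time
from urllib.robotparser import RobotFileParser

import requests

AGENT = 'my-scraper/0.1'  # placeholder user agent

# Check robots.txt before crawling
robots = RobotFileParser()
robots.set_url('https://example.com/robots.txt')
robots.read()

urls = ['https://example.com/page1', 'https://example.com/page2']
for url in urls:
    if not robots.can_fetch(AGENT, url):
        print(f"Disallowed by robots.txt, skipping: {url}")
        continue
    response = requests.get(url, headers={'User-Agent': AGENT}, timeout=10)
    print(f"{url}: {response.status_code}")
    time.sleep(2)  # throttle: at most one request every two seconds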
With a systematic grasp of these concepts, you will be able to build efficient, reliable web scrapers that supply data for analysis and application development.