Python 爬虫分享（爬取静态网站）

前尘小筑 | 2024-10-18 | 2024-10-18

思维导图

+----------------------------+
|        导入库              |
+------------+---------------+
             |
             v
+------------+---------------+
|  定义爬虫类                |
|  - 名称                    |
|  - 起始URL列表             |
|  - 自定义设置              |
+------------+---------------+
             |
             v
+------------+---------------+
|  解析函数                  |
|  - 解析响应                |
|  - 确定保存路径            |
|  - 保存HTML内容            |
|  - 查找资源 (CSS, JS, 图片)|
|  - 递归抓取内部链接        |
+------------+---------------+
             |
             v
+------------+---------------+
|  保存资源函数              |
|  - 解析资源URL             |
|  - 确定保存路径            |
|  - 保存资源内容            |
+------------+---------------+
             |
             v
+------------+---------------+
|  运行爬虫                  |
|  - 设置输出目录            |
|  - 初始化爬虫              |
|  - 启动爬虫进程            |
+----------------------------+

详细

设置和初始化
导入必要的库，定义爬虫类，并设置初始URL列表和自定义设置。

import os
import scrapy
from bs4 import BeautifulSoup
from scrapy.crawler import CrawlerProcess
from urllib.parse import urljoin, urlparse, urlunparse

定义爬虫类

class WebsiteSpider(scrapy.Spider):
    """Spider configuration: its name, start URLs and per-spider settings."""

    # Name used to identify this spider
    name = "mnchen"
    # URLs the crawl starts from
    start_urls = ['https://mnchen.cn']
    # Settings that override the defaults for this spider only
    custom_settings = dict(
        USER_AGENT='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36',
        DOWNLOAD_DELAY=1,
        RETRY_ENABLED=False,
        LOG_LEVEL='ERROR',
    )

解析函数
处理每个页面响应并使用 BeautifulSoup 解析 HTML 内容：先确定 HTML 的保存路径并写入文件，再查找页面内的 CSS、JS 和图片资源并逐个下载，最后递归抓取以 / 开头的站内链接。


# Parse callback: handles every page response
def parse(self, response):
    """Save one HTML page locally and schedule its assets and internal links.

    The page is written to ``<url path>/index.html`` relative to the current
    working directory; the start URL itself becomes ``index.html``.

    Args:
        response: The response for an HTML page; its ``url`` and ``text``
            are read.

    Yields:
        One request per ``<link>``/``<script>``/``<img>`` asset (handled by
        ``save_resource``) and one per root-relative ``<a href>`` (handled
        by ``parse`` again).
    """
    soup = BeautifulSoup(response.text, 'html.parser')
    # Work out where this page is stored on disk
    if response.url == self.start_urls[0]:
        filename = 'index.html'
    else:
        page_path = response.url.replace(self.start_urls[0], '').strip('/')
        filename = os.path.join(page_path, 'index.html')
    # Create the parent directory (if there is one) and save the HTML page
    directory = os.path.dirname(filename)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(response.text)
    # Download the CSS, JS and image assets referenced by the page
    for resource in soup.find_all(['link', 'script', 'img']):
        url = resource.get('href') or resource.get('src')
        if not url:
            continue
        parsed_url = urlparse(url)
        # data:, javascript:, mailto: ... values do not point at a file to fetch
        if parsed_url.scheme not in ('', 'http', 'https'):
            continue
        sanitized_path = urlunparse(parsed_url._replace(query=''))
        absolute_url = urljoin(response.url, sanitized_path)
        yield scrapy.Request(absolute_url, callback=self.save_resource, meta={'resource_url': url})
    # Follow internal links so the crawl recurses through the site
    for link in soup.find_all('a'):
        href = link.get('href')
        # '//host/path' is protocol-relative and points at another host,
        # so only a single leading slash counts as an internal link
        if href and href.startswith('/') and not href.startswith('//'):
            yield response.follow(href, self.parse)

保存资源函数
处理资源文件（CSS、JS、图片）的保存。

# Callback that stores one downloaded asset on disk
def save_resource(self, response):
    """Write a downloaded asset (CSS, JS, image, ...) to a local file.

    The target path is built from the URL as it appeared in the page
    (``response.meta['resource_url']``): host (if any) plus path, without
    scheme, query string or fragment. Empty, ``.`` and ``..`` segments are
    dropped so the file always lands inside the current working directory.

    Args:
        response: Object whose ``meta['resource_url']`` holds the original
            attribute value, ``url`` the fetched URL and ``body`` the raw
            bytes to write.
    """
    resource_url = response.meta['resource_url']
    parsed_url = urlparse(resource_url)
    # data:, javascript:, ... values do not name a file worth saving
    if parsed_url.scheme not in ('', 'http', 'https'):
        return
    # Keep the host as a top-level directory for absolute URLs; the scheme
    # is left out because ':' is not a valid directory character on Windows
    raw_path = (parsed_url.hostname or '') + '/' + parsed_url.path
    # Dropping '..' keeps a hostile or sloppy URL from escaping the output dir
    segments = [
        segment
        for segment in raw_path.replace('\\', '/').split('/')
        if segment not in ('', '.', '..')
    ]
    path = os.path.join(*segments) if segments else ''
    if not path:
        path = response.url.split("/")[-1]
    # Make sure the file has some extension
    if not os.path.splitext(path)[1]:
        path += ".html"
    # os.makedirs('') raises, so only create a directory when there is one
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(path, 'wb') as f:
        f.write(response.body)

运行爬虫
设置输出目录，初始化并启动爬虫进程。

# Directory that receives the crawled files
output_dir = '2024-10-17'
# exist_ok=True replaces the separate exists() check: no gap between
# checking and creating, and no error when the directory is already there
os.makedirs(output_dir, exist_ok=True)

# Switch into the output directory so relative file paths land inside it
os.chdir(output_dir)

# Run the spider
process = CrawlerProcess()
process.crawl(WebsiteSpider)
process.start()

完整代码

import os
import scrapy
from bs4 import BeautifulSoup
from scrapy.crawler import CrawlerProcess
from urllib.parse import urljoin, urlparse, urlunparse

# Spider definition
class WebsiteSpider(scrapy.Spider):
    """Spider that saves a site's HTML pages and their assets to local files.

    Pages are handled by ``parse`` and assets by ``save_resource``; both
    write to paths relative to the current working directory.
    """

    # Name used to identify this spider
    name = "mnchen"

    # URLs the crawl starts from
    start_urls = ['https://mnchen.cn']

    # Settings that override the defaults for this spider only
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.114 Safari/537.36',
        'DOWNLOAD_DELAY': 1,
        'RETRY_ENABLED': False,
        'LOG_LEVEL': 'ERROR'
    }

    # Parse callback: handles every page response
    def parse(self, response):
        """Save one HTML page locally and schedule its assets and internal links.

        The page is written to ``<url path>/index.html`` relative to the
        current working directory; the start URL itself becomes
        ``index.html``.

        Args:
            response: The response for an HTML page; its ``url`` and
                ``text`` are read.

        Yields:
            One request per ``<link>``/``<script>``/``<img>`` asset (handled
            by ``save_resource``) and one per root-relative ``<a href>``
            (handled by ``parse`` again).
        """
        soup = BeautifulSoup(response.text, 'html.parser')

        # Work out where this page is stored on disk
        if response.url == self.start_urls[0]:
            filename = 'index.html'
        else:
            page_path = response.url.replace(self.start_urls[0], '').strip('/')
            filename = os.path.join(page_path, 'index.html')

        # Create the parent directory (if there is one) and save the HTML page
        directory = os.path.dirname(filename)
        if directory:
            os.makedirs(directory, exist_ok=True)

        with open(filename, 'w', encoding='utf-8') as f:
            f.write(response.text)

        # Download the CSS, JS and image assets referenced by the page
        for resource in soup.find_all(['link', 'script', 'img']):
            url = resource.get('href') or resource.get('src')
            if not url:
                continue
            parsed_url = urlparse(url)
            # data:, javascript:, mailto: ... values do not point at a file to fetch
            if parsed_url.scheme not in ('', 'http', 'https'):
                continue
            sanitized_path = urlunparse(parsed_url._replace(query=''))
            absolute_url = urljoin(response.url, sanitized_path)
            yield scrapy.Request(absolute_url, callback=self.save_resource, meta={'resource_url': url})

        # Follow internal links so the crawl recurses through the site
        for link in soup.find_all('a'):
            href = link.get('href')
            # '//host/path' is protocol-relative and points at another host,
            # so only a single leading slash counts as an internal link
            if href and href.startswith('/') and not href.startswith('//'):
                yield response.follow(href, self.parse)

    # Callback that stores one downloaded asset on disk
    def save_resource(self, response):
        """Write a downloaded asset (CSS, JS, image, ...) to a local file.

        The target path is built from the URL as it appeared in the page
        (``response.meta['resource_url']``): host (if any) plus path, without
        scheme, query string or fragment. Empty, ``.`` and ``..`` segments
        are dropped so the file always lands inside the current working
        directory.

        Args:
            response: Object whose ``meta['resource_url']`` holds the
                original attribute value, ``url`` the fetched URL and
                ``body`` the raw bytes to write.
        """
        resource_url = response.meta['resource_url']
        parsed_url = urlparse(resource_url)
        # data:, javascript:, ... values do not name a file worth saving
        if parsed_url.scheme not in ('', 'http', 'https'):
            return

        # Keep the host as a top-level directory for absolute URLs; the scheme
        # is left out because ':' is not a valid directory character on Windows
        raw_path = (parsed_url.hostname or '') + '/' + parsed_url.path
        # Dropping '..' keeps a hostile or sloppy URL from escaping the output dir
        segments = [
            segment
            for segment in raw_path.replace('\\', '/').split('/')
            if segment not in ('', '.', '..')
        ]
        path = os.path.join(*segments) if segments else ''

        if not path:
            path = response.url.split("/")[-1]

        # Make sure the file has some extension
        if not os.path.splitext(path)[1]:
            path += ".html"

        # os.makedirs('') raises, so only create a directory when there is one
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(path, 'wb') as f:
            f.write(response.body)

# Directory that receives the crawled files
output_dir = '2024-10-17'
# exist_ok=True replaces the separate exists() check: no gap between
# checking and creating, and no error when the directory is already there
os.makedirs(output_dir, exist_ok=True)

# Switch into the output directory so relative file paths land inside it
os.chdir(output_dir)

# Run the spider
process = CrawlerProcess()
process.crawl(WebsiteSpider)
process.start()

免责声明

免责声明：

本博客/网站/教程中所分享的Python爬虫代码和相关内容仅供学习和研究使用。作者不对因使用本代码或内容而产生的任何直接或间接损失负责。使用者需自行承担使用本代码的风险。

特别提醒：

合法性：在使用Python爬虫时，请务必遵守相关法律法规，不要侵犯他人的合法权益。确保爬虫操作获得了目标网站的明确许可。
隐私保护：请勿使用爬虫采集、存储、传播他人的个人隐私信息，确保遵循数据隐私保护相关法律规定。
资源使用：合理使用计算资源，避免对目标网站造成负担或损害。请勿进行恶意爬取、滥用爬虫工具等行为。
责任自负：因使用者违反相关法律法规、侵犯他人权益或因使用本博客/网站/教程中的内容而造成的任何法律责任和经济损失，均由使用者自行承担。

本博客/网站/教程仅提供技术分享，作者保留对本免责声明的修改权。

感谢您的理解与配合。