# Preface
In the previous posts we used bs4 to scrape Biquge and stripped out its spammy boilerplate text. In practice, though, a scraper never just prints to the terminal; the results are written to a file. So how do we write the scraped data to a text file?
Take the novel from the previous Biquge article as an example: click to jump.
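Before getting into the scraper itself, here is a minimal sketch of the core idea, writing a string to a .txt file with UTF-8 encoding. The file name and sample text below are placeholders, not taken from the real scraper:

```python
# Minimal sketch: write a string to a txt file, UTF-8 encoded.
# "demo.txt" and the sample text are placeholders.
text = "第一章 示例内容"

with open("demo.txt", "w", encoding="utf-8") as f:
    f.write(text + "\n")

print("written to demo.txt")
```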
# Demo: scraping a single chapter
# Save the data straight to a txt file, without any formatting
```python
# Custom save path, e.g. save to the desktop
file_path = "C:\\Users\\your_username\\Desktop\\filename.txt"

# Create or open the txt file for writing; note the encoding, utf-8 here
with open(file_path, "w", encoding="utf-8") as file:
    # Write the chapter title
    for h1 in h1_elements:
        file.write("本文标题: " + h1.text + "\n")
    # Write the body text, stripping the unwanted boilerplate
    for p in p_elements:
        cleaned_text = p.text.replace("请收藏本站:https://www.bqgam.com。笔趣阁手机版:https://m.bqgam.com", "").replace("『点此报错』", "").replace("『加入书签』", "")
        file.write("本文正文: " + cleaned_text + "\n")

# Print a confirmation once the write succeeds
print(f"内容已写入 {file_path} 文件")
```
Complete code:
```python
import requests
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

# Create a session object (not actually used below)
session = requests.session()

# Index URL of the novel
host = "https://www.bqgam.com/index/11303/"
# URL of the chapter page to scrape
page = "https://www.bqgam.com/index/11303/1.html"

def requestUrl(url):
    # Set the request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36',
    }
    # Build the request object
    req = Request(url, headers=headers)
    # Open the URL
    html = urlopen(req)
    # Read the HTML content and decode it as utf-8
    html = html.read().decode('utf-8')
    # Return the HTML content
    return html

# Adjust everything from here on to match the tags of the page you are actually scraping
def getPage(page):
    # Fetch the page's HTML
    html = requestUrl(page)
    # Parse the HTML into a soup object
    soup = BeautifulSoup(html, "html.parser")
    # Chapter title: <h1 class="wap_none">
    h1_elements = soup.find_all('h1', attrs={'class': 'wap_none'})
    # Chapter body: <div class="Readarea ReadAjax_content">
    p_elements = soup.find_all('div', attrs={'class': 'Readarea ReadAjax_content'})
    # Custom save path, e.g. save to the desktop
    file_path = "C:\\Users\\xingchen\\Desktop\\novel_content.txt"
    # Create or open the txt file for writing
    with open(file_path, "w", encoding="utf-8") as file:
        # Write the chapter title
        for h1 in h1_elements:
            file.write("本文标题: " + h1.text + "\n")
        # Write the body text, stripping the unwanted boilerplate
        for p in p_elements:
            cleaned_text = p.text.replace("请收藏本站:https://www.bqgam.com。笔趣阁手机版:https://m.bqgam.com", "").replace("『点此报错』", "").replace("『加入书签』", "")
            file.write("本文正文: " + cleaned_text + "\n")
    print(f"内容已写入 {file_path} 文件")

getPage(page)
```
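If the class names on the page ever change, `find_all` simply returns an empty list and the script quietly writes an almost empty file. A small sanity check of my own (not part of the original code) makes that failure visible; in the real script the same `if` would go right after the two `find_all` calls in `getPage`:

```python
from bs4 import BeautifulSoup

# Placeholder HTML just to make the check runnable on its own
soup = BeautifulSoup("<html><body><p>no matching tags here</p></body></html>", "html.parser")

h1_elements = soup.find_all('h1', attrs={'class': 'wap_none'})
p_elements = soup.find_all('div', attrs={'class': 'Readarea ReadAjax_content'})

if not h1_elements or not p_elements:
    print("Warning: title or body tags not found; the page structure may have changed")
```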
# Formatting the scraped data
Add line breaks so the saved text wraps automatically, one sentence per line:
```python
# Insert a line break after each sentence-ending punctuation mark
cleaned_text_with_breaks = cleaned_text.replace("。", "。\n").replace("!", "!\n").replace("?", "?\n")
file.write("本文正文: " + cleaned_text_with_breaks + "\n\n")
```
Complete code:
```python
import requests
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

# Create a session object (not actually used below)
session = requests.session()

# Index URL of the novel
host = "https://www.bqgam.com/index/11303/"
# URL of the chapter page to scrape
page = "https://www.bqgam.com/index/11303/1.html"

def requestUrl(url):
    # Set the request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36',
    }
    # Build the request object
    req = Request(url, headers=headers)
    # Open the URL
    html = urlopen(req)
    # Read the HTML content and decode it as utf-8
    html = html.read().decode('utf-8')
    # Return the HTML content
    return html

# Adjust everything from here on to match the tags of the page you are actually scraping
def getPage(page):
    # Fetch the page's HTML
    html = requestUrl(page)
    # Parse the HTML into a soup object
    soup = BeautifulSoup(html, "html.parser")
    # Chapter title: <h1 class="wap_none">
    h1_elements = soup.find_all('h1', attrs={'class': 'wap_none'})
    # Chapter body: <div class="Readarea ReadAjax_content">
    p_elements = soup.find_all('div', attrs={'class': 'Readarea ReadAjax_content'})
    # Custom save path, e.g. save to the desktop
    file_path = "C:\\Users\\xingchen\\Desktop\\novel_content.txt"
    # Create or open the txt file for writing
    with open(file_path, "w", encoding="utf-8") as file:
        # Write the chapter title
        for h1 in h1_elements:
            file.write(h1.text + "\n\n")
        # Write the body text, stripping the unwanted boilerplate
        for p in p_elements:
            cleaned_text = p.text.replace("请收藏本站:https://www.bqgam.com。笔趣阁手机版:https://m.bqgam.com", "").replace("『点此报错』", "").replace("『加入书签』", "")
            # Insert a line break after each sentence-ending punctuation mark
            cleaned_text_with_breaks = cleaned_text.replace("。", "。\n").replace("!", "!\n").replace("?", "?\n")
            file.write(cleaned_text_with_breaks + "\n\n")
    print(f"内容已写入 {file_path} 文件")

getPage(page)
```
# Next level: scrape every chapter of the novel into a text file
The code so far only handles a single chapter. To grab the whole book in one go, the code needs a rework; the refactor follows.
I stepped into quite a few pits along the way, and the messy code went through the following versions:
# Create a "novel name + author name.txt" file in the current working directory and write all chapters into it; no custom save-path option yet
```python
import os
import requests
from bs4 import BeautifulSoup

# Index URL of the novel
host = "https://www.bqgam.com/index/11303/"

# Send an HTTP request and return the page content
def requestUrl(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36'}
    html = requests.get(url, headers=headers)
    return html.text

# Parse the HTML with BeautifulSoup
def getSoup(url):
    html = requestUrl(url)
    return BeautifulSoup(html, "html.parser")

# Fetch the content of one chapter
def getPage(page):
    soup = getSoup(page)
    # Chapter title
    h1 = soup.find("h1", attrs={'class': 'wap_none'})
    if h1 is None:
        return "Title not found"
    title = h1.text
    # Chapter body
    divText = soup.find(id="chaptercontent")
    if divText is None:
        return "Content not found"
    divText = divText.getText("\n")
    # Cut off the trailing "请收藏本站…" footer
    i = divText.rfind("请")
    body = title + "\n" + divText[:i]
    return body

# Get the author of the novel
def getAuthor(soup):
    author_meta = soup.find("meta", {"property": "og:novel:author"})
    if author_meta:
        return author_meta["content"]
    else:
        return "Unknown Author"

# Main scraper routine
def spider():
    soup = getSoup(host)
    # Novel title and author
    fileName = soup.find(attrs={'class': 'info'}).h1.string
    author = getAuthor(soup)
    # Build the save path
    save_path = os.path.join(os.getcwd(), f"{fileName}_{author}.txt")
    # Write every chapter into the file
    with open(save_path, "a", encoding='utf-8') as file:
        for a in soup.find(attrs={'class': 'listmain'}).find_all("a"):
            index = a["href"].rfind("/") + 1
            file.write(getPage(host + a["href"][index:]))

# Run the scraper
spider()
```
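Scraping the whole book fires one request per chapter in a tight loop, which is hard on the site and makes a ban more likely. An optional tweak of my own (not in the original code) is to pause briefly between chapters inside the loop in `spider()`; the URLs below are placeholders for illustration:

```python
import time

# Placeholder chapter URLs; in spider() these come from the listmain links
chapter_urls = [
    "https://www.bqgam.com/index/11303/1.html",
    "https://www.bqgam.com/index/11303/2.html",
]

for url in chapter_urls:
    # ...fetch and write the chapter here...
    time.sleep(1)  # wait one second between requests to go easy on the server
```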
# Building on the previous version's dynamic novel name + author name, add a custom save-path option
```python
import os
import requests
from bs4 import BeautifulSoup

# Index URL of the novel
host = "https://www.bqgam.com/index/11303/"

# Send an HTTP request and return the page content
def requestUrl(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36'}
    html = requests.get(url, headers=headers)
    return html.text

# Parse the HTML with BeautifulSoup
def getSoup(url):
    html = requestUrl(url)
    return BeautifulSoup(html, "html.parser")

# Fetch the content of one chapter
def getPage(page):
    soup = getSoup(page)
    # Chapter title
    h1 = soup.find("h1", attrs={'class': 'wap_none'})
    if h1 is None:
        return "Title not found"
    title = h1.text
    # Chapter body
    divText = soup.find(id="chaptercontent")
    if divText is None:
        return "Content not found"
    divText = divText.getText("\n")
    # Cut off the trailing "请收藏本站…" footer
    i = divText.rfind("请")
    body = title + "\n" + divText[:i]
    return body

# Get the author of the novel
def getAuthor(soup):
    author_meta = soup.find("meta", {"property": "og:novel:author"})
    if author_meta:
        return author_meta["content"]
    else:
        return "Unknown Author"

# Main scraper routine; an optional custom save path can be passed in
def spider(custom_save_path=None):
    soup = getSoup(host)
    fileName = soup.find(attrs={'class': 'info'}).h1.string
    author = getAuthor(soup)
    if custom_save_path is None:
        # With no custom path, save to the current working directory as "novel name_author.txt"
        custom_save_path = os.path.join(os.getcwd(), f"{fileName}_{author}.txt")
    with open(custom_save_path, "a", encoding='utf-8') as file:
        for a in soup.find(attrs={'class': 'listmain'}).find_all("a"):
            index = a["href"].rfind("/") + 1
            file.write(getPage(host + a["href"][index:]))

# Fetch fileName and author for the custom file name
soup = getSoup(host)
fileName = soup.find(attrs={'class': 'info'}).h1.string
author = getAuthor(soup)

# Example: save to D:\Downloads (absolute path), named dynamically as "novel name_author.txt"
custom_save_path = os.path.join("D:\\Downloads\\", "{}_{}.txt".format(fileName, author))

# Call spider with the custom path
spider(custom_save_path)
```
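Because the file name is built from the scraped novel title and author, any character that Windows forbids in file names (such as `?`, `:` or `*`) would make `open()` fail. A hedged helper for that case; `safe_name` is my own addition, not part of the original code:

```python
import re

def safe_name(name):
    # Replace characters that Windows does not allow in file names with underscores
    return re.sub(r'[\\/:*?"<>|]', "_", name)

# Example with a made-up title
print(safe_name("novel: part?1"))  # -> novel_ part_1
```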
# Going further: scrape only a specified range of chapters
In detail: pass a range such as 1-100 and only chapters 1-100 are scraped and written to the txt file.
```python
import os
import requests
from bs4 import BeautifulSoup

# Index URL of the novel
host = "https://www.bqgam.com/index/11303/"

# Send an HTTP request and return the page content
def requestUrl(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36'}
    html = requests.get(url, headers=headers)
    return html.text

# Parse the HTML with BeautifulSoup
def getSoup(url):
    html = requestUrl(url)
    return BeautifulSoup(html, "html.parser")

# Fetch the content of one chapter
def getPage(page):
    soup = getSoup(page)
    # Chapter title
    h1 = soup.find("h1", attrs={'class': 'wap_none'})
    if h1 is None:
        return "Title not found"
    title = h1.text
    # Chapter body
    divText = soup.find(id="chaptercontent")
    if divText is None:
        return "Content not found"
    divText = divText.getText("\n")
    # Cut off the trailing "请收藏本站…" footer
    i = divText.rfind("请")
    body = title + "\n" + divText[:i]
    return body

# Get the author of the novel
def getAuthor(soup):
    author_meta = soup.find("meta", {"property": "og:novel:author"})
    if author_meta:
        return author_meta["content"]
    else:
        return "Unknown Author"

# Main scraper routine; accepts an optional save path and an optional chapter range
def spider(custom_save_path=None, start_chapter=None, end_chapter=None):
    soup = getSoup(host)
    fileName = soup.find(attrs={'class': 'info'}).h1.string
    author = getAuthor(soup)
    if custom_save_path is None:
        # With no custom path, save to the current working directory as "novel name_author.txt"
        custom_save_path = os.path.join(os.getcwd(), f"{fileName}_{author}.txt")
    with open(custom_save_path, "a", encoding='utf-8') as file:
        chapters = soup.find(attrs={'class': 'listmain'}).find_all("a")
        # If a chapter range is given, keep only those chapters
        # (the slice end is exclusive, so end_chapter itself is the last chapter kept)
        if start_chapter is not None and end_chapter is not None:
            chapters = chapters[start_chapter-1:end_chapter]
        for a in chapters:
            index = a["href"].rfind("/") + 1
            file.write(getPage(host + a["href"][index:]))

# Fetch fileName and author for the custom file name
soup = getSoup(host)
fileName = soup.find(attrs={'class': 'info'}).h1.string
author = getAuthor(soup)

# Example: save to D:\Downloads (absolute path), named dynamically as "novel name_author.txt"
custom_save_path = os.path.join("D:\\Downloads\\", "{}_{}.txt".format(fileName, author))

# Call spider with the custom path and a chapter range (e.g. chapters 1-100)
spider(custom_save_path, start_chapter=1, end_chapter=100)
```
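The chapter range relies on Python's end-exclusive slicing: `chapters[start_chapter-1:end_chapter]` covers chapters `start_chapter` through `end_chapter` inclusive. A tiny standalone illustration (the list contents are placeholders):

```python
# Stand-alone illustration of the slice used above
chapters = [f"chapter {i}" for i in range(1, 11)]  # pretend these are the <a> tags

start_chapter, end_chapter = 1, 3
print(chapters[start_chapter - 1:end_chapter])  # ['chapter 1', 'chapter 2', 'chapter 3']
```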
# About this scraper code
The demos above all use the novel 全球崩坏 for the walkthrough. Other books can be scraped just as well; simply swap in the URL of another book on the same site.
For example, this novel: 宇宙职业选手.
If you want to scrape other Biquge mirrors or other websites, adjust the source code to match their actual tags and structure; the code above only works for the site www.bqgam.com.
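When adapting the scraper to another site, the parts that usually change are the URLs and the tag selectors. One hedged way to keep them in a single place is a small settings dict; every value below is a placeholder for whatever the other site actually uses:

```python
# Hypothetical settings for another site; all values are placeholders
SITE = {
    "host": "https://www.example-novel-site.com/book/123/",
    "title_selector": ("h1", {"class": "chapter-title"}),
    "content_id": "content",
}

# The scraping functions would then read their tags from SITE, e.g.
# soup.find(*SITE["title_selector"]) and soup.find(id=SITE["content_id"])
print(SITE["host"])
```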
Finally, a disclaimer: all content here is for learning and exchange only. Do not use it for commercial purposes; otherwise you bear all the consequences yourself. Thank you.