# Preface
In the previous posts we used bs4 to scrape Biquge and stripped out its spammy boilerplate text. In practice, though, a scraper never just prints to the terminal; the results are written to a file. So how do we write the scraped data to a text file?
Take the novel from the previous Biquge article as an example: click to jump.
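Before getting into the scraper itself, here is a minimal sketch of the core idea, writing a string to a .txt file with UTF-8 encoding. The file name and sample text below are placeholders, not taken from the real scraper:

```python
# Minimal sketch: write a string to a txt file, UTF-8 encoded.
# "demo.txt" and the sample text are placeholders.
text = "第一章 示例内容"

with open("demo.txt", "w", encoding="utf-8") as f:
    f.write(text + "\n")

print("written to demo.txt")
```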
# Demo: scraping a single chapter
# Save the data straight to a txt file, without any formatting
```python
# Custom save path, e.g. save to the desktop
file_path = "C:\\Users\\your_username\\Desktop\\filename.txt"

# Create or open the txt file for writing; note the encoding, utf-8 here
with open(file_path, "w", encoding="utf-8") as file:
    # Write the chapter title
    for h1 in h1_elements:
        file.write("本文标题: " + h1.text + "\n")
    # Write the body text, stripping the unwanted boilerplate
    for p in p_elements:
        cleaned_text = p.text.replace("请收藏本站:https://www.bqgam.com。笔趣阁手机版:https://m.bqgam.com", "").replace("『点此报错』", "").replace("『加入书签』", "")
        file.write("本文正文: " + cleaned_text + "\n")

# Print a confirmation once the write succeeds
print(f"内容已写入 {file_path} 文件")
```
Complete code:
```python
import requests
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

# Create a session object (not actually used below)
session = requests.session()

# Index URL of the novel
host = "https://www.bqgam.com/index/11303/"
# URL of the chapter page to scrape
page = "https://www.bqgam.com/index/11303/1.html"

def requestUrl(url):
    # Set the request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36',
    }
    # Build the request object
    req = Request(url, headers=headers)
    # Open the URL
    html = urlopen(req)
    # Read the HTML content and decode it as utf-8
    html = html.read().decode('utf-8')
    # Return the HTML content
    return html

# Adjust everything from here on to match the tags of the page you are actually scraping
def getPage(page):
    # Fetch the page's HTML
    html = requestUrl(page)
    # Parse the HTML into a soup object
    soup = BeautifulSoup(html, "html.parser")
    # Chapter title: <h1 class="wap_none">
    h1_elements = soup.find_all('h1', attrs={'class': 'wap_none'})
    # Chapter body: <div class="Readarea ReadAjax_content">
    p_elements = soup.find_all('div', attrs={'class': 'Readarea ReadAjax_content'})
    # Custom save path, e.g. save to the desktop
    file_path = "C:\\Users\\xingchen\\Desktop\\novel_content.txt"
    # Create or open the txt file for writing
    with open(file_path, "w", encoding="utf-8") as file:
        # Write the chapter title
        for h1 in h1_elements:
            file.write("本文标题: " + h1.text + "\n")
        # Write the body text, stripping the unwanted boilerplate
        for p in p_elements:
            cleaned_text = p.text.replace("请收藏本站:https://www.bqgam.com。笔趣阁手机版:https://m.bqgam.com", "").replace("『点此报错』", "").replace("『加入书签』", "")
            file.write("本文正文: " + cleaned_text + "\n")
    print(f"内容已写入 {file_path} 文件")

getPage(page)
```
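If the class names on the page ever change, `find_all` simply returns an empty list and the script quietly writes an almost empty file. A small sanity check of my own (not part of the original code) makes that failure visible; in the real script the same `if` would go right after the two `find_all` calls in `getPage`:

```python
from bs4 import BeautifulSoup

# Placeholder HTML just to make the check runnable on its own
soup = BeautifulSoup("<html><body><p>no matching tags here</p></body></html>", "html.parser")

h1_elements = soup.find_all('h1', attrs={'class': 'wap_none'})
p_elements = soup.find_all('div', attrs={'class': 'Readarea ReadAjax_content'})

if not h1_elements or not p_elements:
    print("Warning: title or body tags not found; the page structure may have changed")
```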
# Formatting the scraped data
Add line breaks so the saved text wraps automatically, one sentence per line:
```python
# Insert a line break after each sentence-ending punctuation mark
cleaned_text_with_breaks = cleaned_text.replace("。", "。\n").replace("!", "!\n").replace("?", "?\n")
file.write("本文正文: " + cleaned_text_with_breaks + "\n\n")
```
Complete code:
```python
import requests
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

# Create a session object (not actually used below)
session = requests.session()

# Index URL of the novel
host = "https://www.bqgam.com/index/11303/"
# URL of the chapter page to scrape
page = "https://www.bqgam.com/index/11303/1.html"

def requestUrl(url):
    # Set the request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36',
    }
    # Build the request object
    req = Request(url, headers=headers)
    # Open the URL
    html = urlopen(req)
    # Read the HTML content and decode it as utf-8
    html = html.read().decode('utf-8')
    # Return the HTML content
    return html

# Adjust everything from here on to match the tags of the page you are actually scraping
def getPage(page):
    # Fetch the page's HTML
    html = requestUrl(page)
    # Parse the HTML into a soup object
    soup = BeautifulSoup(html, "html.parser")
    # Chapter title: <h1 class="wap_none">
    h1_elements = soup.find_all('h1', attrs={'class': 'wap_none'})
    # Chapter body: <div class="Readarea ReadAjax_content">
    p_elements = soup.find_all('div', attrs={'class': 'Readarea ReadAjax_content'})
    # Custom save path, e.g. save to the desktop
    file_path = "C:\\Users\\xingchen\\Desktop\\novel_content.txt"
    # Create or open the txt file for writing
    with open(file_path, "w", encoding="utf-8") as file:
        # Write the chapter title
        for h1 in h1_elements:
            file.write(h1.text + "\n\n")
        # Write the body text, stripping the unwanted boilerplate
        for p in p_elements:
            cleaned_text = p.text.replace("请收藏本站:https://www.bqgam.com。笔趣阁手机版:https://m.bqgam.com", "").replace("『点此报错』", "").replace("『加入书签』", "")
            # Insert a line break after each sentence-ending punctuation mark
            cleaned_text_with_breaks = cleaned_text.replace("。", "。\n").replace("!", "!\n").replace("?", "?\n")
            file.write(cleaned_text_with_breaks + "\n\n")
    print(f"内容已写入 {file_path} 文件")

getPage(page)
```
# Next level: scrape every chapter of the novel into a text file
The code so far only handles a single chapter. To grab the whole book in one go, the code needs a rework; the refactor follows.
I stepped into quite a few pits along the way, and the messy code went through the following versions:
# Create a "novel name + author name.txt" file in the current working directory and write all chapters into it; no custom save-path option yet
```python
import os
import requests
from bs4 import BeautifulSoup

# Index URL of the novel
host = "https://www.bqgam.com/index/11303/"

# Send an HTTP request and return the page content
def requestUrl(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36'}
    html = requests.get(url, headers=headers)
    return html.text

# Parse the HTML with BeautifulSoup
def getSoup(url):
    html = requestUrl(url)
    return BeautifulSoup(html, "html.parser")

# Fetch the content of one chapter
def getPage(page):
    soup = getSoup(page)
    # Chapter title
    h1 = soup.find("h1", attrs={'class': 'wap_none'})
    if h1 is None:
        return "Title not found"
    title = h1.text
    # Chapter body
    divText = soup.find(id="chaptercontent")
    if divText is None:
        return "Content not found"
    divText = divText.getText("\n")
    # Cut off the trailing "请收藏本站…" footer
    i = divText.rfind("请")
    body = title + "\n" + divText[:i]
    return body

# Get the author of the novel
def getAuthor(soup):
    author_meta = soup.find("meta", {"property": "og:novel:author"})
    if author_meta:
        return author_meta["content"]
    else:
        return "Unknown Author"

# Main scraper routine
def spider():
    soup = getSoup(host)
    # Novel title and author
    fileName = soup.find(attrs={'class': 'info'}).h1.string
    author = getAuthor(soup)
    # Build the save path
    save_path = os.path.join(os.getcwd(), f"{fileName}_{author}.txt")
    # Write every chapter into the file
    with open(save_path, "a", encoding='utf-8') as file:
        for a in soup.find(attrs={'class': 'listmain'}).find_all("a"):
            index = a["href"].rfind("/") + 1
            file.write(getPage(host + a["href"][index:]))

# Run the scraper
spider()
```
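Scraping the whole book fires one request per chapter in a tight loop, which is hard on the site and makes a ban more likely. An optional tweak of my own (not in the original code) is to pause briefly between chapters inside the loop in `spider()`; the URLs below are placeholders for illustration:

```python
import time

# Placeholder chapter URLs; in spider() these come from the listmain links
chapter_urls = [
    "https://www.bqgam.com/index/11303/1.html",
    "https://www.bqgam.com/index/11303/2.html",
]

for url in chapter_urls:
    # ...fetch and write the chapter here...
    time.sleep(1)  # wait one second between requests to go easy on the server
```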
# Building on the previous version's dynamic novel name + author name, add a custom save-path option
```python
import os
import requests
from bs4 import BeautifulSoup

# Index URL of the novel
host = "https://www.bqgam.com/index/11303/"

# Send an HTTP request and return the page content
def requestUrl(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36'}
    html = requests.get(url, headers=headers)
    return html.text

# Parse the HTML with BeautifulSoup
def getSoup(url):
    html = requestUrl(url)
    return BeautifulSoup(html, "html.parser")

# Fetch the content of one chapter
def getPage(page):
    soup = getSoup(page)
    # Chapter title
    h1 = soup.find("h1", attrs={'class': 'wap_none'})
    if h1 is None:
        return "Title not found"
    title = h1.text
    # Chapter body
    divText = soup.find(id="chaptercontent")
    if divText is None:
        return "Content not found"
    divText = divText.getText("\n")
    # Cut off the trailing "请收藏本站…" footer
    i = divText.rfind("请")
    body = title + "\n" + divText[:i]
    return body

# Get the author of the novel
def getAuthor(soup):
    author_meta = soup.find("meta", {"property": "og:novel:author"})
    if author_meta:
        return author_meta["content"]
    else:
        return "Unknown Author"

# Main scraper routine; an optional custom save path can be passed in
def spider(custom_save_path=None):
    soup = getSoup(host)
    fileName = soup.find(attrs={'class': 'info'}).h1.string
    author = getAuthor(soup)
    if custom_save_path is None:
        # With no custom path, save to the current working directory as "novel name_author.txt"
        custom_save_path = os.path.join(os.getcwd(), f"{fileName}_{author}.txt")
    with open(custom_save_path, "a", encoding='utf-8') as file:
        for a in soup.find(attrs={'class': 'listmain'}).find_all("a"):
            index = a["href"].rfind("/") + 1
            file.write(getPage(host + a["href"][index:]))

# Fetch fileName and author for the custom file name
soup = getSoup(host)
fileName = soup.find(attrs={'class': 'info'}).h1.string
author = getAuthor(soup)

# Example: save to D:\Downloads (absolute path), named dynamically as "novel name_author.txt"
custom_save_path = os.path.join("D:\\Downloads\\", "{}_{}.txt".format(fileName, author))

# Call spider with the custom path
spider(custom_save_path)
```
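Because the file name is built from the scraped novel title and author, any character that Windows forbids in file names (such as `?`, `:` or `*`) would make `open()` fail. A hedged helper for that case; `safe_name` is my own addition, not part of the original code:

```python
import re

def safe_name(name):
    # Replace characters that Windows does not allow in file names with underscores
    return re.sub(r'[\\/:*?"<>|]', "_", name)

# Example with a made-up title
print(safe_name("novel: part?1"))  # -> novel_ part_1
```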
# Going further: scrape only a specified range of chapters
In detail: pass a range such as 1-100 and only chapters 1-100 are scraped and written to the txt file.
```python
import os
import requests
from bs4 import BeautifulSoup

# Index URL of the novel
host = "https://www.bqgam.com/index/11303/"

# Send an HTTP request and return the page content
def requestUrl(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36'}
    html = requests.get(url, headers=headers)
    return html.text

# Parse the HTML with BeautifulSoup
def getSoup(url):
    html = requestUrl(url)
    return BeautifulSoup(html, "html.parser")

# Fetch the content of one chapter
def getPage(page):
    soup = getSoup(page)
    # Chapter title
    h1 = soup.find("h1", attrs={'class': 'wap_none'})
    if h1 is None:
        return "Title not found"
    title = h1.text
    # Chapter body
    divText = soup.find(id="chaptercontent")
    if divText is None:
        return "Content not found"
    divText = divText.getText("\n")
    # Cut off the trailing "请收藏本站…" footer
    i = divText.rfind("请")
    body = title + "\n" + divText[:i]
    return body

# Get the author of the novel
def getAuthor(soup):
    author_meta = soup.find("meta", {"property": "og:novel:author"})
    if author_meta:
        return author_meta["content"]
    else:
        return "Unknown Author"

# Main scraper routine; accepts an optional save path and an optional chapter range
def spider(custom_save_path=None, start_chapter=None, end_chapter=None):
    soup = getSoup(host)
    fileName = soup.find(attrs={'class': 'info'}).h1.string
    author = getAuthor(soup)
    if custom_save_path is None:
        # With no custom path, save to the current working directory as "novel name_author.txt"
        custom_save_path = os.path.join(os.getcwd(), f"{fileName}_{author}.txt")
    with open(custom_save_path, "a", encoding='utf-8') as file:
        chapters = soup.find(attrs={'class': 'listmain'}).find_all("a")
        # If a chapter range is given, keep only those chapters
        # (the slice end is exclusive, so end_chapter itself is the last chapter kept)
        if start_chapter is not None and end_chapter is not None:
            chapters = chapters[start_chapter-1:end_chapter]
        for a in chapters:
            index = a["href"].rfind("/") + 1
            file.write(getPage(host + a["href"][index:]))

# Fetch fileName and author for the custom file name
soup = getSoup(host)
fileName = soup.find(attrs={'class': 'info'}).h1.string
author = getAuthor(soup)

# Example: save to D:\Downloads (absolute path), named dynamically as "novel name_author.txt"
custom_save_path = os.path.join("D:\\Downloads\\", "{}_{}.txt".format(fileName, author))

# Call spider with the custom path and a chapter range (e.g. chapters 1-100)
spider(custom_save_path, start_chapter=1, end_chapter=100)
```
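The chapter range relies on Python's end-exclusive slicing: `chapters[start_chapter-1:end_chapter]` covers chapters `start_chapter` through `end_chapter` inclusive. A tiny standalone illustration (the list contents are placeholders):

```python
# Stand-alone illustration of the slice used above
chapters = [f"chapter {i}" for i in range(1, 11)]  # pretend these are the <a> tags

start_chapter, end_chapter = 1, 3
print(chapters[start_chapter - 1:end_chapter])  # ['chapter 1', 'chapter 2', 'chapter 3']
```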
# About this scraper code
The demos above all use the novel 全球崩坏 for the walkthrough. Other books can be scraped just as well; simply swap in the URL of another book on the same site.
For example, this novel: 宇宙职业选手.
If you want to scrape other Biquge mirrors or other websites, adjust the source code to match their actual tags and structure; the code above only works for the site www.bqgam.com.
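When adapting the scraper to another site, the parts that usually change are the URLs and the tag selectors. One hedged way to keep them in a single place is a small settings dict; every value below is a placeholder for whatever the other site actually uses:

```python
# Hypothetical settings for another site; all values are placeholders
SITE = {
    "host": "https://www.example-novel-site.com/book/123/",
    "title_selector": ("h1", {"class": "chapter-title"}),
    "content_id": "content",
}

# The scraping functions would then read their tags from SITE, e.g.
# soup.find(*SITE["title_selector"]) and soup.find(id=SITE["content_id"])
print(SITE["host"])
```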
Finally, a disclaimer: all content here is for learning and exchange only. Do not use it for commercial purposes; otherwise you bear all the consequences yourself. Thank you.