# Python 爬虫之 bs4，非常详细

bs4 全名 BeautifulSoup，是编写 python 爬虫常用库之一，主要用来解析 html 标签。

# 前传安装 bs4

使用国内源快速安装 bs4（后文示例中用到的 lxml 解析器不随 bs4 一起安装，需要另外执行 pip install lxml）

pip install bs4 -i https://pypi.tuna.tsinghua.edu.cn/simple

# 一、初始化

	from bs4 import BeautifulSoup

	soup = BeautifulSoup("<html>A Html Text</html>", "html.parser")

两个参数：第一个参数是要解析的 html 文本，第二个参数是使用哪种解析器，对于 HTML 来讲就是 html.parser，这个是 Python 标准库自带的解析器，无需额外安装。

如果一段 HTML 或 XML 文档格式不正确的话，那么在不同的解析器中返回的结果可能是不一样的。

解析器	使用方法	优势
Python 标准库	BeautifulSoup(html, "html.parser")	1、Python 的内置标准库 2、执行速度适中 3、文档容错能力强
lxml HTML	BeautifulSoup(html, "lxml")	1、速度快 2、文档容错能力强
lxml XML	BeautifulSoup(html, ["lxml", "xml"]) BeautifulSoup(html, "xml")	1、速度快 2、唯一支持 XML 的解析器
html5lib	BeautifulSoup(html, "html5lib")	1、最好的容错性 2、以浏览器的方式解析文档 3、生成 HTML5 格式的文档

格式化输出

soup.prettify()  # prettify 是方法，必须带括号调用才会返回格式化后的字符串

# 二、基本使用

	from bs4 import BeautifulSoup

	# 构造一个网页数据
	html_doc = """
	<html>
	<head>
	<title>The Dormouse's story</title>
	</head>
	<body>
	<p class="title">
	<b>The Dormouse's story</b>
	</p>

	<p class="story">Once upon a time there were three little sisters; and their names were
	<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
	<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
	<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
	and they lived at the bottom of a well.</p>

	<p class="story">...</p>
	</body>
	</html>
	"""

# 2.1 获取标签

	res = BeautifulSoup(html_doc, 'lxml')

	print(res.a)

# 2.2 获取标签内文本

print(res.a.text)

# 2.3 获取标签内属性

print(res.a.attrs)

# 2.4 获取指定属性值

	print(res.a.attrs.get('href'))
	print(res.a.get('href'))

# 2.5 获取子节点

	# .children yields the direct child nodes of the first <p> tag.
	for i in res.p.children:
	    print(i)

# 2.6 获取标签内部所有的元素

print(res.p.contents)

# 2.7 获取标签的父标签

print(res.p.parent)

# 2.8 获取所有祖先节点（逐级向上直到最上级节点）

	# .parents walks upward through every ancestor of the first <p> tag.
	for i in res.p.parents:
	    print(i)

# 三、bs4 核心库

# 3.1 find

只能找符合条件的第一个，该方法的返回结果是一个标签对象（找不到时返回 None）。

# 3.1.1 查找指定标签名的标签，默认只找符合条件的第一个

print(res.find(name='p'))

# 3.1.2 查找具有某个特定属性的标签，默认只找符合条件的第一个

print(res.find(name='p', id='title'))

# 3.1.3 为了解决关键字冲突会加下划线区分

print(res.find(name='p', class_='title'))

# 3.1.4 使用 attrs 参数直接避免冲突

print(res.find(name='p', attrs={'class': 'title'}))

# 3.2 find_all

查找所有符合条件的标签，该方法的返回结果是一个列表。

# 3.2.1 查询某一个标签，查找的结果是一个列表

print(res.find_all('a'))

# 3.3 select 方法

使用 css 选择器，该方法的返回结果是一个列表。

# 3.3.1 查找 class 含有 title 的标签

print(res.select('.title'))

# 3.3.2 查找 class 含有 title 的标签内部所有的后代 b 标签

print(res.select('.title b'))

# 3.3.3 查找 id 等于 title 的标签

print(res.select('#title'))

# 四、使用 bs4 爬取豆瓣电影排行榜

	from bs4 import BeautifulSoup
	import requests
	import re

	def main():
	    """Scrape the Douban Top 250 page and collect one record per movie.

	    Requests ``baseurl`` exactly as written (no ``start`` offset is
	    appended), parses the response with the lxml parser and walks every
	    ``.grid_view li`` entry.

	    Returns:
	        A list of dicts with the keys ``title``, ``year``, ``score`` and
	        ``num``; fields that are not found keep their default value.
	    """
	    head = {
	        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
	    }
	    baseurl = "https://movie.douban.com/top250?start="

	    response = requests.get(url=baseurl, headers=head)
	    soup = BeautifulSoup(response.text, 'lxml')

	    # Compiled once outside the loops; the raw string avoids the invalid
	    # "\d" escape-sequence warning of a plain string literal.
	    year_pattern = re.compile(r'\d{4}', re.S)

	    movies = []
	    for entry in soup.select('.grid_view li'):
	        movie = {
	            "title": "",
	            "year": "",
	            "score": 0,
	            "num": 0
	        }

	        # The title is the text of all ".title" nodes followed by ".other".
	        for item in entry.select('.title'):
	            movie['title'] += item.text.replace("\xa0", " ")
	        for item in entry.select('.other'):
	            movie['title'] += item.text.replace("\xa0", " ")

	        # Every match overwrites the previous one, so the last four-digit
	        # run found in the ".bd p" text wins.
	        for item in entry.select(".bd p"):
	            for match in year_pattern.finditer(item.text):
	                movie['year'] = match.group()

	        for item in entry.select(".rating_num"):
	            movie['score'] = item.text

	        # Guard against entries without a rating block instead of letting
	        # [-1] raise IndexError on an empty result.
	        spans = entry.select(".star span")
	        if spans:
	            movie['num'] = spans[-1].text.replace("人评价", "")

	        movies.append(movie)

	    return movies

# 五、使用 bs4 爬取能源学院官网（cqny.edu.cn）新闻中心所有超链接

示例代码 demo

	from bs4 import BeautifulSoup
	import requests
	import time

	host = "https://www.cqny.edu.cn/xwzx/"

	# Download the news-centre page and parse it with the built-in html.parser.
	soup = BeautifulSoup(requests.get(host).text, 'html.parser')
	# Print the href of every <a> tag; anchors without an href print None.
	for item in soup.find_all('a'):
	    print(item.get('href'))

输出结果（截至本文发出时间）

	https://www.cqny.edu.cn/
	https://www.cqny.edu.cn/
	https://www.cqny.edu.cn/#
	https://www.cqny.edu.cn/cqny/xygk/xueyuanjianjie.html
	https://www.cqny.edu.cn/cqny/xygk/xianrenlingdao.html
	https://www.cqny.edu.cn/cqny/xygk/xiaoyuanfengguang.html
	https://www.cqny.edu.cn/cqny/xygk/xiaofengxiaoxun.html
	https://www.cqny.edu.cn/cqny/xygk/jigoushezhi.html
	https://www.cqny.edu.cn/#
	https://www.cqny.edu.cn/jwc/
	https://www.cqny.edu.cn/xueyuanmingshi/
	https://www.cqny.edu.cn/sxzx/
	https://www.cqny.edu.cn/jpkc/
	https://www.cqny.edu.cn/cqny/jiaoyujiaoxue/keyangongzuo.html
	https://www.cqny.edu.cn/pxnyxy/
	https://www.cqny.edu.cn/pxnyxy
	https://www.cqny.edu.cn/#
	https://www.cqny.edu.cn/xsc/
	http://www.psy.com.cn/school/new/index.asp?school=66743
	https://www.cqny.edu.cn/tw/
	https://www.cqny.edu.cn/zsjy/
	https://www.cqny.edu.cn/djzc/
	https://www.cqny.edu.cn/tsg/
	https://www.cqny.edu.cn/xxgc
	https://www.cqny.edu.cn/cqny/xygk/xueyuanjianjie.html
	https://www.cqny.edu.cn/cqny/xygk/xianrenlingdao.html
	https://www.cqny.edu.cn/cqny/xygk/xiaoyuanfengguang.html
	https://www.cqny.edu.cn/cqny/xygk/xiaofengxiaoxun.html
	https://www.cqny.edu.cn/jwc/
	https://www.cqny.edu.cn/xueyuanmingshi/
	https://www.cqny.edu.cn/sxzx/
	https://www.cqny.edu.cn/jpkc/
	https://www.cqny.edu.cn/cqny/jiaoyujiaoxue/keyangongzuo.html
	https://www.cqny.edu.cn/pxnyxy/
	https://www.cqny.edu.cn/pxnyxy
	https://www.cqny.edu.cn/xsc/
	http://www.psy.com.cn/school/new/index.asp?school=66743
	https://www.cqny.edu.cn/tw/
	https://www.cqny.edu.cn
	https://www.cqny.edu.cn/cqny/
	https://www.cqny.edu.cn/cqny/gongzuozhouli1.html
	https://www.cqny.edu.cn/xwzx/
	https://www.cqny.edu.cn/xwzx2023/43771.html
	https://www.cqny.edu.cn/xwzx2023/43770.html
	https://www.cqny.edu.cn/xwzx2023/43752.html
	https://www.cqny.edu.cn/xwzx2023/43722.html
	https://www.cqny.edu.cn/xwzx2023/43721.html
	https://www.cqny.edu.cn/xwzx2023/43719.html
	https://www.cqny.edu.cn/xwzx2023/43717.html
	https://www.cqny.edu.cn/xwzx2023/43706.html
	https://www.cqny.edu.cn/xwzx2023/43704.html
	https://www.cqny.edu.cn/xwzx2023/43703.html
	https://www.cqny.edu.cn/xwzx2023/43705.html
	https://www.cqny.edu.cn/xwzx2023/43698.html
	https://www.cqny.edu.cn/xwzx2023/43694.html
	https://www.cqny.edu.cn/xwzx2023/43685.html
	https://www.cqny.edu.cn/xwzx2023/43684.html
	https://www.cqny.edu.cn/xwzx2023/43693.html
	https://www.cqny.edu.cn/xwzx2023/43661.html
	https://www.cqny.edu.cn/xwzx2023/43650.html
	https://www.cqny.edu.cn/xwzx2023/43649.html
	https://www.cqny.edu.cn/xwzx2023/43648.html
	https://www.cqny.edu.cn/xwzx2023/43646.html
	https://www.cqny.edu.cn/xwzx2023/43645.html
	https://www.cqny.edu.cn/xwzx2023/43639.html
	https://www.cqny.edu.cn/xwzx2023/43638.html
	None
	/xwzx/index.html
	/xwzx/index_2.html
	/xwzx/index_3.html
	/xwzx/index_4.html
	/xwzx/index_5.html
	/xwzx/index_6.html
	/xwzx/index_7.html
	/xwzx/index_8.html
	/xwzx/index_9.html
	/xwzx/index_10.html
	/xwzx/index_165.html
	/xwzx/index_2.html
	https://www.cqny.edu.cn/xwzx2019/30232.html
	https://www.cqny.edu.cn/xwzx2020/32912.html
	https://www.cqny.edu.cn/xwzx2020/34245.html
	https://www.cqny.edu.cn/xwzx2020/34208.html
	https://www.cqny.edu.cn/xwzx2020/34248.html
	https://www.cqny.edu.cn/xwzx2020/34224.html
	https://www.cqny.edu.cn/xwzx2020/34230.html
	https://www.cqny.edu.cn/xwzx2020/34263.html
	https://www.cqny.edu.cn/xwzx2020/34239.html
	https://www.cqny.edu.cn/xwzx2020/34223.html
	http://www.moe.gov.cn/
	http://www.edu.cn/
	http://www.chinaedu.edu.cn/
	http://www.cq.gov.cn/
	http://jw.cq.gov.cn/
	http://www.cqzjw.com.cn/
	http://www.smartedu.cn/

Python学习笔记