bs4的find_next_siblings通过正则过滤不能含子标签
# -*- encoding=utf8 -*-
from bs4 import BeautifulSoup
import re
# Sample markup: an <h3> heading followed by sibling <p> elements, some of
# which contain a nested <a> tag in addition to their own text.
html_doc = '''
<div class='row'>
<h3> AA </h3>
<p>0. AA0<a href="www.google.com" >google1</a></p>
<p>2. AA2<a href="www.baidu.com" >baidu2</a></p>
<p>some</p>
<p>1. AAc</p>
<p>AAd </p>
'''
# html_doc is already a str, so from_encoding would be ignored (bs4 only
# uses it when decoding bytes and emits a warning otherwise); it is omitted.
soup = BeautifulSoup(html_doc, 'lxml')
titles = soup.select("div.row h3")

# Raw string avoids the invalid-escape warning for "\d"; compiled once
# outside the loops because it does not change between iterations.
digit_pattern = re.compile(r"\d")

for title in titles:
    print("--------------------")
    for paragraph in title.find_next_siblings("p"):
        # The text=/string= filter of find_next_siblings compares against
        # Tag.string, which is None when a tag has more than one child
        # (here: a text node plus an <a>). Such paragraphs would never
        # match, so test the regex against the full concatenated text.
        if digit_pattern.search(paragraph.get_text()):
            print(paragraph.text)
不写 text=re.compile("\d") 时，直接打印 paragraph.text 可以得到 "0. AA0google1"；
但是加上 text=re.compile("\d") 之后，含有子标签（例如 <a>）的 <p> 就匹配不到了，只有不含子标签的 <p> 才能被匹配。
页:
[1]