- 论坛徽章:
- 1
|
- # -*- encoding=utf8 -*-
- from bs4 import BeautifulSoup
- import re
- html_doc='''
- <div class='row'>
- <h3> AA </h3>
- <p>0. AA0<a href="www.google.com" >google1</a></p>
- <p>2. AA2<a href="www.baidu.com" >baidu2</a></p>
- <p>some</p>
- <p>1. AAc</p>
- <p>AAd </p>
- '''
- soup = BeautifulSoup(html_doc, 'lxml',from_encoding='utf8')
- titles =soup.select("div.row h3")
- for title in titles:
- print("--------------------")
- for paragraph in title.find_next_siblings("p",text=re.compile("\d")):
- print(paragraph.text)
复制代码
不写 text=re.compile("\d") 直接打印 paragraph.text 是 0. AA0google1
但是 text=re.compile("\d") 就不行了
|
|