Python爬虫如何从网页中通过Next链接跳转到下一页
大家好,我想对下面的网页进行爬虫以获取信息。该页面中的Next链接可以跳转到下一页,但是我不知道怎么让爬虫自动通过Next链接进入下一页,请大家指点一下,先谢谢大家了
http://www.ncbi.nlm.nih.gov/pubmed/?term=RNA-seq
网页中Next的html代码见下图
下面是我爬虫代码
#!python
import sys
import mechanize
from bs4 import BeautifulSoup
import re
#brower
br=mechanize.Browser()
#options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
#follows refresh 0 but not hangs on refresh > 0
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
#debugging?
br.set_debug_http(True)
br.set_debug_redirects(True)
br.set_debug_responses(True)
#user-agent(This is a cheating.)
br.addheaders=[('User-agent','Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1 ')]
#get the e-mail address
r=br.open('http://www.ncbi.nlm.nih.gov/pubmed/?term=RNA-seq')
html=r.read()# html is a str type
list_paperid=re.findall(r'<div class="rslt"><p class="title"><a href="/pubmed/(\d+)"',html)
list=[]
for i in list_paperid:
r_paper=br.open('http://www.ncbi.nlm.nih.gov/pubmed/'+i)
html_paper=r_paper.read()
mail=re.findall(r' ?([\w\._]+@[\w\.]+)\.</li></ul></div>',html_paper)
for j in mail:
list.append(j)
mail=[]
print list
print len(list)
print '##########################'
#link into the next page
soup=BeautifulSoup(html)
print soup.find(id="EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Entrez_Pager.Page")
#print soup.find(title="Next page of results")
if soup.find(title='next page of results'):
print 'AAAAA'
print soup.find(title='next page of results').find('a').get('href')
#write the mail address into files
mail_china=open('mail_china','w')
mail_outofchina=open('mail_outofchina','w')
for each in list:
if re.search(r'\.edu\.cn$',each):
mail_china.write(each+'\n')
else:
mail_outofchina.write(each+'\n')
mail_china.close()
mail_outofchina.close()
看了下,那个页面的列表页是通过POST更新的。自己构造下header,然后主动提交应该就可以了。
页:
[1]