Below is a little crawler I put together. I've only been learning for a few days, so please don't laugh, gurus.
It crawls all the result links for a given keyword.
I think the OP could crawl all the links through to the last page and then divide by 10: if there are, say, 30 pages of results, the content stops changing once you request page 31, which means 300 links were crawled in total, and the check can be made on that count (see the sketch after the code).
----------------------------------------------------------------------------
# -*- coding: gbk -*-
import re
import time
import fileinput
from urllib import FancyURLopener, quote
from random import choice
user_agents = [
    'Mozilla/6.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Opera/9.25 (Windows NT 5.1; U; en)',
    'Mozilla/6.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/6.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Mozilla/6.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9'
]
class MyOpener(FancyURLopener, object):
    # Report a randomly chosen User-Agent so the requests look less like a bot.
    version = choice(user_agents)

myopener = MyOpener()

search = raw_input('Enter a keyword (e.g. asp?id=): ')
k = int(raw_input('Enter the number of Google result pages to crawl (an integer between 1 and 50): '))
filename = raw_input('Enter an output file path (e.g. c:\list.txt): ')
f = open(filename, 'a')
print 'Crawling, please wait...'
# Each result URL appears inside a <cite>...</cite> tag in the result HTML;
# the second pattern keeps only the leading URL-like token of each snippet.
cite_pattern = re.compile(r'(?<=<cite>).+?(?=</cite>)')
word_pattern = re.compile(r'(?P<word>\S+\b)')
delay = 5  # seconds to wait between pages, to avoid hammering the server

for x in range(0, k * 10, 10):
    print 'Crawling page %d...' % (x / 10 + 1)
    URL = 'http://www.google.cn/search?q=%s&start=%d' % (quote(search), x)
    sock = myopener.open(URL)
    htmlSource = sock.read()
    for i in cite_pattern.findall(htmlSource):
        # Strip the <b>...</b> highlighting Google adds around matched terms.
        snippet = i.replace('<b>', '').replace('</b>', '')
        m = word_pattern.search(snippet)
        if m:
            f.write(m.group('word') + '\n')
    time.sleep(delay)
f.close()

# Count the saved links by re-reading the output file.
s = 0
for line in fileinput.input(filename):
    s += 1
print 'Done! Crawled %d links for %s; see the list in %s' % (s, search, filename)
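On the idea of crawling through to the last page automatically: here is a minimal sketch, assuming a hypothetical fetch_links(page) helper that returns the list of links on one result page (it could be built from the same <cite> regex as above). When a page past the end is requested, Google keeps returning the same content, so comparing each page's links with the previous page's tells you when to stop.
----------------------------------------------------------------------------
def crawl_until_last_page(fetch_links, max_pages=100):
    # fetch_links(page) is assumed to return the list of links on result
    # page `page` (a hypothetical helper, not part of the script above).
    all_links = []
    prev = None
    for page in range(max_pages):
        links = fetch_links(page)
        if links == prev:
            # A page past the end repeats the previous content,
            # so the last real page has already been collected.
            break
        all_links.extend(links)
        prev = links
    return all_links
----------------------------------------------------------------------------
For example, with 30 pages of results the loop stops at the 31st request, leaving 300 links in all_links, and len(all_links) / 10 recovers the page count.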