论坛徽章:: 0

电梯直达

1楼 [收藏(0)] [报告]

发表于 2010-11-02 14:49 |只看该作者 |倒序浏览

转自：http://www.cnpythoner.com/post/122.html

#encoding=utf-8
#@author:老王python
#@description:检查网站的外链
import urllib
from sgmllib import SGMLParser
import re
import sys
infolist = {}  # module-level result dict; the visible code never reads it (LinkParser keeps its own self.infolist)
class LinkParser(SGMLParser):
    '''Collect link information from the result entries of a search page.

    While the parser is inside an ``<li>`` whose first attribute value is
    one of ``_RESULT_ITEM_MARKERS``, every ``<a>`` carrying an absolute
    http(s) ``href`` (other than links starting with ``_SKIPPED_PREFIX``)
    is recorded in ``self.infolist`` as::

        {'<host>,<full url>': '<link text>'}
    '''

    # First-attribute values of the <li> elements that hold one result each.
    _RESULT_ITEM_MARKERS = ('sco5li0', 'sco5li1')
    # Links starting with this prefix are ignored.
    # NOTE(review): presumably the search engine's snapshot/cache host - confirm.
    _SKIPPED_PREFIX = 'http://203.209.253.250'
    # Captures the host part of an absolute http(s) URL; compiled once
    # instead of on every <a> tag.
    _HOST_RE = re.compile(r'^https?://(.*?)($|/)', re.IGNORECASE)

    def reset(self):
        '''Reset the base parser and clear all per-document state.'''
        SGMLParser.reset(self)
        self.url = ''  # host of the link currently being read
        self.li_check = 0  # 1 while inside a result <li>
        self.a_check = 0  # 1 while inside an <a> whose text should be kept
        self.jt_url = ''  # full URL of the link currently being read
        self.infolist = {}

    def start_li(self, tag):
        '''Flag the start of a result entry.

        ``tag`` is the list of ``(name, value)`` attribute pairs of the
        ``<li>``; only the first attribute's value is inspected.
        '''
        if tag and tag[0][1] in self._RESULT_ITEM_MARKERS:
            self.li_check = 1

    def start_a(self, tag):
        '''Remember host and URL of a link found inside a result entry.

        ``tag`` is the list of ``(name, value)`` attribute pairs of the
        ``<a>``. Anchors without an ``href``, with a non-http(s) ``href``
        or pointing at ``_SKIPPED_PREFIX`` are ignored instead of raising.
        '''
        if self.li_check != 1:
            return
        # Look the href up by name: it is not guaranteed to be the first
        # attribute, and an <a> may have no attributes at all.
        href = None
        for name, value in tag:
            if name == 'href':
                href = value
        if not href or href.startswith(self._SKIPPED_PREFIX):
            return
        match = self._HOST_RE.search(href)
        if match is None:
            # Relative or non-http link: there is no host to report.
            return
        self.url = match.group(1)
        self.jt_url = href
        self.a_check = 1

    def handle_data(self, text):
        '''Store the non-blank text of the current link.

        The text is saved under the key ``'<host>,<full url>'``; a later
        text chunk for the same link overwrites the earlier one. The
        literal snapshot label is never stored.
        '''
        txt = text.strip()
        if txt and self.a_check and txt != '快照':
            checkurl = '%s,%s' % (self.url, self.jt_url)
            self.infolist[checkurl] = txt

    def end_li(self):
        '''Leave the current result entry.'''
        self.li_check = 0

    def end_a(self):
        '''Leave the current link.'''
        self.a_check = 0
# Query the link search for `checkurl`, walk every result page and write one
# "url:...,jt_url:...,title:..." line per external link to a.txt.
numre = re.compile(r'<strong>.+')  # fragment that holds the total record count
pnum = re.compile(r'\d+')
checkurl = ''  # address of the site to check, e.g. http://www.xxx.com
checkurl = urllib.quote(checkurl)  # URL-encoded form used in the requests
pageurl = 'http://sitemap.cn.yahoo.com/search?bwm=i&bwmo=d&p=%s' % (checkurl)
content = urllib.urlopen(pageurl).read()

# Total number of external links. If the counter cannot be located in the
# page, treat it as "no results" instead of failing with AttributeError.
c = numre.search(content)
count_match = pnum.search(c.group(0)) if c else None
if count_match is None:
    print('total link count not found in %s' % pageurl)
    totalnum = 0
else:
    totalnum = int(count_match.group(0))

# Not used by the visible script; kept because it is a module-level name.
# The original literal was missing its closing quote.
host_re = re.compile(r'^http://(?P<host>www\.(?:[A-Z0-9-]+\.){1}[A-Z\.]{2,6})',
                     re.IGNORECASE)

pagesize = 50  # results shown per page
# Ceiling division: a partial last page still needs its own request, and an
# exact multiple must not request an extra, empty page.
page = (totalnum + pagesize - 1) // pagesize

# `with` closes the file even when a request or the parser raises.
with open('a.txt', 'w') as f:
    for k in range(page):
        parser = LinkParser()
        url = 'http://sitemap.cn.yahoo.com/search?bwm=i&bwmo=d&p=%s&b=%s' % (checkurl, k * pagesize)
        print('url=========> %s' % url)
        cstr = urllib.urlopen(url).read()
        parser.feed(cstr)
        parser.close()
        for m in parser.infolist:
            # Keys look like '<host>,<full url>'; split only on the first
            # comma because the URL part may itself contain commas.
            domain, jt_url = m.split(',', 1)
            print('domain---------> %s' % domain)
            print('jt_url---------> %s' % jt_url)
            # The title is written as received; the former
            # decode('utf-8').encode('utf-8') round trip changed nothing for
            # valid UTF-8 and raised on any other encoding.
            t = 'url:%s,jt_url:%s,title:%s\n' % (domain, jt_url, parser.infolist[m])
            f.write(t)

复制代码

文库|博客

返回列表

Chinaunix › 论坛 › 程序设计 › Python › python获取网站外链

python获取网站外链 [复制链接]

浏览过的版块