- 论坛徽章:
- 0
|
转自:http://www.cnpythoner.com/post/122.html
- #encoding=utf-8
- #@author:老王python
- #@description:检查网站的外链
- import urllib
- from sgmllib import SGMLParser
- import re
- import sys
- infolist = {}  # result list; not referenced by the code visible here (LinkParser fills its own self.infolist)
class LinkParser(SGMLParser):
    '''Collect the links found inside the result ``<li>`` items of a page.

    After ``feed()`` / ``close()``, ``infolist`` maps the key
    ``'<host>,<full url>'`` to the stripped anchor text of each link.
    '''

    # Values of the first <li> attribute that mark a result entry
    # (presumably the ``class`` attribute -- confirm against the page markup).
    _RESULT_MARKERS = ('sco5li0', 'sco5li1')
    # Links whose target starts with this prefix are skipped.
    _SKIPPED_PREFIX = 'http://203.209.253.250'
    # Captures the host part of an absolute http(s) URL. Compiled once here
    # instead of once per anchor.
    _HOST_RE = re.compile(r'^https?://(.*?)($|/)', re.IGNORECASE)

    def reset(self):
        '''Reset the base parser and clear all per-document state.'''
        SGMLParser.reset(self)
        self.url = ''       # host of the link currently being read
        self.li_check = 0   # 1 while inside a result <li>
        self.a_check = 0    # 1 while inside a link that should be recorded
        self.jt_url = ''    # full URL of the link currently being read
        self.infolist = {}  # '<host>,<full url>' -> anchor text

    def start_li(self, tag):
        '''Flag the start of a result ``<li>``.

        ``tag`` is the attribute list of ``(name, value)`` pairs; only the
        value of the first attribute is inspected.
        '''
        if tag and tag[0][1] in self._RESULT_MARKERS:
            self.li_check = 1

    def start_a(self, tag):
        '''Record host and full URL of a link inside a result ``<li>``.

        Anchors without an ``href``, anchors pointing at the skipped prefix
        and anchors whose target is not an absolute http(s) URL are ignored
        instead of raising.
        '''
        if self.li_check != 1:
            return
        # Look the target up by name so attribute order does not matter and
        # an attribute-less <a> does not raise IndexError.
        href = next((value for name, value in tag if name == 'href'), None)
        if not href or href.startswith(self._SKIPPED_PREFIX):
            return
        match = self._HOST_RE.search(href)
        if match is None:
            # Relative or non-http link: there is no host to report.
            return
        self.url = match.group(1)
        self.jt_url = href
        self.a_check = 1

    def handle_data(self, text):
        '''Store the text of the current link, ignoring blank text.'''
        txt = text.strip()
        # '快照' ("snapshot") anchors are never recorded.
        if txt and self.a_check and txt != '快照':
            checkurl = '%s,%s' % (self.url, self.jt_url)
            self.infolist[checkurl] = txt

    def end_li(self):
        '''Leave the current result ``<li>``.'''
        self.li_check = 0

    def end_a(self):
        '''Leave the current link.'''
        self.a_check = 0
# Query the Yahoo China site explorer for the pages linking to ``checkurl``
# and write one line per collected link to a.txt.

numre = re.compile(r'<strong>.+')  # matches the text carrying the total record count
pnum = re.compile(r'\d+')

checkurl = ''  # address of the site to query, e.g. http://www.xxx.com
checkurl = urllib.quote(checkurl)  # URL-escaped form used in the requests
pageurl = 'http://sitemap.cn.yahoo.com/search?bwm=i&bwmo=d&p=%s' % (checkurl)
content = urllib.urlopen(pageurl).read()

# Fail with a clear message instead of an AttributeError on None when the
# page does not contain the expected counter.
count_match = numre.search(content)
if count_match is None:
    raise ValueError('total link count not found in %s' % pageurl)
c = count_match.group(0)
count_digits = pnum.search(c)
if count_digits is None:
    raise ValueError('total link count not found in %s' % pageurl)
totalnum = int(count_digits.group(0))  # total number of external links

# NOTE(review): host_re is not used by the code visible here. The original
# string literal was unterminated (a syntax error); it is closed right after
# the pattern text that survived -- confirm whether a trailing '$' anchor
# was intended.
host_re = re.compile(
    r'^http://(?P<host>www\.(?:[A-Z0-9-]+\.){1}[A-Z\.]{2,6})',
    re.IGNORECASE
)

pagesize = 50  # results shown per page
# Ceiling division: a partially filled last page still needs its own request,
# and an exact multiple must not request an extra empty page.
page = (totalnum + pagesize - 1) // pagesize

# The with-block closes a.txt even if a request or the parser raises.
with open('a.txt', 'w') as f:
    for k in xrange(page):
        parser = LinkParser()
        url = 'http://sitemap.cn.yahoo.com/search?bwm=i&bwmo=d&p=%s&b=%s' % (checkurl, k * pagesize)
        print('url=========> %s' % url)
        cstr = urllib.urlopen(url).read()
        parser.feed(cstr)
        parser.close()

        for m, title in parser.infolist.items():
            # Keys are '<host>,<full url>'; split only on the first comma so
            # a URL that itself contains commas stays intact.
            domain, jt_url = m.split(',', 1)
            print('domain---------> %s' % domain)
            print('jt_url---------> %s' % jt_url)
            t = 'url:%s,jt_url:%s,title:%s\n' % (domain, jt_url, title)
            f.write(t)
复制代码 |
|