#coding:utf8
'''
Created on 2014-7-31
@author: shiban
'''
import urllib
import random
import MySQLdb
class CrawlerClass(object):
    def __init__(self, max):
        self.max = max
        # URL template for Baidu Zhidao question pages
        self.index = 'http://zhidao.baidu.com/question/'
        # storage backend (opens the database connection)
        self.saver = SaverMySqlClass()
    def cutSpit(self, txt, head, tail):
        '''
        Works like eating sugarcane: txt is the cane, and biting from the
        front yields the piece between head and tail. Returns the text in
        txt enclosed between head and tail, or '' if either marker is
        missing.
        '''
        pos = txt.find(head)
        if pos < 0:
            return ''
        pos += len(head)
        oldpos = pos
        pos = txt.find(tail, oldpos)
        if pos < 0:
            return ''
        return txt[oldpos:pos]
    def getPageToAgency(self, url):
        """Fetch a page through a randomly chosen proxy IP.
        Argument: url  -- address of the page to fetch
        Returns:  page -- the page content
        """
        proxy_lt = (
            '211.167.112.14:80', '210.32.34.115:8080', '115.47.8.39:80', '211.151.181.41:80', '219.239.26.23:80',
            '219.157.200.18:3128', '219.159.105.180:8080', '1.63.18.22:8080', '221.179.173.170:8080', '125.39.66.153:80',
            '125.39.66.151:80', '61.152.108.187:80', '222.217.99.153:9000', '125.39.66.146:80', '120.132.132.119:8080',
            '119.7.221.137:82', '117.41.182.188:8080', '202.116.160.89:80', '221.7.145.42:8080', '211.142.236.131:80',
            '119.7.221.136:80', '211.151.181.41:80', '125.39.66.131:80', '120.132.132.119:8080', '112.5.254.30:80',
            '106.3.98.82:80', '119.4.250.105:80', '123.235.12.118:8080', '124.240.187.79:80', '182.48.107.219:9000',
            '122.72.2.180:8080', '119.254.90.18:8080', '124.240.187.80:83', '110.153.9.250:80', '202.202.1.189:80',
            '58.67.147.205:8080', '111.161.30.228:80', '122.72.76.130:80', '122.72.2.180:80', '202.112.113.7:80',
            '218.108.85.59:81', '211.144.72.154:80', '119.254.88.53:8080', '121.14.145.132:82', '114.80.149.183:80',
            '111.161.30.239:80', '182.48.107.219:9000', '122.72.0.28:80', '125.39.68.131:80', '118.244.190.6:80',
            '120.132.132.119:88', '211.167.112.15:82', '221.2.80.126:8888', '219.137.229.214:3128', '125.39.66.131:80',
            '61.181.22.157:80', '115.25.216.6:80', '119.7.221.137:82', '221.195.42.195:8080', '119.254.88.53:8080',
            '219.150.254.158:8080', '113.9.163.101:8080', '222.89.154.14:9000', '114.141.162.53:8080', '218.5.74.199:3128',
            '61.152.108.187:80', '218.76.159.133:80', '59.34.57.88:8080', '118.244.190.34:80', '59.172.208.189:8080',
            '116.236.216.116:8080', '111.161.30.233:80', '220.248.237.234:8080', '121.14.145.132:82', '202.114.205.125:8080')
        i = random.randint(0, len(proxy_lt) - 1)
        agency_ip = proxy_lt[i]
        # map the 'http' scheme to the chosen proxy
        proxies = {'http': 'http://%s' % agency_ip}
        opener = urllib.FancyURLopener(proxies)
        req = opener.open(url)
        page = req.read()
        # Baidu Zhidao pages are served as GBK
        page = page.decode('gbk')
        return page
    def dealPageInfo(self, page, url):
        """Extract the contents of the <title> tag from the page.
        Arguments: page -- page source
                   url  -- page address
        """
        title = self.cutSpit(page, '<title>', '</title>')
        # skip Baidu's "information notice" error page
        if title.find(u'百度知道 - 信息提示') != -1:
            return
        info_lt = [url, title]
        self.saver.insertPrice(info_lt)
    def mainMehod(self):
        """Main loop: crawl question pages 0..max-1."""
        for i in xrange(self.max):
            url = self.index + '%s.html' % i
            #print url
            page = self.getPageToAgency(url)
            self.dealPageInfo(page, url)
        # close the database
        self.saver.closeDatabase()
class SaverMySqlClass(object):

    def __init__(self):
        # connect to the database
        self.conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='',
                                    db='test', port=3306, charset="utf8")
        self.cur = self.conn.cursor()

    def closeDatabase(self):
        self.cur.close()
        self.conn.close()

    def insertPrice(self, info_lt):
        """Write one (url, title) record into the database."""
        # parameterized query: the driver escapes any quotes in the title
        sql = 'insert into tablename1 (url, title) values (%s, %s)'
        try:
            self.cur.execute(sql, info_lt)
            self.conn.commit()
        except Exception, e:
            print e
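# Note (not part of the original post): insertPrice assumes an existing table
# named tablename1 with url and title columns. A minimal schema sketch; the
# column types and sizes here are assumptions:
#
#   CREATE TABLE tablename1 (
#       url   VARCHAR(255),
#       title VARCHAR(255)
#   ) DEFAULT CHARSET=utf8;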
if __name__ == '__main__':
    # highest question id to visit
    max = 2000
    CrawlerClass(max).mainMehod()
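The addresses in proxy_lt date from 2014 and are unlikely to still answer, so before a real run it is worth filtering the tuple down to proxies that still respond. Below is a minimal sketch, assuming urllib2 is available; the test URL and the 5-second timeout are illustrative choices, not part of the original post.

import urllib2

def filterLiveProxies(proxy_lt, test_url='http://www.baidu.com'):
    """Return only the proxies that can fetch test_url within the timeout."""
    live = []
    for ip in proxy_lt:
        handler = urllib2.ProxyHandler({'http': 'http://%s' % ip})
        opener = urllib2.build_opener(handler)
        try:
            opener.open(test_url, timeout=5)
            live.append(ip)
        except Exception:
            pass  # dead or unreachable proxy: skip it
    return live

getPageToAgency could then pick randomly from the filtered list instead of the raw tuple.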