- 论坛徽章:
- 0
|
这段代码爬链接少的,可以正常返回,但爬链接多一些的,就返回不了了
不知道哪写错了,求教~
Python code
# coding:utf-8
import threading
import urllib
from Queue import Queue
from BeautifulSoup import BeautifulSoup
import re
class Worker(threading.Thread):
""" thread pool class """
def __init__(self, queue, deep, empty_urls):
super(Worker, self).__init__()
self.queue = queue
self.deep = deep
self.urls_list = empty_urls
def getpage(self, urltuple):
""" 得到一个页面的所有链接 """
level, url = urltuple
try:
page = urllib.urlopen(url).read()
except UnicodeEncodeError, uee:
page = urllib.urlopen(url.encode("utf-8").read()
print uee
return None
except IOError, ioe:
print ioe
return None
except Exception:
return None
if level >= self.deep:
return page
try:
soup = BeautifulSoup(page)
except:
print "抓取错误", url
return None
# print soup
urlall = soup.findAll('a', onclick=None, href=re.compile('^http^/'))
if url.endswith('/'):
url = url[:-1]
url = "http://www.baidu.com"
for i in urlall:
if i['href'].startswith('/'):
i['href'] = url + i['href']
self.urls_list.lock()
if i['href'] not in self.urls_list.urls:
self.urls_list.urls.append(i['href'])
self.queue.put((level+1, i['href']))
self.urls_list.release()
return page
def run(self):
while True:
urltuple = self.queue.get()
#self.log.write(urltuple[1])
try:
print urltuple[1]
except UnicodeEncodeError, uee:
print uee
print urltuple[1].encode("UTF-8"
page = self.getpage(urltuple)
self.queue.task_done()
class EmptyUrl(object):
"""docstring for EmptyUrl"""
def __init__(self):
super(EmptyUrl, self).__init__()
self.urls = []
self.mutex = threading.Lock()
def lock(self):
""" get_lock """
self.mutex.acquire()
def release(self):
""" release """
self.mutex.release()
def main():
""" start """
queue = Queue(0)
i = 0
url = "http://www.baidu.com/"
queue.put((0, url))
urls = EmptyUrl()
urls.urls.append(url)
while i < 50:
work = Worker(queue, 2, urls)
work.setDaemon(True)
work.start()
i += 1
queue.join()
if __name__ == '__main__':
main()
|
|