- 论坛徽章:
- 0
|
我写了一个多线程抓取9游游戏论坛帖子的爬虫，每次最多建立100个线程去抓取。当 startPage=1、endPage<100 的时候可以正常运行，但当 endPage>100 的时候，总是在处理完第 1~101 页之后进程被自动 Killed 掉。请问这是什么原因？有什么解决的办法？

- #!/usr/bin/env python
- # encoding: utf-8
- import sys
- import urllib2
- import threading
- from bs4 import BeautifulSoup
# Pagination suffix appended to every page URL (forum URLs end in "-<page>-1.html").
suffix = "-1.html"
def analyse(start, end, url):
    """Fetch pages [start, end] of a forum thread in bounded thread batches.

    Pages are processed in batches of at most 100 worker threads: each batch
    is created, started, and fully joined before the next batch begins.
    Bounding the batch size matters because every getContent() worker holds a
    full HTML page plus its parsed BeautifulSoup tree in memory at once; an
    oversized batch can exhaust memory and get the process OOM-killed
    (the "Killed after page 101" symptom described above).

    :param start: first page number to fetch (inclusive)
    :param end:   last page number to fetch (inclusive)
    :param url:   thread URL prefix; a page URL is url + str(page) + suffix
    """
    threads = []
    while start <= end:
        i = 0
        while start <= end:
            # BUG FIX: the original tested "i > 100", which let 101 threads
            # into each batch; stop as soon as 100 have been created.
            if i >= 100:
                break
            t = threading.Thread(target=getContent,
                                 args=(url + str(start) + suffix, i))
            threads.append(t)
            start = start + 1
            i += 1
        print(i)
        for thread_obj in threads:
            thread_obj.start()
        # Wait for the whole batch before launching the next one.
        for thread_obj in threads:
            thread_obj.join()
        del threads[:]
-
- def getContent(url, index):
- req = urllib2.Request(url, headers={"user-Agent":"Magic Browser"})
- page = urllib2.urlopen(req)
- html = page.read()
- soup = BeautifulSoup(html)
- userName = []
- floorNum = []
- replyCon = []
- result = ""
- #get user name
- for auth in soup.find_all('div',{"class":"i y"}):
- for ya in auth.stripped_strings:
- userName.append(unicode(ya))
- break
- #get floor number
- click = "setCopy(this.href, '帖子地址复制成功');return false;"
- for s in soup.find_all('a', {"onclick": click}):
- for ya in s.strings:
- floorNum.append(unicode(ya))
- break
- #get reply content
- for s in soup.find_all('div', {"class": "t_fsz"}):
- tmpList = []
- for ya in s.stripped_strings:
- tmpList.append(unicode(ya))
- replyCon.append(''.join(tmpList))
- del tmpList[:]
- #merge name, number, replay and return
- i = 0
- while i < len(userName):
- single = floorNum[i] + "\t\t" + userName[i] + "\t\t" + replyCon[i] + "\n"
- print single
- result = result + single
- i = i + 1
- page.close()
if __name__ == "__main__":
    # Python 2 hack: force UTF-8 as the default codec so implicit
    # str/unicode mixing of the Chinese forum text does not raise
    # UnicodeDecodeError inside the worker threads.
    reload(sys)
    sys.setdefaultencoding('utf8')
    # Interactive variants kept for reference:
    #url = raw_input("输入待分析帖子的ID:")
    #startPage = string.atoi(raw_input("输入起始页码:"))
    #endPage = string.atoi(raw_input("输入终止页码:"))
    #url = "http://bbs.9game.cn/thread-6688328-4-1.html"
    thread_id = 6651496
    startPage, endPage = 1, 130
    base_url = "http://bbs.9game.cn/thread-" + str(thread_id) + "-"
    if endPage < startPage:
        print("页码参数错误")
        exit(0)
    analyse(startPage, endPage, base_url)
复制代码 |
|