Python multi-threaded crawler keeps getting Killed
I wrote a multi-threaded crawler that scrapes thread posts from the 9game forum, creating at most 100 threads per batch. With startPage=1 it runs fine whenever endPage < 100, but once endPage > 100 it is always Killed automatically after finishing pages 1~101. What causes this, and how can it be fixed?

#!/usr/bin/env python
# encoding: utf-8
import sys
import urllib2
import threading
from bs4 import BeautifulSoup
suffix = "-1.html"
def analyse(start, end, url):
    threads = []
    while start <= end:
        # build one batch of threads; note that i runs 0..100 before the
        # break fires, so each batch actually covers 101 pages (hence "1~101")
        i = 0
        while start <= end:
            if i > 100:
                break
            t = threading.Thread(target=getContent, args=(url + str(start) + suffix, i))
            threads.append(t)
            start = start + 1
            i += 1
        print(i)
        # start the whole batch at once, then wait for every thread to finish
        for thread_obj in threads:
            thread_obj.start()
        for thread_obj in threads:
            thread_obj.join()
        del threads[:]
def getContent(url, index):
    req = urllib2.Request(url, headers={"User-Agent": "Magic Browser"})
    page = urllib2.urlopen(req)
    html = page.read()
    soup = BeautifulSoup(html, "html.parser")  # name a parser explicitly
    userName = []
    floorNum = []
    replyCon = []
    result = ""   # accumulated below but never returned
    # get user names (first stripped string of each author div)
    for auth in soup.find_all('div', {"class": "i y"}):
        for ya in auth.stripped_strings:
            userName.append(unicode(ya))
            break
    # get floor numbers; the onclick value must match the page's HTML
    # exactly, so the Chinese string literal is left as-is
    click = "setCopy(this.href, '帖子地址复制成功');return false;"
    for s in soup.find_all('a', {"onclick": click}):
        for ya in s.strings:
            floorNum.append(unicode(ya))
            break
    # get reply contents
    for s in soup.find_all('div', {"class": "t_fsz"}):
        tmpList = []
        for ya in s.stripped_strings:
            tmpList.append(unicode(ya))
        replyCon.append(''.join(tmpList))
        del tmpList[:]
    # merge floor number, name and reply; the original concatenated the
    # lists themselves (a TypeError), so index into them instead
    i = 0
    while i < len(userName):
        single = floorNum[i] + "\t\t" + userName[i] + "\t\t" + replyCon[i] + "\n"
        print single
        result = result + single
        i = i + 1
    page.close()
if __name__ == "__main__":
    reload(sys)
    sys.setdefaultencoding('utf8')
    #url = raw_input("Thread ID to analyse: ")
    #startPage = string.atoi(raw_input("Start page: "))
    #endPage = string.atoi(raw_input("End page: "))
    #url = "http://bbs.9game.cn/thread-6688328-4-1.html"
    url = 6651496
    startPage = 1
    endPage = 130
    url = "http://bbs.9game.cn/thread-" + str(url) + "-"
    if startPage > endPage:
        print("invalid page range")
        exit(0)
    analyse(startPage, endPage, url)
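
A bare "Killed" on Linux usually means the kernel's OOM killer terminated the process, and the simplest way to confirm that memory is the culprit is to log the process's peak RSS as the batches run. A minimal sketch using the standard resource module (Unix-only; peak_rss_mb is a hypothetical helper name, not part of the original script):

import resource

def peak_rss_mb():
    # ru_maxrss is reported in kilobytes on Linux (bytes on OS X)
    return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024.0

Printing peak_rss_mb() right after the join loop in analyse shows whether peak memory keeps climbing batch after batch instead of being released.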
qq17920, 2014-09-11 12:53:
I first ran this script on a VPS with 128 MB of RAM, where it kept getting Killed; later I ran it on my local machine and it went smoothly to completion. So the kills were most likely caused by requesting too much memory: running locally on Windows XP, the script's memory footprint stayed roughly between 200 MB and 300 MB, far more than the VPS has, so it was OOM-killed for using too much memory.
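
If memory is the constraint, the usual fix is to replace one-thread-per-page with a small fixed pool of workers fed from a queue: both the thread count and the number of downloaded, parsed pages held in memory are then capped no matter how many pages are crawled. A minimal Python 2 sketch, reusing getContent and suffix from the script above (N_WORKERS, worker and analyse_bounded are illustrative names, not part of the original script):

import Queue
import threading

N_WORKERS = 10

def worker(q):
    # each worker pulls page URLs until the main thread drains the queue
    while True:
        page_url = q.get()
        try:
            getContent(page_url, 0)   # the index argument is unused anyway
        finally:
            q.task_done()

def analyse_bounded(start, end, url):
    q = Queue.Queue(maxsize=N_WORKERS * 2)   # small buffer caps queued work
    for _ in range(N_WORKERS):
        t = threading.Thread(target=worker, args=(q,))
        t.daemon = True    # let the process exit once the queue is drained
        t.start()
    for page in range(start, end + 1):
        q.put(url + str(page) + suffix)      # blocks while the buffer is full
    q.join()                                 # wait until every page is done

With 10 workers, at most 10 pages are downloaded and parsed at any moment, which should fit comfortably in 128 MB; dropping the result string that getContent builds but never returns saves a little more per page.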