Thanks for the help above.
The approach I tested last does work, but it is still slow.
After a lot of searching I found a way to speed things up, though unfortunately it requires splitting the input file:
the 1000 URLs have to be split into files of 100 lines each (100-line files averaged the fastest in my tests; 5 such files finished in only 41s). One way to do the split is sketched after the code below.
Here is the code. It also uses a thread pool; the duplication is heavy (a parameterized variant is sketched after the code as well), so experts, please go easy on me.
#!/usr/bin/env python
#coding=utf-8
import Queue, threading, sys
from threading import Thread
import time, urllib2
import os

# working thread
class Worker(Thread):
    worker_count = 0
    def __init__(self, workQueue, resultQueue, timeout=0, **kwds):
        Thread.__init__(self, **kwds)
        self.id = Worker.worker_count
        Worker.worker_count += 1
        self.setDaemon(True)
        self.workQueue = workQueue
        self.resultQueue = resultQueue
        self.timeout = timeout
    def run(self):
        ''' the get-some-work, do-some-work main loop of worker threads '''
        while True:
            try:
                # 'func' avoids shadowing the builtin callable()
                func, args, kwds = self.workQueue.get(timeout=self.timeout)
                res = func(*args, **kwds)
                print "worker[%2d]: %s" % (self.id, str(res))
                self.resultQueue.put(res)
            except Queue.Empty:
                break
            except:
                print 'worker[%2d]' % self.id, sys.exc_info()[:2]
class WorkerManager:
    def __init__(self, num_of_workers=10, timeout=1):
        self.workQueue = Queue.Queue()
        self.resultQueue = Queue.Queue()
        self.workers = []
        self.timeout = timeout
        self._recruitThreads(num_of_workers)
    def _recruitThreads(self, num_of_workers):
        for i in range(num_of_workers):
            worker = Worker(self.workQueue, self.resultQueue, self.timeout)
            self.workers.append(worker)
    def start(self):
        for w in self.workers:
            w.start()
    def wait_for_complete(self):
        # ...then, wait for each of them to terminate:
        while len(self.workers):
            worker = self.workers.pop()
            worker.join()  # blocks until the worker exits
            if worker.isAlive() and not self.workQueue.empty():
                self.workers.append(worker)
        print "All jobs are completed."
    def add_job(self, func, *args, **kwds):
        self.workQueue.put((func, args, kwds))
    def get_result(self, *args, **kwds):
        return self.resultQueue.get(*args, **kwds)
start = time.time()

def test_job1(id, sleep=0.001):
    file_path = os.path.join(os.path.dirname(__file__), 't100.txt')
    # open the URL list for reading
    f = open(file_path, 'r')
    for eachLine in f:
        # skip comment lines that start with '#'
        if eachLine[:1] != '#':
            try:
                # the host is the second whitespace-separated field
                host = 'http://' + eachLine.split()[1]
                urllib2.urlopen(host, timeout=1)
                print "ok %s," % host
            except:
                print "error %s," % eachLine.strip()
    f.close()
    return id

def test_job2(id, sleep=0.001):
    # identical to test_job1 except for the file name
    file_path = os.path.join(os.path.dirname(__file__), 't200.txt')
    f = open(file_path, 'r')
    for eachLine in f:
        if eachLine[:1] != '#':
            try:
                host = 'http://' + eachLine.split()[1]
                urllib2.urlopen(host, timeout=1)
                print "ok %s," % host
            except:
                print "error %s," % eachLine.strip()
    f.close()
    return id

def test_job3(id, sleep=0.001):
    # identical to test_job1 except for the file name
    file_path = os.path.join(os.path.dirname(__file__), 't300.txt')
    f = open(file_path, 'r')
    for eachLine in f:
        if eachLine[:1] != '#':
            try:
                host = 'http://' + eachLine.split()[1]
                urllib2.urlopen(host, timeout=1)
                print "ok %s," % host
            except:
                print "error %s," % eachLine.strip()
    f.close()
    return id

def test_job4(id, sleep=0.001):
    # identical to test_job1 except for the file name
    file_path = os.path.join(os.path.dirname(__file__), 't400.txt')
    f = open(file_path, 'r')
    for eachLine in f:
        if eachLine[:1] != '#':
            try:
                host = 'http://' + eachLine.split()[1]
                urllib2.urlopen(host, timeout=1)
                print "ok %s," % host
            except:
                print "error %s," % eachLine.strip()
    f.close()
    return id

def test_job5(id, sleep=0.001):
    # identical to test_job1 except for the file name
    file_path = os.path.join(os.path.dirname(__file__), 't500.txt')
    f = open(file_path, 'r')
    for eachLine in f:
        if eachLine[:1] != '#':
            try:
                host = 'http://' + eachLine.split()[1]
                urllib2.urlopen(host, timeout=1)
                print "ok %s," % host
            except:
                print "error %s," % eachLine.strip()
    f.close()
    return id
def test():
    import socket
    socket.setdefaulttimeout(10)
    print 'start testing'
    # number of worker threads
    wm = WorkerManager(10)
    # i is the number of passes
    for i in [1]:
        wm.add_job(test_job1, i, i * 0.001)
        wm.add_job(test_job2, i, i * 0.001)
        wm.add_job(test_job3, i, i * 0.001)
        wm.add_job(test_job4, i, i * 0.001)
        wm.add_job(test_job5, i, i * 0.001)
    wm.start()
    wm.wait_for_complete()
    print 'end testing'

test()
print "Elapsed Time: %s" % (time.time() - start)
============================
I have seen many claims online that Stackless is faster than Python's own threads, but the version I wrote was still very slow; I guess I haven't learned it properly. Could anyone give me an example of that?
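
For reference, the bare-bones tasklet pattern as I understand it looks roughly like the sketch below (assumes Stackless Python 2.x; not a tested speed-up). One thing worth noting: tasklets are cooperatively scheduled, so a blocking urllib2.urlopen() call suspends the whole scheduler instead of overlapping with other tasklets, which may be exactly why my attempt stayed slow.

#!/usr/bin/env python
#coding=utf-8
# Minimal Stackless sketch (assumes Stackless Python 2.x is installed).
# NOTE: tasklets are cooperative -- a blocking call such as urllib2.urlopen()
# suspends the whole scheduler, so plain tasklets do not overlap network I/O
# the way OS threads do.
import stackless

def job(name):
    print "tasklet %s: step 1" % name
    stackless.schedule()            # yield so other tasklets can run
    print "tasklet %s: step 2" % name

for n in ('a', 'b', 'c'):
    stackless.tasklet(job)(n)       # create and bind one tasklet per name
stackless.run()                     # run the scheduler until all finish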