- 论坛徽章:
- 0
|
毕设时写的一个抓取图片URL的"类爬虫程序",更新为第三个版本
#!/usr/bin/python
from htmllib import *
import formatter,urllib2,re,threading,sys,urllib
import urlparse
class imageParser(HTMLParser):
    """HTML parser that writes the absolute URL of every <img> tag's
    ``src`` attribute to an output file, one URL per line."""

    def __init__(self, outfile, baseurl):
        # outfile: writable file object shared with the crawler
        # baseurl: URL of the page being parsed; used to resolve
        #          relative image links
        self.out = outfile
        self.baseurl = baseurl
        HTMLParser.__init__(self, formatter.NullFormatter())

    def do_img(self, attrs):
        """Handle an <img> tag: emit its src as an absolute URL.

        BUG FIX: the original test ``src[0] == "h"`` misclassified any
        relative path starting with "h" (e.g. "header.png") as absolute
        and wrote it unresolved.  urljoin returns absolute URLs
        unchanged and resolves relative ones, so it covers both cases.
        """
        for name, value in attrs:
            if name == "src":
                self.out.write("%s\n" % urlparse.urljoin(self.baseurl, value))
class pageWorker(threading.Thread):
    """Worker thread that fetches one URL, records it if its
    Content-Type matches any accepted pattern and, for HTML pages,
    spawns a child worker per anchor up to ``level`` levels deep."""

    def __init__(self, output, url, accepts, level):
        threading.Thread.__init__(self)
        self.output = output        # shared writable file object
        self.workurl = url          # URL this worker fetches
        self.acceptlist = accepts   # list of content-type regexes
        self.level = level          # remaining recursion depth

    def run(self):
        self.getUrlList(self.output, self.workurl, self.acceptlist, self.level)

    def getUrlList(self, output, url, accepts, level):
        """Fetch ``url``; write it to ``output`` when its content type
        matches one of ``accepts``; recurse into HTML links at level-1.

        BUG FIX: the original used ``finally: if not fd: return``,
        and a ``return`` inside ``finally`` silently discards *every*
        in-flight exception (including KeyboardInterrupt).  Fetch
        failures are expected while crawling, so swallow only
        urllib2.URLError explicitly.
        """
        if not level:
            return
        try:
            fd = urllib2.urlopen(urllib2.Request(url))
        except urllib2.URLError:
            return  # dead link -- skip it, keep crawling
        try:
            content_type = fd.info()["content-type"]
            for accept in accepts:
                if re.match(accept, content_type):
                    output.write("%s\n" % url)
            if re.match("text/html.*", content_type):
                # Use the image-extracting parser only when images were
                # requested; the plain parser still collects anchors.
                if "image/.*" in accepts:
                    parser = imageParser(output, url)
                else:
                    parser = HTMLParser(formatter.NullFormatter())
                parser.feed(fd.read())
                for suburl in parser.anchorlist:
                    pageWorker(output, suburl, accepts, level - 1).start()
        finally:
            # Close the connection even when parsing raises.
            fd.close()
#see http://www.iana.org/assignments/media-types/
if __name__ == "__main__":
sys.stderr=open("err.log","w")
accepts = []
print sys.argv
if len(sys.argv) >2:
output = open(sys.argv[2],"w")
else:
output = sys.stdout
argv = sys.argv[2:]
for arg in argv:
if arg == "image":
accepts.append("image/.*")
elif arg == "video":
accepts.append("video/.*")
elif arg == "audio":
accepts.append("audio/.*")
rootpageworker = pageWorker(output,sys.argv[1],accepts,3)
rootpageworker.run(); |
不知道找个javascript的虚拟机能不能分析那些动态页面
[ 本帖最后由 reiase 于 2008-8-16 12:15 编辑 ] |
|