I often browse meitu.com for pictures, so I wanted to build a crawler to download them automatically. I fiddled with it for a week, and the dynamic links just would not parse out. At first I thought my hand-written crawler wasn't up to it, so I studied scrapy for a while, expecting it to capture the dynamic links, but no luck there either; http://blog.scrapy.org/scraping-ajax-sites-with-scrapy also says you have to analyze them by hand. Thanks to crifan's tutorials, I used Firebug to analyze the ajax dynamic links, and the crawl finally runs automatically. The code is written fairly loosely and is just one approach; everyone is welcome to study and discuss it.
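For anyone who only wants the trick that Firebug exposed, here is a minimal sketch of just that step, using the same regexes and ajaxDetail url format as the full program below. The sample photo page is one of the links from the flow notes, and all error handling is left out:

import re
import urllib2

# A photo page carries two hidden inputs, target_id and album_id; the
# ajaxDetail link built from them returns JSON whose thumb_pic field is the
# real image url, with its slashes backslash-escaped.
photoPage = urllib2.urlopen("http://www.meitu.com/12873415/photo/69962").read()
targetId = re.findall("id=\"target_id\" value=\"(\d+)\"", photoPage)[0]
albumId = re.findall("id=\"album_id\" value=\"(\d+)\"", photoPage)[0]
ajaxUrl = ("http://www.meitu.com/photos/ajaxDetail?id=" + targetId +
           "&album_id=" + albumId + "&direction=")
jsonPage = urllib2.urlopen(ajaxUrl).read()
# Strip the backslashes that escape the slashes in the JSON string
picUrl = re.findall("\"thumb_pic\":\"(http:[^\"]+)\"", jsonPage)[0].replace("\\", "")
print picUrl  # e.g. http://img14.meitudata.com/...!thumbnail1000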
import urllib
import urllib2
import urlparse
import sgmllib
import re
import os

# Flow:
# 1. Entry link: http://www.meitu.com
# 2. Parse out the photo page links it contains, e.g. http://www.meitu.com/12873415/photo/69962, http://www.meitu.com/22266296/photo/69366
# 3. Parse each link from step 2 to get two id values (the two ids used in step 4)
# 4. Build the ajax dynamic link, format: http://www.meitu.com/photos/ajax ... 6773&direction=, and send the request
# 5. Parse the response from step 4 to get the image download link: http://img14.meitudata.com/20121 ... l.jpg!thumbnail1000
class findLink(sgmllib.SGMLParser):
    def __init__(self):
        sgmllib.SGMLParser.__init__(self)
        self.links = []

    # Collect links that contain photos, e.g. http://www.meitu.com/12873415/photo/69962
    def start_a(self, attributes):
        for link in attributes:
            tag, attr = link[0:2]
            if "href" == tag:
                self.links.append(attr)
class Retrieve(object):
    def __init__(self, url):
        # Local directory where downloaded pictures are saved
        self.localdir = "F:\\pic\\autoCatch"
        self.url = url
        self.file = self.filePath(url)

    def filePath(self, url):
        if len(url):
            # Parse http://img14.meitudata.com/20121 ... l.jpg!thumbnail1000 into a
            # folder and file name, e.g. F:\pic\autoCatch\201211\fhb1plyt88ty9nz2ol.jpg
            parsedurl = urlparse.urlparse(url)
            path = parsedurl[2]
            path = path.split("/")
            name = path[-1].split(".")[0]
            path = "\\" + path[1]
            path = self.localdir + path + "\\"
            try:
                os.mkdir(path)
            except OSError:
                print "dir already exists!"
            name = path + name + '.jpg'
        else:
            raise RuntimeError("parsing url is empty!")
        return name
    def download(self):
        if len(self.url):
            urllib.urlretrieve(self.url, self.file)
            print "Download %s is OK" % self.file
        else:
            raise RuntimeError("download url is empty!")
class spider(object):
    def __init__(self, url):
        self.url = url

    def getPage(self, url):
        # Fetch the page, sending a browser-like User-Agent header
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        self.page = opener.open(url).read()
        return self.page
    def parseUrl(self):
        linkUrl = findLink()
        linkUrl.feed(self.page)
        rFilter = "http://www.meitu.com/([\d]+)/photo/([\d]+)"
        self.q = []
        for link in linkUrl.links:
            success = re.match(rFilter, link)
            if success is not None:
                self.q.append(success.group())
        return self.q
    def parsePicAndGetUrl(self, page):
        # Pull target_id and album_id out of the photo page, then build the ajax link
        rTarget_id = "id=\"target_id\" value=\"(\d+)\""
        targetList = re.findall(rTarget_id, page)
        targetId = targetList[0]
        rAlbum_id = "id=\"album_id\" value=\"(\d+)\""
        albumList = re.findall(rAlbum_id, page)
        albumId = albumList[0]
        picUrl = "http://www.meitu.com/photos/ajaxDetail?id=" + targetId + "&album_id=" + albumId + "&direction="
        return picUrl
    def parsePicUrl(self, page):
        # The ajax response is JSON; thumb_pic holds the image url with escaped slashes
        rFilterPic = "\"thumb_pic\":\"(http:.+thumbnail\d{3}\d+)"
        finalPicUrlList = re.findall(rFilterPic, page)
        if len(finalPicUrlList):
            finalPicUrl = finalPicUrlList[0]
            # Strip the backslashes that escape the slashes in the JSON string
            finalPicUrl = "".join(finalPicUrl.split("\\"))
            print finalPicUrl
        else:
            finalPicUrl = ""
            print "Can't find picture url in page!"
        return finalPicUrl
    def go(self):
        self.getPage(self.url)
        print "Get '%s' page is OK!" % self.url
        self.parseUrl()
        print "Parse '%s' is OK" % self.url
        i = 0
        for qUrl in self.q:
            # Only fetch the first 10 links; see the note on rate limits at the end
            if i == 10:
                break
            i += 1
            qPage = self.getPage(qUrl)
            print "Get '%s' page is OK!" % qUrl
            picUrl = self.parsePicAndGetUrl(qPage)
            print "Parse '%s' dynamic url is OK!" % picUrl
            picPage = self.getPage(picUrl)
            print "Get '%s' page is OK!" % picUrl
            finalPicUrl = self.parsePicUrl(picPage)
            print "Get finalPicUrl '%s' is OK" % finalPicUrl
            if not finalPicUrl:
                print "picture url for '%s' is empty" % qUrl
            else:
                picRetrive = Retrieve(finalPicUrl)
                picRetrive.download()
        print "Download all pictures OK! Total is", len(self.q)
################# start #########################
if __name__ == '__main__':
    url = "http://www.meitu.com"
    mySpider = spider(url)
    mySpider.go()
This is purely a post for discussion, criticism welcome. Mind the download count: keep crawling nonstop and the site may well ban your ip, heh.
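On the ip-ban point: one easy safeguard, not in the program above, is to pause between requests. A minimal sketch, where politeOpen is a hypothetical helper and the two-second delay is an arbitrary choice:

import time
import urllib2

def politeOpen(url, delay=2):
    # Sleep before every request so a long crawl is less likely to get the ip banned
    time.sleep(delay)
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    return opener.open(url).read()

To use it, spider.getPage could simply return politeOpen(url) instead of calling opener.open(url).read() directly, so every page and picture fetch is throttled.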