- 论坛徽章:
- 0
|
是我在xiaosuo的CU博客上找到的.... 看到很多文章丢失的问题, 所以发上来
http://blog.chinaunix.net/u/5251/showart_265499.html
有个问题：第一行的路径在不同的系统上是不一样的，自己找一下你的 python 在哪里；没装的就装上，用命令 whereis python 查找。
执行就跟SHELL脚本一样的,
xiaosuo@gentux python $ ./cubk http://blog.chinaunix.net/u/5251/
代码如下:
- #!/usr/bin/env python
- # Copyright (c) xiaosuo <xiaosuo@gmail.com>
- # License GPL v2 or above.
- import sys
- import getopt
- import urllib2
- import re
- import urlparse
- import string
- import distutils.dir_util
- import htmllib
- import formatter
class UrlQueue:
    """Work queue of blog-relative URLs to fetch exactly once.

    Only URLs located strictly below the blog's base URL are accepted;
    everything else (other sites, the base itself) is rejected.
    """

    def __init__(self, baseUrl):
        # Base is everything up to and including the last "/" of the
        # initial URL, e.g. ".../u/5251/index.html" -> ".../u/5251/".
        self.base = baseUrl[0:baseUrl.rfind("/") + 1]
        self.finished = []   # relative URLs already fetched
        self.pending = []    # relative URLs waiting to be fetched
        self._seen = set()   # O(1) membership index over both lists

    def hasPending(self):
        """Return True while there are URLs left to fetch."""
        return len(self.pending) > 0

    def baseUrl(self):
        """Return the common base URL (always ends with "/")."""
        return self.base

    def pop(self):
        """Remove and return the next pending URL, marking it finished."""
        url = self.pending.pop()
        self.finished.append(url)
        return url

    def append(self, url):
        """Queue *url* (absolute or relative) if it is new and under base.

        Returns True when the URL was queued, False when it lies outside
        the base URL or has been seen before.
        """
        absUrl = urlparse.urljoin(self.base, url)
        baseUrlLen = len(self.base)
        # Reject anything that is not strictly below the base URL.
        if len(absUrl) <= baseUrlLen or absUrl[0:baseUrlLen] != self.base:
            return False
        url = absUrl[baseUrlLen:]
        # Set lookup replaces the original linear scans of both lists.
        if url in self._seen:
            return False
        self._seen.add(url)
        self.pending.append(url)
        return True
class UrlFilter(htmllib.HTMLParser):
    """HTML parser that collects the href of every anchor it encounters."""

    def __init__(self):
        # NullFormatter: we only want the links, not rendered text output.
        htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
        self.hrefList = []

    def anchor_bgn(self, href, name, type):
        # Invoked by HTMLParser for each <a ...> tag; keep the raw href.
        self.hrefList.append(href)

    def clear(self):
        # Drop collected links before parsing the next page.
        del self.hrefList[:]
- class Grabber:
- def __init__(self, initUrl):
- self.initUrl = initUrl
- self.urlQueue = UrlQueue(initUrl)
- self.urlFilter = UrlFilter()
- def run(self):
- self.urlQueue.append(self.initUrl)
- i = 0
- while True:
- listUrl = "article_0_%d.html" % i
- absUrl = urlparse.urljoin(self.urlQueue.baseUrl(), listUrl)
- print "Fetching %s" % absUrl
- page = urllib2.urlopen(absUrl).read()
- self.urlFilter.clear()
- self.urlFilter.feed(page)
- self.urlFilter.close()
- valid = False
- for url in self.urlFilter.hrefList:
- self.urlQueue.append(url)
- if url[0:8] == "showart_":
- valid = True
- if not valid:
- break
- file(listUrl, "w").write(page)
- i = i + 1
- while self._grab():
- pass
- def _grab(self):
- if not self.urlQueue.hasPending():
- return False
- url = self.urlQueue.pop()
- absUrl = urlparse.urljoin(self.urlQueue.baseUrl(), url)
- print "Fetching %s" % absUrl
- page = urllib2.urlopen(absUrl).read()
- pos = url.rfind("/")
- if pos != -1:
- distutils.dir_util.mkpath(url[0:pos])
- file(url, "w").write(page)
- pos = url.rfind(".")
- if pos == -1:
- return True
- if string.lower(url[pos+1:pos+4]) != "htm":
- return True
- self.urlFilter.clear()
- self.urlFilter.feed(page)
- self.urlFilter.close()
- for url in self.urlFilter.hrefList:
- self.urlQueue.append(url)
- return True
- def showHelp(prg):
- print "CUBlog backup script.\n"
- print "Usage: %s [option]... initUrl" % prg
- print "Options:"
- print " -h, --help Show the help information"
- if __name__ == "__main__":
- baseUrl = "";
- # parse the arguments
- try:
- (opts, args) = getopt.getopt(sys.argv[1:], "h", \
- ["help"])
- except getopt.GetoptError:
- print "Wrong command line arguments."
- showHelp(sys.argv[0])
- sys.exit(1)
- for (o, a) in opts:
- if o in ("-h", "--help"):
- showHelp(sys.argv[0])
- sys.exit(0)
- if len(args) == 0:
- showHelp(sys.argv[0])
- sys.exit(1)
- url = args[0]
- if url.rfind("/") == len(url) - 1:
- url += "index.html"
- Grabber(url).run()
复制代码
当然可以另外写个SHELL程序,免得每次都要去输入你的网址,如下:
#!/bin/sh
# Just mind the path to cubk — simple, but saves retyping the URL each time.
./cubk http://blog.chinaunix.net/u/5251/
复制代码
[ 本帖最后由 zuii 于 2008-9-2 13:58 编辑 ] |
|