The crawl example from Chapter 20 never runs through for me, and from the error output I can't locate the problem (I'm a beginner). Could someone walk me through a debugging process for it? Ideally using pdb, since I hear that is the general-purpose tool, but the method doesn't matter: whatever tracks the problem down quickly is a good method.
Also, articles on debugging in Python are scarce and mostly superficial; please recommend something systematic. The program is below, followed by a sketch of the pdb workflow I have in mind.
#!/usr/bin/env python
# Python 2 only: htmllib, formatter, cStringIO and the string functions
# imported below do not exist under these names in Python 3.

from sys import argv
from os import makedirs, unlink, sep
from os.path import dirname, exists, isdir, splitext
from string import replace, find, lower
from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse, urljoin
from formatter import DumbWriter, AbstractFormatter
from cStringIO import StringIO


class Retriever(object):                    # download Web pages
    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http:', 0)        # parse path
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':                    # no file, use default
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)                # local directory
        if sep != '/':                      # os-indep. path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):                 # create archive dir if nec.
            if exists(ldir):
                unlink(ldir)
            makedirs(ldir)
        print ldir                          # debug: show the local directory
        # print path
        return path

    def download(self):                     # download Web page
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url,)
        return retval

    def parseAndGetLinks(self):             # parse HTML, save links
        self.parser = HTMLParser(AbstractFormatter(
            DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist


class Crawler(object):                      # manage entire crawling process
    count = 0                               # static downloaded page counter

    def __init__(self, url):
        self.q = [url]
        self.seen = []
        self.dom = urlparse(url)[1]

    def getPage(self, url):
        r = Retriever(url)
        retval = r.download()
        print retval                        # debug: show what download() returned
        if retval[0][0] == '*':             # first element is the error string on failure
            print retval, '... skipping parse'
            return
        Crawler.count += 1
        print '\n(', Crawler.count, ')'
        print 'URL:', url
        print 'FILE:', retval[0]
        self.seen.append(url)

        links = r.parseAndGetLinks()        # get and process links
        for eachLink in links:
            if eachLink[:4] != 'http' and \
                    find(eachLink, '://') == -1:
                eachLink = urljoin(url, eachLink)
            print '* ', eachLink,

            if find(lower(eachLink), 'mailto:') != -1:
                print '... discarded, mailto link'
                continue

            if eachLink not in self.seen:
                if find(eachLink, self.dom) == -1:
                    print '... discarded, not in domain'
                else:
                    if eachLink not in self.q:
                        self.q.append(eachLink)
                        print '... new, added to Q'
                    else:
                        print '... discarded, already in Q'
            else:
                print '... discarded, already processed'

    def go(self):                           # process links in queue
        while self.q:
            url = self.q.pop()
            self.getPage(url)


def main():
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url:
        return
    robot = Crawler(url)
    robot.go()


if __name__ == '__main__':
    main()
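For reference, this is the kind of pdb session I'm asking about, written out only as a rough sketch: crawl.py is just whatever name the script is saved under, the URL is a placeholder, and the little last_char() function at the end is a made-up illustration, not part of the program above.

# Option 1: run the whole script under the debugger, let it crash,
# then inspect the state in post-mortem mode.
#
#   $ python -m pdb crawl.py http://www.example.com
#   (Pdb) c          # run; on an uncaught exception pdb drops into post-mortem mode
#   (Pdb) w          # 'where': print the stack at the point of the crash
#   (Pdb) l          # list the source lines around the failing statement
#   (Pdb) p url      # print any variable visible in the current frame
#   (Pdb) u / d      # move up / down the stack to look at caller frames
#   (Pdb) q          # quit
#
# Option 2: hard-code a breakpoint with pdb.set_trace() near the suspect line.
# A tiny standalone illustration (Python 2 syntax, same as the program above):

import pdb

def last_char(path):
    pdb.set_trace()              # execution pauses here and the (Pdb) prompt appears
    return path[-1]              # raises IndexError when path is ''

if __name__ == '__main__':
    print last_char('abc/')      # step with 'n', inspect with 'p path'
    print last_char('')          # crashes -- the kind of error path[-1] in
                                 # filename() would hit if path were ever empty

Is that roughly the right routine, or is there a better way to narrow down where crawl.py actually fails?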