- 论坛徽章:
- 0
|
# -*- coding: cp936 -*-
import urllib
page = 1
link = 1
url = ['']*350
while page <= 7:
con = urllib.urlopen('http://blog.sina.com.cn/s/articlelist_1191258123_0_'+str(page)+'.html').read()
title = con.find(r'<a title')
href = con.find(r'href=',title)
html = con.find(r'.html',href)
i = 0
while title != -1 and href != -1 and html != -1 and i<50:
url[i] = con[href + 6:html + 5]
print link,'---',url[i]
title = con.find(r'<a title',html)
href = con.find(r'href=',title)
html = con.find(r'.html',href)
i = i + 1
link = link + 1
else:
print '第',page,'页' ,'read finished ......'
page = page + 1
else:
print 'all page find '
j = 0
while j < 350:
content = urllib.urlopen(url[j]).read()
print 'downloading......',url[j],j
open(r'韩寒/'+url[j][-26:],'w+').write(content)
j = j + 1
else :
print 'download finished'
这是下载韩寒博客全部文章的代码,但只能下载到第一页的50篇,后面的都下载不到了, 不知道为什么?
错误代码:Traceback (most recent call last):
File "C:\Python27\blog2.py", line 31, in <module>
content = urllib.urlopen(url[j]).read()
File "C:\Python27\lib\urllib.py", line 87, in urlopen
return opener.open(url)
File "C:\Python27\lib\urllib.py", line 208, in open
return getattr(self, name)(url)
File "C:\Python27\lib\urllib.py", line 463, in open_file
return self.open_local_file(url)
File "C:\Python27\lib\urllib.py", line 477, in open_local_file
raise IOError(e.errno, e.strerror, e.filename)
IOError: [Errno 2] : ''
求好心人解答,谢谢 |
|