The EXE version can now be downloaded here: [url=http://bbs3.chinaunix.net/thread-1651077-1-1.html]http://bbs3.chinaunix.net/thread-1651077-1-1.html[/url]
Below is the early experimental version, posted for reference.
# -*- coding: gbk -*-
import urllib
import urllib2
import cookielib
import re
import os
from BeautifulSoup import BeautifulSoup
#--------------------------------------------------
#author :空气人儿
#blog         :http://hi.baidu.com/空气人儿
#last update :10/12/2009
#Tested in :Vista python v2.6.4
#TO DO : a lot
#version :v.1.0
#--------------------------------------------------
#-----------
# main function
#-----------
def main():
    global target_dir
    global n_perpage
    #---------------------------------
    # parameter initialization
    #---------------------------------
    blog_username = " "              # blog user name
    blog_password = " "              # blog password
    target_dir = 'D:\\tempbackup\\'  # backup directory for the blog files (must already exist)
    n_perpage = 15                   # posts shown per page, as configured in the Baidu blog settings
    #---------------------------------
    # log in and keep the cookies in the opener
    #---------------------------------
    cookie = cookielib.CookieJar()
    global opener
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
    login_data = urllib.urlencode({'username': blog_username, 'password': blog_password, 'mem_pass': 'on'})
    login_response = opener.open('http://passport.baidu.com/?login', login_data)  # the opener keeps the cookies for the later requests
    category = Getblog_category(blog_username)
    categoryhash = Getblog_categoryhash(category)
    Getblog_all(categoryhash)
#------------------------------------
# get the category list from the blog home page; returns a BeautifulSoup ResultSet (1-1)
#------------------------------------
def Getblog_category(blog_username):
    f = opener.open("http://hi.baidu.com/%s" % blog_username)
    bloghtml = f.read()
    f.close()
    bloghtm = unicode(bloghtml, 'gb2312', 'ignore').encode('utf-8', 'ignore')  # re-encode the home page from gb2312 to utf-8
    bloghtm_soup = BeautifulSoup(bloghtm)
    category_temp = bloghtm_soup.findAll("div", {"id": "m_artclg"})
    category = category_temp[0].findAll("div", {"class": "item"})
    return category
#-------------------------------
# build a dict mapping category (directory) names to their links (1-2)
#-------------------------------
def Getblog_categoryhash(category):
    categoryhash = {}
    for i in range(len(category)):
        category_a_content = category[i].a
        category_totalblog = int(category[i].contents[1][1:-1])  # total number of posts in this category
        print "category_totalblog=%s" % category_totalblog
        category_directoryname = ''.join(category_a_content.contents).encode('gbk')
        p = re.compile('[\\\/:*?"<>|]')  # strip characters that are not allowed in directory names
        category_directoryname = p.sub('', category_directoryname)
        category_directory = target_dir + category_directoryname
        if not os.path.exists(category_directory):
            os.mkdir(category_directory)  # create the directory for this category
            print "created directory", category_directory
        categoryhash[category_directoryname] = category_a_content.attrs[0][1].encode('gbk')
        if category_totalblog >= n_perpage:
            # the category spans several pages; add one entry (and one directory) per extra page
            biaohao = int(category_totalblog / n_perpage)
            while biaohao >= 1:
                linkend = "/index/%s" % biaohao
                print "biaohao = %s" % biaohao
                categoryhash[category_directoryname + str(biaohao)] = category_a_content.attrs[0][1].encode('gbk') + linkend
                category_directory = target_dir + category_directoryname + str(biaohao)
                if not os.path.exists(category_directory):
                    os.mkdir(category_directory)
                    print "created directory", category_directory
                biaohao = biaohao - 1
    return categoryhash
#-------------------------------------
# get all post links on one listing page of a category (2-1)
#-------------------------------------
def Getblog_subcategory(v):
    h = opener.open("http://hi.baidu.com/%s" % v)  # open the listing page of this category
    bloghtml2 = h.read()
    h.close()
    bloghtm2 = unicode(bloghtml2, 'gb2312', 'ignore').encode('utf-8', 'ignore')
    bloghtm2_soup = BeautifulSoup(bloghtm2)
    category2_temp = bloghtm2_soup.findAll("div", {"id": "m_blog"})
    category2 = category2_temp[0].findAll("div", {"class": "tit"})
    print "category2 has %s entries" % len(category2)
    return category2
#-------------------------------------
# build a dict mapping post titles to post links for one listing page (2-2)
#-------------------------------------
def Getblog_subhash(category2):
    category2hash = {}
    for i in range(len(category2)):
        category2_titletemp = category2[i].a
        if category2_titletemp is not None:
            category2_title = ''.join(category2_titletemp.contents).encode('gbk')
            category2hash[category2_title] = category2_titletemp.attrs[0][1].encode('gbk')
    return category2hash
#-------------------------------------
# fetch the post bodies for one listing page and save them as html files (2-3)
#-------------------------------------
def Getblog_content2(category2hash, k):
    for kk, vv in category2hash.items():
        hh = opener.open("http://hi.baidu.com/%s" % vv)
        bloghtml = hh.read()
        hh.close()
        bloghtm = unicode(bloghtml, 'gb2312', 'ignore').encode('utf-8', 'ignore')
        bloghtm_soup = BeautifulSoup(bloghtm)
        bloghtmcontent0 = bloghtm_soup.findAll("div", {"id": "m_blog"})
        bloghtmcontent = bloghtmcontent0[0].findAll("table")  # the post body lives in the first table
        p = re.compile('[\\\/:*?"<>|]')  # strip characters that are not allowed in file names
        kk = p.sub('', kk)
        hfile = open("%s%s.html" % ((target_dir + k + '\\'), kk), "wb")
        hfile.write(str(bloghtmcontent[0]))
        print "created %s%s.html" % ((target_dir + k + '\\'), kk)
        hfile.close()
#---------------------------------
# blog backup driver: walk every category (2-main)
#---------------------------------
def Getblog_all(categoryhash):
    for k, v in categoryhash.items():
        subcategory = Getblog_subcategory(v)
        hash2 = Getblog_subhash(subcategory)
        Getblog_content2(hash2, k)
#--------
# entry point
#--------
if __name__ == '__main__':
    main()
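
One caveat if you reuse the script as-is: only the per-category subdirectories are created with os.mkdir(), so the backup root (D:\tempbackup\ by default) has to exist before the first run. A minimal pre-flight guard, not part of the original script and using a hypothetical backup_root name, could look like this:

# hypothetical pre-flight check, not in the original script:
# make sure the backup root exists before main() starts creating category folders
import os

backup_root = 'D:\\tempbackup\\'   # same default path as target_dir in main()
if not os.path.isdir(backup_root):
    os.makedirs(backup_root)       # unlike os.mkdir, this also creates missing parent folders
    print "created backup root", backup_root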