论坛徽章:: 0

电梯直达

1楼 [收藏(0)] [报告]

发表于 2013-01-09 09:28 |只看该作者 |倒序浏览

#! /usr/bin/env python
# -*- coding = utf-8 -*-
from time import sleep
class WeiboSpider:
'''
crawl sina weibo comments
'''
def __init__(self, user_id):
self.user_id = user_id
def delay_time(self):
'''
limit to access sina server delay time
'''
sleep(60)
def get_friendships(self, user_id):
'''
get user's(uid=user_id) following list
default setting is 500
'''
self.user_id = user_id
friendship = client.get.friendships__friends__ids(uid=user_id)
uid_set = []
for each_uid in friendship.ids:
uid_set.append(each_uid)
return uid_set
def get_user_info(self, user_id):
'''
get user id=user_id basic information
'''
self.user_id = user_id
user_info = client.get.user__show(uid=user_id)
user = []
user.append(user_info.id)
user.append(user_info.screen_name)
user.append(user_info.gender)
return user
def get_statuses(self, user_id):
'''
get 50 statuses from user id=user_id
'''
self.user_id = user_id
status = client.get.statuses__user_timeline(uid=user_id, count=50)
status_id = []
status_writer = []
status_time = []
status_text = []
status_comments = []
for each_status in status.statuses:
status_id.append(each_status.id)
status_writer.append(each_status.user.screen_name)
status_time.append(each_status.created_at)
status_text.append(each_status.text)
status_comments.append(each_status.comments_count)
return (status_id, status_writer, status_time, status_text, status_comments)
def get_comments(self, status_id):
'''
get 50 comments on statuses id=status_id
'''
self.status_id = status_id
comment = client.get.comments__show(id=status_id, count=50)
comment_writer_id = []
comment_writer = []
comment_time = []
comment_text = []
for each_comment in comment.comments:
comment_writer_id.append(each_comment.user.id)
comment_writer.append(each_comment.user.screen_name)
comment_time.append(each_comment.created_at)
comment_text.append(each_comment.text)
return (comment_writer_id, comment_writer, comment_time, comment_text)
if __name__ == "__main__":
from client import Client
client = Client()
client = client.set_client()
user_id = raw_input("Please input user's id: ")
spider = WeiboSpider(user_id)
uid_set = spider.get_friendships(user_id)
out_file = open('comments.dat', 'w')
for each_uid in uid_set:
(temp_status_id, temp_status_writer, temp_status_time,
temp_status_text, temp_status_comments) = spider.get_statuses(each_uid)
print "number of statuses before filter: %d" % len(temp_status_id)
status_id = []
status_writer = []
status_time = []
status_text = []
status_comments = []
for k in range(len(temp_status_id)):
if temp_status_comments[k] != 0:
status_id.append(temp_status_id[k])
status_writer.append(temp_status_writer[k])
status_time.append(temp_status_time[k])
status_text.append(temp_status_text[k])
status_comments.append(temp_status_comments[k])
print "number of statuses after filter: %d" % len(status_id)
for i in range(len(status_id)):
print ">Grabbing data of: %s" % status_writer[i]
writer = status_writer[i].encode('utf-8')
spider.delay_time()
(comment_writer_id, comment_writer, comment_time,
comment_text) = spider.get_comments(status_id[i])
for j in range(len(comment_text)):
if comment_writer_id[j] == int(user_id):
name = comment_writer[j].encode('utf-8')
time = comment_time[j].encode('utf-8')
text = comment_text[j].encode('utf-8')
out_file.write(name + ' ' + writer + ' ' +
time + ' ' + text + '\n')
out_file.close()

复制代码

这是个新浪微薄的爬虫程序抓取某个用户在其所关注用户上的微薄的评论（突发奇想就写了个。。。。）

由于新浪的限制（我用了官方的API 太复杂的爬虫不会写）

访问时间间隔设置了60秒所以要爬很长时间

可是运行几个小时后就弹出 KeyError: 'user'

这样的

Traceback (most recent call last):
File "spider.py", line 112, in <module>
temp_status_text, temp_status_comments) = spider.get_statuses(each_uid)
File "spider.py", line 68, in get_statuses
status_writer.append(each_status.user.screen_name)
File "weibo.py", line 50, in __getattr__
return self[attr]

复制代码

weibo.py是官方的SDK
spider.py是我上面的代码

没查出原因新浪server的原因么？
导致 user这个变量没值？

文库|博客

你还未够水准呢

稍有积蓄

论坛徽章:: 0

2楼 [报告]

发表于 2013-01-09 09:44 |只看该作者

本帖最后由你还未够水准呢于 2013-01-09 09:45 编辑

初步学python 代码不足之处还望 @linux_c_py_php 指正

实战分享：从技术角度谈机器学习入门| 【大话IT】RadonDB低门槛向MySQL集群下战书 | ChinaUnix打赏功能已上线！ | 新一代分布式关系型数据库RadonDB知多少？

darkn3ss

稍有积蓄

论坛徽章:: 0

3楼 [报告]

发表于 2013-01-09 10:35 |只看该作者

官方api也不是万能的，新浪微博的python api，貌似是第三方维护的，不是新浪官方，新浪的官方语言是php

所以，feel free去weibo.py里面排错吧，python的调试信息说的蛮清楚了，连行数都标出来了，自己动手丰衣足食

实战分享：从技术角度谈机器学习入门| 【大话IT】RadonDB低门槛向MySQL集群下战书 | ChinaUnix打赏功能已上线！ | 新一代分布式关系型数据库RadonDB知多少？

你还未够水准呢

稍有积蓄

论坛徽章:: 0

4楼 [报告]

发表于 2013-01-10 14:28 |只看该作者

bug 找到了有的微薄被删掉了但是依然从服务器返回但是返回了空的列表所以就append出错

个人猜测。。。。。添加了判断语句

实战分享：从技术角度谈机器学习入门| 【大话IT】RadonDB低门槛向MySQL集群下战书 | ChinaUnix打赏功能已上线！ | 新一代分布式关系型数据库RadonDB知多少？

返回列表

Chinaunix › 论坛 › 程序设计 › Python › 有个新浪微薄爬虫的问题 KeyError

有个新浪微薄爬虫的问题 KeyError [复制链接]

浏览过的版块