My code is below. The goal is to crawl comment data from a site, but after opening the page with urllib2.urlopen() I can't read any content from it: read() returns an empty string.
The urlopen() call itself succeeds and returns code 200.
I don't know whether the site detects the crawler and simply refuses to return data. Any pointers from the experts here would be greatly appreciated.
# encoding: utf-8
import json
import urllib
import urllib2
import re
import sys

reload(sys)
sys.setdefaultencoding("utf-8")


def getproductID(productID, page):
    # Build the comment-API URL for one page of reviews of the given product
    url = ("https://club.jd.com/comment/productPageComments.action?productId=" + productID +
           "&score=0&sortType=5&page=" + str(page) + "&pageSize=10&isShadowSku=0")
    user_agent = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36')
    headers = {'User-Agent': user_agent}
    req = urllib2.Request(url, headers=headers)
    try:
        response = urllib2.urlopen(req)
    except urllib2.URLError, e:
        # HTTPError (a subclass of URLError) carries an HTTP status code
        if hasattr(e, "code"):
            print "Error code is: %s" % e.code
        if hasattr(e, "reason"):
            print "Page %d failed to connect to the server!" % page
            print "The reason is: %s" % e.reason
        return

    data = response.read()
    # The endpoint serves GBK; re-encode to UTF-8 before parsing
    html = data.decode("gbk", "ignore").encode("utf-8")
    comment_file = "comments.txt"
    if data:
        j = json.loads(html)
        if len(j["comments"]) == 0:
            print "Job is done!"
            print "Total number of pages: %d" % page
        else:
            # Append nickname and content of every comment on this page
            for i in range(len(j["comments"])):
                with open(comment_file, 'a') as fh:
                    fh.write(j["comments"][i]["nickname"] + ':')
                    fh.write(j["comments"][i]["content"] + '\n')
    else:
        # Empty body: dump the raw and decoded data for debugging
        with open("tmp.txt", 'a') as tmp:
            tmp.write(str(page) + ": " + data + '\n')
        with open("tmp_comment.txt", 'a') as tmp:
            tmp.write(str(page) + ": " + html + '\n')
        with open(comment_file, 'a') as fh:
            fh.write('Page: ' + str(page) + ' Code: ' + str(response.getcode()) + '\n')
            #fh.write(response.info())
            fh.write('\n')


if __name__ == '__main__':
    for i in range(100000):
        getproductID('2967927', i)
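For what it's worth, two things I would rule out first when a 200 response comes back with an empty read() are a gzip-compressed body (urllib2 does not decompress it for you) and a missing Referer header. Below is a minimal diagnostic sketch under those assumptions, not a confirmed fix: it prints the response headers so you can check Content-Length and Content-Encoding, sends a Referer (the item-page URL is only a guess), and decompresses a gzip body before decoding. The helper name fetch_page is mine, not from the original code.

# encoding: utf-8
# Diagnostic sketch, Python 2: inspect headers and handle a possible gzip body.
import gzip
import urllib2
from StringIO import StringIO

def fetch_page(product_id, page):
    url = ("https://club.jd.com/comment/productPageComments.action?productId=%s"
           "&score=0&sortType=5&page=%d&pageSize=10&isShadowSku=0") % (product_id, page)
    req = urllib2.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36')
    # Assumption: the endpoint may return an empty body without a Referer.
    req.add_header('Referer', 'https://item.jd.com/%s.html' % product_id)
    response = urllib2.urlopen(req)
    print response.getcode()
    print response.info()  # check Content-Length and Content-Encoding here
    data = response.read()
    if response.info().get('Content-Encoding') == 'gzip':
        # urllib2 hands back the compressed bytes unchanged
        data = gzip.GzipFile(fileobj=StringIO(data)).read()
    return data.decode("gbk", "ignore")

if __name__ == '__main__':
    print fetch_page('2967927', 0)[:200]

If the headers show Content-Length: 0, the server really is sending nothing; in that case adding a short sleep between requests and double-checking the page parameter would be the next things I would try.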