免费注册 查看新帖 |

Chinaunix

  平台 论坛 博客 文库
最近访问板块 发新帖
查看: 4129 | 回复: 2
打印 上一主题 下一主题

python 抓图片的两段程序 [复制链接]

论坛徽章:
0
跳转到指定楼层
1 [收藏(0)] [报告]
发表于 2011-04-28 15:07 |只看该作者 |倒序浏览
请教下有没有更智能,更一般化的方法。
  1. # -*- coding: utf-8 -*-
  2. #coding=utf-8
  3. #vim:fdm=marker:ts=2

  4. import os
  5. from os.path import join, getsize

  6. import re, os, shutil, sys, codecs

  7. streamWriter=codecs.lookup('utf-8')[-1]
  8. #sys.stdout = codecs.getwriter(locale.getdefaultlocale()[1])(sys.stdout, 'replace')
  9. sys.stdout=streamWriter(sys.stdout)
  10. reload(sys)
  11. sys.setdefaultencoding('utf8')
  12. sys.path.append('.')
  13. sys.path.append('..')


  14. import string, time, hashlib, types, logging, json, pdb

  15. #import psyco
  16. #psyco.full()
  17. #psyco.profile()
  18. #psyco.log()
  19. #from psyco.classes import *

  20. #from xml.dom import minidom
  21. #import poplib
  22. #import email

  23. from lxml import etree

  24. from urllib2 import Request, urlopen, URLError, HTTPError
  25. import urllib, urllib2

  26. import decimal, datetime

  27. import socket
  28. import httplib
  29. import StringIO

  30. #import dumper
  31. import cookielib
  32. import lxml.html as HTML

  33. debug_info = 0
  34. import socket
  35. socket.setdefaulttimeout(60)
  36. httplib.HTTPConnection.debuglevel = 1

  37. class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
  38.     def http_error_301(self, req, fp, code, msg, headers):
  39.         result = urllib2.HTTPRedirectHandler.http_error_301(
  40.             self, req, fp, code, msg, headers)
  41.         result.status = code
  42.         return result

  43.     def http_error_302(self, req, fp, code, msg, headers):
  44.         # if (req.get_host() == 'container.open.taobao.com'):
  45.         #return
  46.         result = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
  47.         result.status = code
  48.         return result

  49. class DefaultErrorHandler(urllib2.HTTPDefaultErrorHandler):
  50.         def http_error_default(self, req, fp, code, msg, headers):
  51.                 #print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' ,  code,  headers['Location']
  52.                 result = urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
  53.                 result.status = code
  54.                 return result

  55. # =====================================================================================================================================

  56. domain = "wallpaper.pconline.com.cn"
  57. #domain = "219.136.245.183:80"
  58. root_url = "http://" + domain + "/"
  59. # 3 is
  60. img_url1 = root_url + "cate_latest/5/$page_number$.html"

  61. xdoc_opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookielib.CookieJar()), SmartRedirectHandler(), DefaultErrorHandler())
  62. xdoc_opener.handle_open["http"][0].set_http_debuglevel(debug_info)
  63. urllib2.install_opener(xdoc_opener)

  64. vfold = "f:/siteimage/"

  65. def getitem(url, xpath):
  66.         global xdoc_opener
  67.         root_request = urllib2.Request(url)
  68.         root_page = xdoc_opener.open(root_request)
  69.         items_list_html = root_page.read()
  70.         # items_list_html = items_list_html.decode("gbk").encode("UTF-8")
  71.         lxml  = HTML.document_fromstring(items_list_html)
  72.         nodes = lxml.getroottree().xpath(xpath)
  73.         return nodes
  74. # =====================================================================================================================================

  75. # every image list html
  76. for page_number in range(1, 15):

  77.         list_url = img_url1.replace("$page_number$", str(page_number))
  78.         list_xpath = "/html/body/div[3]/div/div[2]/ul/li[*]/p/span/a/@href"

  79.         nodes1 = getitem(list_url, list_xpath)
  80.         print str(page_number) + " 1: " + list_url
  81.         for img_url2 in nodes1:

  82.                 # conve to xtree
  83.                 #img_url2 = root_url + "photo/3591.html"
  84.                 single_pageit = "id('scroll')/ul/li/p/a/@href"
  85.                 #single_pageit = "/html/body/div[3]/div/div/div[2]/div[2]/ul"

  86.                 nodes = getitem(img_url2, single_pageit)
  87.                 print "\t\t 2: " + img_url2 + "   " + str(len(nodes)) + r"项"
  88.                 print

  89.                 if (len(nodes) < 1):
  90.                         continue

  91.                 subfold = img_url2.split("/")[-1]
  92.                 if (not os.path.isdir(vfold + subfold)):
  93.                         os.mkdir(vfold + subfold)

  94.                 # every image html
  95.                 for inode in nodes:
  96.                         img_single_pageit = "/html/body/table/tr[2]/td/img/@src"

  97.                         sub_img_url = root_url + inode.replace("../", "")
  98.          
  99.                         inodes = getitem(sub_img_url, img_single_pageit)
  100.                        
  101.                         file_name = inodes[0].split("/")[-1]

  102.                         filelist = os.listdir(vfold + subfold + "/")
  103.                         if (filelist.count(file_name) > 0):
  104.                                 #print "\t\t\t is exits " + file_name + "."
  105.                                 continue

  106.                         print "\t\t\t 3: " + sub_img_url
  107.                         if (len(inodes) != 1):
  108.                                 continue
  109.                         drc_img_url =  urllib2.Request(inodes[0])
  110.                         root_page = xdoc_opener.open(drc_img_url)
  111.                         img = root_page.read()
  112.                         f = file( vfold + subfold + "/" + file_name , "wb")
  113.                         f.write(img)
  114.                         f.close()
  115.                         print "\t\t\t save pic " + vfold + subfold + "/" + file_name

  116.         #break

复制代码

  1. # -*- coding: utf-8 -*-
  2. #coding=utf-8
  3. #vim:fdm=marker:ts=2

  4. import os
  5. from os.path import join, getsize

  6. import re, os, shutil, sys, codecs

  7. streamWriter=codecs.lookup('utf-8')[-1]
  8. #sys.stdout = codecs.getwriter(locale.getdefaultlocale()[1])(sys.stdout, 'replace')
  9. sys.stdout=streamWriter(sys.stdout)
  10. reload(sys)
  11. sys.setdefaultencoding('utf8')
  12. sys.path.append('.')
  13. sys.path.append('..')


  14. import string, time, hashlib, types, logging, json, pdb

  15. #import psyco
  16. #psyco.full()
  17. #psyco.profile()
  18. #psyco.log()
  19. #from psyco.classes import *

  20. #from xml.dom import minidom
  21. #import poplib
  22. #import email

  23. from lxml import etree

  24. from urllib2 import Request, urlopen, URLError, HTTPError
  25. import urllib, urllib2

  26. import decimal, datetime

  27. import socket
  28. import httplib
  29. import StringIO

  30. #import dumper
  31. import cookielib
  32. import lxml.html as HTML

  33. debug_info = 0

  34. socket.setdefaulttimeout(60)
  35. httplib.HTTPConnection.debuglevel = 1

  36. class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
  37.     def http_error_301(self, req, fp, code, msg, headers):
  38.         result = urllib2.HTTPRedirectHandler.http_error_301(
  39.             self, req, fp, code, msg, headers)
  40.         result.status = code
  41.         return result

  42.     def http_error_302(self, req, fp, code, msg, headers):
  43.         # if (req.get_host() == 'container.open.taobao.com'):
  44.         #return
  45.         result = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
  46.         result.status = code
  47.         return result

  48. class DefaultErrorHandler(urllib2.HTTPDefaultErrorHandler):
  49.         def http_error_default(self, req, fp, code, msg, headers):
  50.                 #print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' ,  code,  headers['Location']
  51.                 result = urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
  52.                 result.status = code
  53.                 return result

  54. # =====================================================================================================================================

  55. domain = "wallpaper.pconline.com.cn"
  56. root_url = "http://" + domain + "/"
  57. # 3 is
  58. img_url = root_url + "/cate_latest/5/$page_number$.html"

  59. xdoc_opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookielib.CookieJar()), SmartRedirectHandler(), DefaultErrorHandler())
  60. xdoc_opener.handle_open["http"][0].set_http_debuglevel(debug_info)
  61. urllib2.install_opener(xdoc_opener)

  62. def getitem(url, xpath, xpath1 = "",  xpath2= ""):
  63.         global xdoc_opener
  64.         root_page = xdoc_opener.open(urllib2.Request(url))
  65.         items_list_html = root_page.read()
  66.         # items_list_html = items_list_html.decode("gbk").encode("UTF-8")
  67.         # print items_list_html
  68.         lxml  = HTML.document_fromstring(items_list_html)
  69.         nodes = lxml.getroottree().xpath(xpath)

  70.         if (xpath1 != ""):
  71.                 nodes.extend(lxml.getroottree().xpath(xpath1))
  72.         if (xpath2 != ""):
  73.                 nodes.extend(lxml.getroottree().xpath(xpath2))


  74.         return nodes
  75. # =====================================================================================================================================


  76. domain = "www.qqqse.com"
  77. root_url = "http://" + domain + ""
  78. img_url = root_url + "/html/tupianqu/yazhousetu/list_5_$page_number$.html"

  79. vfold = "f:/x2/"


  80. for page_number in range(1, 60):
  81.         list_url = img_url.replace("$page_number$", str(page_number))
  82.         list_xpath = "/html/body/div[3]/div[2]/table/tr[1]/td/div/li[*]/a/@href"
  83.         nodes1 = getitem(list_url, list_xpath)

  84.         print "1: " + list_url
  85.         if (len(nodes1) < 1):
  86.                 print "no item " + str(len(nodes1))
  87.                 break

  88.         for img_list in nodes1:
  89.                 img_list_xpath2 = "/html/body/div[3]/div[2]/table/tr/td/div/div[2]/table/tr/td/img/@src"
  90.                 img_list_xpath1 = "/html/body/div[3]/div[2]/table/tr/td/div/div[2]/table/tr/td/blockquote/img/@src"
  91.                 img_list_xpath  = "/html/body/div[3]/div[2]/table/tr/td/div/div[2]/table/tr/td/div/div/img/@src"

  92.                 img_nodes = getitem(root_url + img_list, img_list_xpath, img_list_xpath1, img_list_xpath2)
  93.                 if (len(img_nodes) < 1):
  94.                         print "no items 2: " + str(len(img_nodes))
  95.                         break

  96.                 print "\t2: " + img_list + " " + str(len(img_nodes))
  97.                 subfold = img_list.split("/")[-1]
  98.                 if ("41517.html" == subfold):
  99.                         continue
  100.                 if (not os.path.isdir(vfold + subfold)):
  101.                         os.mkdir(vfold + subfold)

  102.                 for uimg in img_nodes:
  103.                         file_name = uimg.split("/")[-1]
  104.                         print "\t\t3: " + uimg

  105.                         filelist = os.listdir(vfold + subfold + "/")
  106.                         if (filelist.count(file_name) > 0):
  107.                                 #print "\t\t\t is exits " + file_name + "."
  108.                                 continue                       
  109.                         try:
  110.                                 root_page = xdoc_opener.open(urllib2.Request(uimg))
  111.                                 img = root_page.read()
  112.                                 f = file(vfold + subfold + "/" + file_name, "wb")
  113.                                 f.write(img)
  114.                                 f.close()
  115.                                 print "\t\t\t save pic " + vfold + subfold + "/" + file_name
  116.                         except :
  117.                                 # print sys.exec_info()
  118.                                 continue
  119.                         #break
  120.                 #break
复制代码

论坛徽章:
0
2 [报告]
发表于 2011-04-28 15:08 |只看该作者
晕, 这个代码不能宽一点么。 伤心。

论坛徽章:
0
3 [报告]
发表于 2011-04-28 16:30 |只看该作者
有什么方法处理html中的javascript代码?有些网站是将图片地址放在javascript中的
您需要登录后才可以回帖 登录 | 注册

本版积分规则 发表回复

  

北京盛拓优讯信息技术有限公司. 版权所有 京ICP备16024965号-6 北京市公安局海淀分局网监中心备案编号:11010802020122 niuxiaotong@pcpop.com 17352615567
未成年举报专区
中国互联网协会会员  联系我们:huangweiwei@itpub.net
感谢所有关心和支持过ChinaUnix的朋友们 转载本站内容请注明原作者名及出处

清除 Cookies - ChinaUnix - Archiver - WAP - TOP