- 论坛徽章:
- 0
|
请教下有没有更智能、更一般化的方法。
- # -*- coding: utf-8 -*-
- #coding=utf-8
- #vim:fdm=marker:ts=2
- import os
- from os.path import join, getsize
- import re, os, shutil, sys, codecs
- streamWriter=codecs.lookup('utf-8')[-1]
- #sys.stdout = codecs.getwriter(locale.getdefaultlocale()[1])(sys.stdout, 'replace')
- sys.stdout=streamWriter(sys.stdout)
- reload(sys)
- sys.setdefaultencoding('utf8')
- sys.path.append('.')
- sys.path.append('..')
- import string, time, hashlib, types, logging, json, pdb
- #import psyco
- #psyco.full()
- #psyco.profile()
- #psyco.log()
- #from psyco.classes import *
- #from xml.dom import minidom
- #import poplib
- #import email
- from lxml import etree
- from urllib2 import Request, urlopen, URLError, HTTPError
- import urllib, urllib2
- import decimal, datetime
- import socket
- import httplib
- import StringIO
- #import dumper
- import cookielib
- import lxml.html as HTML
- debug_info = 0  # debug level handed to the opener's HTTP handler via set_http_debuglevel() further down
- import socket  # NOTE(review): socket is already imported a few lines above; this repeat is harmless
- socket.setdefaulttimeout(60)  # default timeout (seconds) for sockets created after this point
- httplib.HTTPConnection.debuglevel = 1  # class-level httplib debug level; NOTE(review): differs from debug_info (0) - confirm which one is intended
class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
    """Redirect handler that remembers which redirect code was followed.

    The base class follows 301/302 redirects and hands back the response of
    the redirected request.  This subclass additionally stores the redirect
    code on that response as a ``status`` attribute.
    """

    def http_error_301(self, req, fp, code, msg, headers):
        """Follow a 301 redirect and tag the resulting response.

        Returns the object produced by the base handler, or ``None`` when the
        base handler produced nothing.
        """
        result = urllib2.HTTPRedirectHandler.http_error_301(
            self, req, fp, code, msg, headers)
        # The base handler returns None when the response carries neither a
        # Location nor a URI header; there is nothing to tag in that case.
        if result is not None:
            result.status = code
        return result

    def http_error_302(self, req, fp, code, msg, headers):
        """Follow a 302 redirect and tag the resulting response.

        Returns the object produced by the base handler, or ``None`` when the
        base handler produced nothing.
        """
        result = urllib2.HTTPRedirectHandler.http_error_302(
            self, req, fp, code, msg, headers)
        # Same guard as for 301: a redirect without a target yields None.
        if result is not None:
            result.status = code
        return result
- class DefaultErrorHandler(urllib2.HTTPDefaultErrorHandler):
- def http_error_default(self, req, fp, code, msg, headers):
- #print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' , code, headers['Location']
- result = urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
- result.status = code
- return result
- # =====================================================================================================================================
# Site to crawl.
domain = "wallpaper.pconline.com.cn"
# Alternative host left disabled by the author:
#domain = "219.136.245.183:80"
root_url = "http://" + domain + "/"

# Listing-page template; "$page_number$" is replaced with the page index.
# NOTE(review): the meaning of the "5" path segment is not documented here.
img_url1 = root_url + "cate_latest/5/$page_number$.html"

# Shared opener: keeps cookies in an in-memory jar and plugs in the custom
# redirect / error handlers defined in this script.
xdoc_opener = urllib2.build_opener(
    urllib2.HTTPCookieProcessor(cookielib.CookieJar()),
    SmartRedirectHandler(),
    DefaultErrorHandler())
xdoc_opener.handle_open["http"][0].set_http_debuglevel(debug_info)
urllib2.install_opener(xdoc_opener)

# Base folder that receives one sub-folder per gallery.
vfold = "f:/siteimage/"
def getitem(url, xpath):
    """Download ``url`` and return what ``xpath`` selects in the page.

    The page is fetched through the module-level ``xdoc_opener``, parsed with
    ``lxml.html`` and queried with the given XPath expression.

    Args:
        url: Address of the HTML page to download.
        xpath: XPath expression evaluated against the parsed document tree.

    Returns:
        The result of the XPath evaluation; the callers in this script pass
        attribute selectors and treat the result as a list of strings.
    """
    response = xdoc_opener.open(urllib2.Request(url))
    try:
        page_html = response.read()
    finally:
        # Release the connection even if reading fails part-way; the crawl
        # issues many requests and would otherwise leak one handle per call.
        response.close()
    # The raw bytes are handed to lxml unchanged (no manual re-encoding).
    document = HTML.document_fromstring(page_html)
    return document.getroottree().xpath(xpath)
- # =====================================================================================================================================
- # every image list html
# Crawl order: listing pages -> gallery pages -> single-picture pages -> file.
# Single-argument print(...) is used; on Python 2 it behaves exactly like the
# print statement.
for page_number in range(1, 15):
    list_url = img_url1.replace("$page_number$", str(page_number))
    # Links to the galleries shown on this listing page.
    list_xpath = "/html/body/div[3]/div/div[2]/ul/li[*]/p/span/a/@href"
    nodes1 = getitem(list_url, list_xpath)
    print(str(page_number) + " 1: " + list_url)

    for img_url2 in nodes1:
        # Links to the individual picture pages of this gallery.
        single_pageit = "id('scroll')/ul/li/p/a/@href"
        nodes = getitem(img_url2, single_pageit)
        print("\t\t 2: " + img_url2 + " " + str(len(nodes)) + r"项")
        print("")
        if not nodes:
            continue

        # One folder per gallery, named after the last URL segment.
        subfold = img_url2.split("/")[-1]
        target_dir = vfold + subfold
        if not os.path.isdir(target_dir):
            # makedirs also creates the base folder when it is missing.
            os.makedirs(target_dir)
        # List the folder once per gallery instead of once per picture;
        # the set is kept in sync as new files are written.
        saved_names = set(os.listdir(target_dir + "/"))

        for inode in nodes:
            img_single_pageit = "/html/body/table/tr[2]/td/img/@src"
            sub_img_url = root_url + inode.replace("../", "")
            inodes = getitem(sub_img_url, img_single_pageit)
            if not inodes:
                # No picture found on the page: skip it.  Indexing the empty
                # result here used to abort the whole crawl with IndexError.
                continue

            file_name = inodes[0].split("/")[-1]
            if file_name in saved_names:
                # Already downloaded on an earlier run.
                continue
            print("\t\t\t 3: " + sub_img_url)
            if len(inodes) != 1:
                # Ambiguous page (several candidate images): skip it.
                continue

            root_page = xdoc_opener.open(urllib2.Request(inodes[0]))
            try:
                img = root_page.read()
            finally:
                root_page.close()

            file_path = target_dir + "/" + file_name
            # "with" closes the file even if the write fails.
            with open(file_path, "wb") as f:
                f.write(img)
            saved_names.add(file_name)
            print("\t\t\t save pic " + file_path)
复制代码
- # -*- coding: utf-8 -*-
- #coding=utf-8
- #vim:fdm=marker:ts=2
- import os
- from os.path import join, getsize
- import re, os, shutil, sys, codecs
- streamWriter=codecs.lookup('utf-8')[-1]
- #sys.stdout = codecs.getwriter(locale.getdefaultlocale()[1])(sys.stdout, 'replace')
- sys.stdout=streamWriter(sys.stdout)
- reload(sys)
- sys.setdefaultencoding('utf8')
- sys.path.append('.')
- sys.path.append('..')
- import string, time, hashlib, types, logging, json, pdb
- #import psyco
- #psyco.full()
- #psyco.profile()
- #psyco.log()
- #from psyco.classes import *
- #from xml.dom import minidom
- #import poplib
- #import email
- from lxml import etree
- from urllib2 import Request, urlopen, URLError, HTTPError
- import urllib, urllib2
- import decimal, datetime
- import socket
- import httplib
- import StringIO
- #import dumper
- import cookielib
- import lxml.html as HTML
- debug_info = 0  # debug level handed to the opener's HTTP handler via set_http_debuglevel() further down
- socket.setdefaulttimeout(60)  # default timeout (seconds) for sockets created after this point
- httplib.HTTPConnection.debuglevel = 1  # class-level httplib debug level; NOTE(review): differs from debug_info (0) - confirm which one is intended
class SmartRedirectHandler(urllib2.HTTPRedirectHandler):
    """Redirect handler that exposes the redirect code to the caller.

    After the base class has followed a 301/302 redirect, the response of the
    redirected request gets a ``status`` attribute holding the redirect code.
    """

    def http_error_301(self, req, fp, code, msg, headers):
        """Handle a 301 response; return the tagged result or ``None``."""
        result = urllib2.HTTPRedirectHandler.http_error_301(
            self, req, fp, code, msg, headers)
        # Without a Location/URI header the base handler returns None, and
        # setting an attribute on None would raise AttributeError.
        if result is None:
            return None
        result.status = code
        return result

    def http_error_302(self, req, fp, code, msg, headers):
        """Handle a 302 response; return the tagged result or ``None``."""
        result = urllib2.HTTPRedirectHandler.http_error_302(
            self, req, fp, code, msg, headers)
        # Same guard as in http_error_301.
        if result is None:
            return None
        result.status = code
        return result
- class DefaultErrorHandler(urllib2.HTTPDefaultErrorHandler):
- def http_error_default(self, req, fp, code, msg, headers):
- #print 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' , code, headers['Location']
- result = urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)
- result.status = code
- return result
- # =====================================================================================================================================
# Initial site settings.
# NOTE: domain, root_url and img_url are assigned again further down in this
# script, right before the crawl loop.
domain = "wallpaper.pconline.com.cn"
root_url = "http://" + domain + "/"
# Listing-page template; "$page_number$" is a placeholder for the page index.
img_url = root_url + "/cate_latest/5/$page_number$.html"

# Shared opener: in-memory cookie jar plus the custom redirect / error
# handlers defined in this script.
xdoc_opener = urllib2.build_opener(
    urllib2.HTTPCookieProcessor(cookielib.CookieJar()),
    SmartRedirectHandler(),
    DefaultErrorHandler())
xdoc_opener.handle_open["http"][0].set_http_debuglevel(debug_info)
urllib2.install_opener(xdoc_opener)
def getitem(url, xpath, xpath1="", xpath2=""):
    """Download ``url`` and collect the matches of up to three XPath queries.

    The page is fetched through the module-level ``xdoc_opener`` and parsed
    with ``lxml.html``.  ``xpath`` is always evaluated; ``xpath1`` and
    ``xpath2`` are evaluated only when they are not the empty string, and
    their matches are appended in that order.

    Args:
        url: Address of the HTML page to download.
        xpath: Primary XPath expression.
        xpath1: Optional second expression ("" to skip).
        xpath2: Optional third expression ("" to skip).

    Returns:
        The combined matches of all evaluated expressions.
    """
    response = xdoc_opener.open(urllib2.Request(url))
    try:
        page_html = response.read()
    finally:
        # Release the connection even if reading fails part-way; the crawl
        # issues many requests and would otherwise leak one handle per call.
        response.close()

    tree = HTML.document_fromstring(page_html).getroottree()
    nodes = tree.xpath(xpath)
    for extra_xpath in (xpath1, xpath2):
        if extra_xpath != "":
            nodes.extend(tree.xpath(extra_xpath))
    return nodes
- # =====================================================================================================================================
# Site-specific settings for this crawl (they replace the values set earlier).
domain = "www.qqqse.com"
root_url = "http://" + domain + ""
img_url = root_url + "/html/tupianqu/yazhousetu/list_5_$page_number$.html"
vfold = "f:/x2/"

# Crawl order: listing pages -> gallery pages -> image files.
# Single-argument print(...) is used; on Python 2 it behaves exactly like the
# print statement.
for page_number in range(1, 60):
    list_url = img_url.replace("$page_number$", str(page_number))
    list_xpath = "/html/body/div[3]/div[2]/table/tr[1]/td/div/li[*]/a/@href"
    nodes1 = getitem(list_url, list_xpath)
    print("1: " + list_url)
    if not nodes1:
        # An empty listing page is treated as the end of the pagination.
        print("no item " + str(len(nodes1)))
        break

    for img_list in nodes1:
        # The images sit in one of three slightly different page layouts,
        # so all three locations are queried and the matches combined.
        img_list_xpath2 = "/html/body/div[3]/div[2]/table/tr/td/div/div[2]/table/tr/td/img/@src"
        img_list_xpath1 = "/html/body/div[3]/div[2]/table/tr/td/div/div[2]/table/tr/td/blockquote/img/@src"
        img_list_xpath = "/html/body/div[3]/div[2]/table/tr/td/div/div[2]/table/tr/td/div/div/img/@src"
        img_nodes = getitem(root_url + img_list, img_list_xpath, img_list_xpath1, img_list_xpath2)
        if not img_nodes:
            print("no items 2: " + str(len(img_nodes)))
            # NOTE(review): this abandons the remaining galleries of the
            # current listing page; "continue" may have been intended.
            break
        print("\t2: " + img_list + " " + str(len(img_nodes)))

        subfold = img_list.split("/")[-1]
        if "41517.html" == subfold:
            # Gallery explicitly excluded by the author.
            continue
        target_dir = vfold + subfold
        if not os.path.isdir(target_dir):
            os.mkdir(target_dir)
        # List the folder once per gallery instead of once per image;
        # the set is kept in sync as new files are written.
        saved_names = set(os.listdir(target_dir + "/"))

        for uimg in img_nodes:
            file_name = uimg.split("/")[-1]
            print("\t\t3: " + uimg)
            if file_name in saved_names:
                # Already downloaded on an earlier run.
                continue
            file_path = target_dir + "/" + file_name
            try:
                root_page = xdoc_opener.open(urllib2.Request(uimg))
                try:
                    img = root_page.read()
                finally:
                    root_page.close()
                # "with" closes the file even if the write fails.
                with open(file_path, "wb") as f:
                    f.write(img)
            except Exception as exc:
                # Best effort: one broken image must not stop the crawl, but
                # the failure is reported instead of being silently dropped.
                # Unlike a bare "except:", this lets Ctrl-C through.
                print("\t\t\t failed " + uimg + ": " + str(exc))
                continue
            saved_names.add(file_name)
            print("\t\t\t save pic " + file_path)
复制代码 |
|