- 论坛徽章:
- 0
|
本帖最后由 whosy 于 2012-05-15 17:50 编辑
回复 2# anonymous0502
对啊,直接在IDLE中填一个字符串来测试是没有问题的,但是一旦到程序中就有问题了。
把我写的程序贴出来一部分吧:- import re
- import http.client
- import logging
- import os
- from os.path import join
- Error = (
- {
- 0 : "ok" ,
- 1 : "error" ,
- 2 : "connection failed" ,
- 3 : "decode failed" ,
- 4 : "file operate error"
- }
- )
- class HttpDownload:
- def __init__(self, hostname = r"127.0.0.1"):
- self.hostname = hostname
- #创建正则表达式对象,用来抽取网页中的代码
- self.Rex1 = re.compile(r'(?<=sourcefirst).+(?=sourcelast)',re.DOTALL)
- self.Rex2 = re.compile(r'(?<=</a>)[^<>]+(?=</div>)', re.IGNORECASE)
-
- def Start(self, HttpPath, DstPath):
- if os.path.exists(DstPath):
- return 0
- #连接服务器
- conn = http.client.HTTPConnection(self.hostname)
-
- #将路径中的空格替换为%20编码
- path = HttpPath.replace(r" ",r"%20")
- print("path = " + path)
- conn.request("GET",path)
- response = conn.getresponse()
- #若URL请求不成功,则返回错误
- print("response.status = " , response.status)
- if response.status != 200:
- conn.close()
- return 2
-
- content = response.read()
- conn.close()
- try:
- content = content.decode("utf-8")
- encode = "utf-8"
- except UnicodeDecodeError:
- content = content.decode("latin_1")
- encode = "latin_1"
- except Exception:
- return 3
-
- string = str(content)
- #第一次匹配
- match = self.Rex1.search(string)
- if not match:
- return 0
-
- string = str(match.group())
- #替换字符
- #string = string.replace("<","<")#就这两行出现问题
- #string = string.replace(">",">")
- #第二次匹配
- codes = self.Rex2.findall(string)
- if not codes:
- print("No codes")
- #保存文件
- if DstPath:
- fd = open(file = DstPath, mode = 'w', encoding = encode)
- for i in codes:
- fd.write(i)
- fd.write('\r\n')
- fd.close()
- return 0
复制代码 求教! |
|