- 论坛徽章:
- 0
|
通过正则表达式提取百度搜索结果
package test;
import java.io.*;
import java.util.*;
import java.text.*;
import java.net.*;
import java.util.regex.*;
/**
* 提取百度搜索结果标题链接摘要
* @author 静止的流水
*
*/
public class GetBaiduInfor
{
/** Finds the charset name inside a Content-Type value such as "text/html; charset=gb2312". */
private static final Pattern CHARSET_IN_CONTENT_TYPE =
        Pattern.compile("charset\\s*=\\s*\"?([^\\s;\"]+)", Pattern.CASE_INSENSITIVE);

/**
 * Downloads the page at the given URL and returns its source as text.
 *
 * <p>The body is decoded with the charset declared in the response's
 * {@code Content-Type} header. When no charset is declared, or the declared
 * one is not supported by this JVM, the platform default charset is used.
 * Every line read is appended to the result followed by {@code "\n"}.
 *
 * @param urlString absolute HTTP or HTTPS URL of the page to fetch
 * @return the page source, or {@code null} if the URL is invalid or the
 *         download fails (the stack trace is printed to standard error)
 */
public static String getHtml(String urlString) {
    HttpURLConnection conn = null;
    BufferedReader br = null;
    try {
        URL url = new URL(urlString);
        conn = (HttpURLConnection) url.openConnection();
        InputStream in = conn.getInputStream();
        br = new BufferedReader(newReader(in, conn.getContentType()));
        StringBuilder html = new StringBuilder();
        String line;
        while ((line = br.readLine()) != null) {
            html.append(line).append("\n");
        }
        return html.toString();
    } catch (Exception e) {
        // Callers rely on null to signal failure, so report and keep that contract.
        e.printStackTrace();
        return null;
    } finally {
        // Release the stream and the connection on every path, including failures mid-read.
        if (br != null) {
            try {
                br.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if closing fails.
            }
        }
        if (conn != null) {
            conn.disconnect();
        }
    }
}

/**
 * Wraps {@code in} in a reader that uses the charset named in {@code contentType},
 * falling back to the platform default when none is named or the name is unsupported.
 */
private static Reader newReader(InputStream in, String contentType) {
    if (contentType != null) {
        Matcher m = CHARSET_IN_CONTENT_TYPE.matcher(contentType);
        if (m.find()) {
            try {
                return new InputStreamReader(in, m.group(1));
            } catch (UnsupportedEncodingException unsupported) {
                // Unknown or malformed charset name: use the default below.
            }
        }
    }
    return new InputStreamReader(in);
}
/**
 * Replaces every match of a regular expression in a text and prints the outcome.
 *
 * <p>Two lines are written to standard output: the text after replacement,
 * then the number of matches that were replaced.
 *
 * @param expression regular expression to search for
 * @param text       text to search in
 * @param str        replacement for each match; it is passed to
 *                   {@link Matcher#appendReplacement}, so {@code $n} group
 *                   references in it are expanded
 */
private static void findText(String expression, String text, String str) {
    Matcher matcher = Pattern.compile(expression).matcher(text);
    StringBuffer replaced = new StringBuffer();
    int matchCount = 0;
    for (; matcher.find(); matchCount++) {
        matcher.appendReplacement(replaced, str);
    }
    // Copy whatever follows the last match (or the whole text when nothing matched).
    matcher.appendTail(replaced);
    System.out.println(replaced);
    System.out.println(matchCount);
}
/**
 * Extracts the title, link and summary of Baidu search results.
 * NOTE(review): this method is truncated in this copy of the file. The
 * regular-expression literal below is cut off mid-string and the rest of the
 * body, including the closing brace, is missing. Presumably the HTML tags
 * inside the pattern were stripped when the page was scraped; restore the
 * method from the original source before compiling.
 * @param html page source to parse
 */
public static void baiduparser(String html)
{
Pattern pattern = Pattern.compile("([\\s\\S]+?)([\\s\\S]+?)
/**
* 主函数
* @param argc
* @throws Exception
*/
public static void main(String[] argc)throws Exception
{
System.err.println("Now Let's Go!");
jiecheng.baiduparser(jiecheng.getHtml("
http://www.baidu.com/s?wd
=中国"));
System.err.println("Good bye!");
}
}
本文来自ChinaUnix博客,如果查看原文请点:http://blog.chinaunix.net/u2/80678/showart_2032579.html |
|