- 论坛徽章:
- 0
|
package test;
import java.io.*;
import java.net.*;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.text.*;
import java.util.*;
import java.util.regex.*;
/**
* Extracts the title, link and summary of each Baidu search result.
* @author 静止的流水
*
*/
public class GetBaiduInfor
{
    /** Finds the {@code charset=...} parameter inside a Content-Type header value. */
    private static final Pattern CHARSET_PARAM =
            Pattern.compile("charset\\s*=\\s*\"?([^\\s;\"]+)", Pattern.CASE_INSENSITIVE);

    /**
     * Matches one search result: group 1 = link target, group 2 = title markup,
     * group 3 = description markup.
     *
     * NOTE(review): the original pattern literal was destroyed when this file was
     * extracted from a web page (its HTML tags were stripped, leaving an unterminated
     * string). This is a reconstruction that assumes each result is rendered as
     * {@code <h3><a href="...">title</a></h3>} followed by a
     * {@code <div class="... c-abstract ...">summary</div>} -- confirm against the
     * markup Baidu actually serves before relying on it.
     */
    private static final Pattern RESULT_PATTERN = Pattern.compile(
            "<h3[^>]*>\\s*<a\\b[^>]*?\\shref\\s*=\\s*\"([^\"]*)\"[^>]*>([\\s\\S]+?)</a>\\s*</h3>"
            + "[\\s\\S]*?<div[^>]*class=\"[^\"]*c-abstract[^\"]*\"[^>]*>([\\s\\S]+?)</div>");

    /**
     * Downloads the source of a page.
     *
     * The body is decoded with the charset announced in the response's Content-Type
     * header, falling back to UTF-8 when none is announced or it is not supported.
     * Every line read is terminated with a single {@code "\n"} in the result.
     *
     * @param urlString address of the page; must be an HTTP(S) URL
     * @return the page source, or {@code null} if the download failed for any reason
     *         (the exception's stack trace is printed to standard error)
     */
    public String getHtml(String urlString) {
        HttpURLConnection conn = null;
        try {
            URL url = new URL(urlString);
            conn = (HttpURLConnection) url.openConnection();
            StringBuilder html = new StringBuilder();
            // try-with-resources closes the reader (and the underlying stream) even
            // when reading fails part-way through.
            try (BufferedReader br = new BufferedReader(new InputStreamReader(
                    conn.getInputStream(), charsetOf(conn.getContentType())))) {
                String line;
                while ((line = br.readLine()) != null) {
                    html.append(line).append("\n");
                }
            }
            return html.toString();
        } catch (Exception e) {
            // Deliberately broad: callers rely on "null means failure", including for
            // malformed URLs and non-HTTP URLs (which fail the cast above).
            e.printStackTrace();
            return null;
        } finally {
            if (conn != null) {
                conn.disconnect();
            }
        }
    }

    /**
     * Picks the charset named in a Content-Type header value.
     *
     * @param contentType header value, may be {@code null}
     * @return the named charset, or UTF-8 if none is named or the name is not usable
     */
    private static Charset charsetOf(String contentType) {
        if (contentType != null) {
            Matcher m = CHARSET_PARAM.matcher(contentType);
            if (m.find()) {
                try {
                    return Charset.forName(m.group(1));
                } catch (IllegalArgumentException ignored) {
                    // Illegal or unsupported charset name: use the default below.
                }
            }
        }
        return StandardCharsets.UTF_8;
    }

    /**
     * Replaces every match of a regular expression in a text and prints the outcome.
     *
     * Prints the text after replacement, then the number of matches that were
     * replaced, each on its own line of standard output.
     *
     * @param expression regular expression to search for
     * @param text       text to search in
     * @param str        replacement for each match; it is passed to
     *                   {@link Matcher#appendReplacement}, so {@code $n} group
     *                   references and backslash escapes are interpreted
     */
    private void findText(String expression, String text, String str) {
        Pattern p = Pattern.compile(expression);
        Matcher m = p.matcher(text);
        StringBuffer sb = new StringBuffer();
        int count = 0;
        while (m.find()) {
            m.appendReplacement(sb, str);
            count++;
        }
        m.appendTail(sb);
        System.out.println(sb.toString());
        System.out.println(count);
    }

    /**
     * Extracts link, title and description of every search result found in a page
     * and prints them to standard output.
     *
     * For each result it prints a running number (starting at 1), the link as
     * {@code "href = ..."}, and the title and description reduced to their Chinese
     * characters by {@link #china(String)}. After the last result it prints the
     * number of capturing groups of the result pattern.
     *
     * @param html page source to scan; when {@code null} (for example because
     *             {@link #getHtml(String)} failed) nothing is parsed and a note is
     *             written to standard error
     */
    public void baiduparser(String html)
    {
        if (html == null) {
            System.err.println("No HTML to parse");
            return;
        }
        Matcher m = RESULT_PATTERN.matcher(html);
        int i = 1;
        while (m.find()) {
            System.out.println(i);
            System.out.println("href = " + m.group(1));
            System.out.println("Title = " + china(m.group(2)));
            System.out.println("Description = " + china(m.group(3)));
            i++;
        }
        System.out.println("m.groupCount = " + m.groupCount());
    }

    /**
     * Keeps only the Chinese characters of a string.
     *
     * A character is kept when it lies in the range U+4E00 to U+9FA5 inclusive;
     * everything else (markup, Latin letters, digits, punctuation, whitespace) is
     * dropped. The relative order of the kept characters is preserved.
     *
     * @param s input text; {@code null} is treated like an empty string
     * @return the kept characters, possibly an empty string, never {@code null}
     */
    public String china(String s)
    {
        if (s == null) {
            return "";
        }
        StringBuilder sb = new StringBuilder(s.length());
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            if (c >= '\u4e00' && c <= '\u9fa5') {
                sb.append(c);
            }
        }
        return sb.toString();
    }

    /**
     * Entry point: searches Baidu for the two-character word U+4E2D U+56FD and prints
     * the parsed results.
     *
     * @param argc command-line arguments (unused)
     * @throws Exception if the query cannot be encoded
     */
    public static void main(String[] argc) throws Exception
    {
        System.err.println("Now Let's Go!");
        GetBaiduInfor getbaiduinfor = new GetBaiduInfor();
        // Non-ASCII characters are not valid in a URL query; percent-encode them.
        String query = URLEncoder.encode("\u4e2d\u56fd", "UTF-8");
        getbaiduinfor.baiduparser(getbaiduinfor.getHtml("http://www.baidu.com/s?wd=" + query));
        System.err.println("Good bye!");
    }
}
本文来自ChinaUnix博客,如果查看原文请点:http://blog.chinaunix.net/u3/104536/showart_2064777.html |
|