论坛徽章: 0

电梯直达

1楼 [收藏(0)] [报告]

发表于 2009-08-20 10:45 |只看该作者 |倒序浏览

通过正则表达式提取百度搜索结果

package test;

import java.io.*;
import java.util.*;
import java.text.*;
import java.net.*;
import java.util.regex.*;
/**
* 提取百度搜索结果标题链接摘要
* @author 静止的流水
*
*/
public class GetBaiduInfor
{
/**
 * Downloads the page at the given URL and returns its source as text.
 *
 * <p>The response body is decoded with the charset declared in the
 * Content-Type header; when none is declared, or the declared one is not
 * supported by this JVM, UTF-8 is used. Every line read is terminated with
 * a single "\n" in the returned string.
 *
 * @param urlString URL to fetch; must use a protocol handled by HttpURLConnection
 * @return the page source, or {@code null} if the URL is invalid or the
 *         download fails (the stack trace is printed in that case)
 */
public static String getHtml(String urlString) {
   HttpURLConnection conn = null;
   BufferedReader reader = null;
   try {
      conn = (HttpURLConnection) new URL(urlString).openConnection();
      InputStream in = conn.getInputStream();
      // Decode with the server-declared charset rather than the platform default.
      String charset = charsetOf(conn.getContentType());
      reader = new BufferedReader(new InputStreamReader(in, charset));
      StringBuffer html = new StringBuffer();
      String line;
      while ((line = reader.readLine()) != null) {
         html.append(line).append("\n");
      }
      return html.toString();
   } catch (Exception e) {
      e.printStackTrace();
      return null;
   } finally {
      // Release the stream and the connection even when reading failed part-way.
      if (reader != null) {
         try {
            reader.close();
         } catch (IOException ignored) {
            // Nothing useful can be done if closing fails.
         }
      }
      if (conn != null) {
         conn.disconnect();
      }
   }
}

/**
 * Picks the charset name out of a Content-Type header value.
 *
 * @param contentType header value such as "text/html; charset=gb2312"; may be null
 * @return the declared charset name if this JVM supports it, otherwise "UTF-8"
 */
private static String charsetOf(String contentType) {
   final String fallback = "UTF-8";
   if (contentType == null) {
      return fallback;
   }
   Matcher m = Pattern
         .compile("charset\\s*=\\s*\"?([^\\s;\"]+)", Pattern.CASE_INSENSITIVE)
         .matcher(contentType);
   if (!m.find()) {
      return fallback;
   }
   String declared = m.group(1);
   try {
      return java.nio.charset.Charset.isSupported(declared) ? declared : fallback;
   } catch (IllegalArgumentException e) {
      // The header carried a syntactically illegal charset name.
      return fallback;
   }
}
   /**
    * Replaces every match of a regular expression in a text and prints the outcome.
    *
    * <p>Two lines are written to standard output: the text after all
    * replacements, then the number of matches that were found.
    *
    * @param expression regular expression to search for
    * @param text       string to search in
    * @param str        replacement inserted for each match; it is handed to
    *                   {@code Matcher.appendReplacement}, so group references
    *                   such as {@code $1} are interpreted
    */
   private static void findText(String expression, String text, String str) {
      Matcher matcher = Pattern.compile(expression).matcher(text);
      StringBuffer replaced = new StringBuffer();
      int matchCount = 0;
      for (; matcher.find(); matchCount++) {
         matcher.appendReplacement(replaced, str);
      }
      // Copy whatever follows the last match.
      matcher.appendTail(replaced);
      System.out.println(replaced);
      System.out.println(matchCount);
   }
   /**
   * Extracts the title, link and summary of the Baidu search results.
   *
   * NOTE(review): this method is truncated in this copy of the file. The
   * string literal passed to Pattern.compile is never terminated and the
   * method body is never closed, so the class does not compile as shown.
   * The original regular expression and the rest of the body must be
   * recovered from the original post before this can be used.
   *
   * @param html page source to parse
   */
   public static void baiduparser(String html)
   {
   Pattern pattern = Pattern.compile("([\\s\\S]+?)([\\s\\S]+?)
/**
  * 主函数
  * @param argc
  * @throws Exception
  */

public  static void  main(String[] argc)throws Exception
{
  System.err.println("Now Let's Go!");
  jiecheng.baiduparser(jiecheng.getHtml("
http://www.baidu.com/s?wd
=中国"));
  System.err.println("Good bye!");
}
}

本文来自ChinaUnix博客，如果查看原文请点：http://blog.chinaunix.net/u2/80678/showart_2032579.html

文库|博客

返回列表

Chinaunix › 论坛 › 程序设计 › Java › Java文档中心 › 提取百度搜索结果

提取百度搜索结果 [复制链接]