论坛徽章: 0

电梯直达

1楼 [收藏(0)] [报告]

发表于 2009-10-06 12:58 |只看该作者 |倒序浏览

package test;
import java.io.*;
import java.util.*;
import java.text.*;
import java.net.*;
import java.util.regex.*;
/**
 * Downloads a Baidu search result page and prints the link, title and
 * summary of each result it can find with a regular expression.
 *
 * <p>Titles and summaries are reduced to their Chinese characters before
 * printing (see {@link #china(String)}).
 *
 * @author 静止的流水
 */
public class GetBaiduInfor
{
   /**
    * Pattern with three capture groups: 1 = href value, 2 = anchor content
    * (title), 3 = text between the closing anchor tag and the next
    * {@code <br>} (summary).
    *
    * NOTE(review): the pattern in the published listing was destroyed when
    * its HTML tags were stripped; only the capture groups survived. This is a
    * best-effort reconstruction that matches any double-quoted-href anchor
    * followed by a {@code <br>}, so it may also match non-result links.
    * TODO confirm against the markup of the page actually being parsed.
    */
   private static final Pattern RESULT_PATTERN = Pattern.compile(
         "<a\\s[^>]*?href=\"([\\s\\S]+?)\"[^>]*>([\\s\\S]+?)</a>([\\s\\S]+?)<br>");

   /** Extracts the charset parameter from a Content-Type header value. */
   private static final Pattern CHARSET_PARAM = Pattern.compile(
         "charset\\s*=\\s*\"?([^\\s;\"]+)", Pattern.CASE_INSENSITIVE);

   /**
    * Downloads the source of a page.
    *
    * <p>The body is decoded with the charset named in the response's
    * Content-Type header; when none is declared (or it is not supported) the
    * platform default charset is used.
    *
    * @param urlString absolute HTTP URL of the page to download
    * @return the page source with every line terminated by {@code "\n"}, or
    *         {@code null} if the download failed for any reason (the stack
    *         trace is printed to standard error)
    */
   public String getHtml(String urlString) {
      HttpURLConnection conn = null;
      InputStream in = null;
      try {
         URL url = new URL(urlString);
         conn = (HttpURLConnection) url.openConnection();
         in = conn.getInputStream();
         BufferedReader br = new BufferedReader(
               new InputStreamReader(in, responseCharset(conn.getContentType())));
         StringBuilder html = new StringBuilder();
         String line;
         while ((line = br.readLine()) != null) {
            html.append(line).append("\n");
         }
         return html.toString();
      } catch (Exception e) {
         e.printStackTrace();
         return null;
      } finally {
         // Release the stream and the connection on every path, including failures.
         if (in != null) {
            try {
               in.close();
            } catch (IOException ignored) {
               // Nothing useful can be done if closing fails.
            }
         }
         if (conn != null) {
            conn.disconnect();
         }
      }
   }

   /**
    * Picks the charset used to decode a response body.
    *
    * @param contentType value of the Content-Type header, may be {@code null}
    * @return the declared charset, or the platform default when the header is
    *         missing, has no charset parameter, or names an unknown charset
    */
   private static java.nio.charset.Charset responseCharset(String contentType) {
      if (contentType != null) {
         Matcher m = CHARSET_PARAM.matcher(contentType);
         if (m.find()) {
            try {
               return java.nio.charset.Charset.forName(m.group(1));
            } catch (IllegalArgumentException ignored) {
               // Malformed or unsupported charset name: fall back to the default.
            }
         }
      }
      return java.nio.charset.Charset.defaultCharset();
   }

   /**
    * Replaces every match of a regular expression in a text, then prints the
    * resulting text followed by the number of replacements made.
    *
    * @param expression regular expression to search for
    * @param text text to search in
    * @param str replacement for each match; {@code $} and {@code \} have
    *        their usual {@link Matcher#appendReplacement} meaning
    */
   private void findText(String expression, String text, String str) {
      Pattern p = Pattern.compile(expression);
      Matcher m = p.matcher(text);
      // StringBuffer is kept because appendReplacement only accepts StringBuilder from Java 9 on.
      StringBuffer sb = new StringBuffer();
      int count = 0;
      while (m.find()) {
         m.appendReplacement(sb, str);
         count++;
      }
      m.appendTail(sb);
      System.out.println(sb.toString());
      System.out.println(count);
   }

   /**
    * Prints the link, title and summary of every result found in a page.
    *
    * <p>For each match the running index, {@code href = ...},
    * {@code Title = ...} and {@code Description = ...} are printed to standard
    * output, followed once by {@code m.groupCount = ...}. Title and
    * description are filtered through {@link #china(String)}.
    *
    * @param html page source; {@code null} (a failed download) is treated as
    *        an empty page
    */
   public void baiduparser(String html)
   {
      // getHtml returns null on failure; do not let that crash the matcher.
      Matcher m = RESULT_PATTERN.matcher(html == null ? "" : html);

      int i = 1;
      while (m.find())
      {
         System.out.println(i);
         System.out.println("href = " + m.group(1));
         System.out.println("Title = " + china(m.group(2)));
         System.out.println("Description = " + china(m.group(3)));
         i++;
      }
      System.out.println("m.groupCount = " + m.groupCount());
   }

   /**
    * Keeps only the Chinese characters of a string.
    *
    * <p>A character is kept when it lies in the range U+4E00 to U+9FA5
    * inclusive; everything else (markup, Latin letters, digits, punctuation)
    * is dropped.
    *
    * @param s input string, may be {@code null}
    * @return the kept characters in their original order; empty when
    *         {@code s} is {@code null} or contains none
    */
   public String china(String s)
   {
      StringBuilder sb = new StringBuilder();
      if (s == null) {
         return sb.toString();
      }
      for (int i = 0; i < s.length(); i++)
      {
         char c = s.charAt(i);
         if (c >= '\u4e00' && c <= '\u9fa5')
         {
            sb.append(c);
         }
      }
      return sb.toString();
   }

   /**
    * Entry point: searches Baidu for a fixed keyword and prints the results.
    *
    * @param argc command-line arguments (unused)
    * @throws Exception if the keyword cannot be URL-encoded
    */
   public static void main(String[] argc) throws Exception
   {
      System.err.println("Now Let's Go!");
      GetBaiduInfor getbaiduinfor = new GetBaiduInfor();
      // The keyword ("China" in Chinese) must be percent-encoded before it goes into a URL;
      // ie=utf-8 tells Baidu which encoding the query uses.
      String keyword = URLEncoder.encode("\u4e2d\u56fd", "UTF-8");
      getbaiduinfor.baiduparser(
            getbaiduinfor.getHtml("http://www.baidu.com/s?ie=utf-8&wd=" + keyword));
      System.err.println("Good bye!");
   }
}

本文来自ChinaUnix博客，如需查看原文请点击：http://blog.chinaunix.net/u3/104536/showart_2064777.html

文库|博客

返回列表

Chinaunix › 论坛 › 程序设计 › Java › Java文档中心 › 通过正则表达式提取百度搜索结果

通过正则表达式提取百度搜索结果 [复制链接]

浏览过的版块