医疗相关信息的抓取
这完完全全是我自己一个人写的,整个都写在一个类里。对获取最新页面的方式使用的是 hash 对比,这个有时候准确,有时候就不行。一般情况效率还是很好的。代码:

package com.huntto.nbinf.dao.impl;
import java.io.File;
import java.io.IOException;
import java.lang.ref.SoftReference;
import java.net.MalformedURLException;
import java.rmi.Naming;
import java.rmi.RemoteException;
import java.rmi.registry.LocateRegistry;
import java.rmi.server.UnicastRemoteObject;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.paoding.analysis.analyzer.PaodingAnalyzer;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.RangeFilter;
import org.apache.lucene.search.RangeQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopFieldDocs;
import org.apache.lucene.store.FSDirectory;
import org.jsoup.Connection;
import org.jsoup.helper.HttpConnection;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.huntto.nbinf.dao.IForQtSearch;
import com.huntto.nbinf.dao.IListInfoDAO;
import com.huntto.nbinf.pojo.LucenePage;
/**
 * Crawls pages from configured seed sites, scores them against a DB-provided
 * keyword table, and maintains a Lucene full-text index. Published to clients
 * over RMI (see constructor).
 *
 * NOTE(review): this file appears to be recovered from a forum paste; array
 * subscripts (e.g. "rs[i][0]") were stripped in several methods — verify
 * reconstructions against version control.
 */
public class LucenService extends UnicastRemoteObject implements IForQtSearch {
File filestack,fileindex;
//URLs of the layer currently being crawled
Stack<String> stackcurt=new Stack<String>();
//URLs discovered for the next layer
Stack<String> stacknext=new Stack<String>();
Logger logger=Logger.getLogger("lucene");
//maximum crawl depth (loaded from properties in the constructor)
final int deep;
//depth of the layer currently being processed
private int curtdeep=1;
//maximum number of hits returned by a search
final int maxresult=1024;
//keyword matched in the page title
final int pagetype_title=1;
//keyword matched in a meta tag
final int pagetype_meta=2;
//keyword matched in the body text
final int pagetype_body=3;
//spill the pending-URL stack to a file once it exceeds this size
final int max_to_writefile=10;
//date helper; NOTE(review): SimpleDateFormat is not thread-safe — confirm single-threaded use
final SimpleDateFormat dateFormat=new SimpleDateFormat("yyyy-MM-dd");
//Paoding Chinese analyzer, used for both indexing and query parsing
private Analyzer analyzer=new PaodingAnalyzer();
//bookkeeping for swapping the URL queue between memory and file
private LuceneStackControll controll=new LuceneStackControll();
//file helper: the URL queue may grow very large, so overflow is kept on disk
private FileControll fileControll=new FileControll();
//lucene index directory
private FSDirectory directory;
//lucene index writer
IndexWriter indexWriter;
Date yesday,today;
//rs: (keyword, group) rows loaded lazily from the database
String urls[],websites[][],rs[][];
SoftReference<String[][]> softReference=new SoftReference<String[][]>(websites);
final String userAgent="Mozilla/5.0 (Windows NT 6.1; WOW64) "
+ "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.101 Safari/537.36";
protected IListInfoDAO halleyCurrencyDao;
/**
 * Loads configuration from paoding-dic-names.properties, opens the Lucene
 * index directory/writer, and publishes this service in an RMI registry.
 *
 * @throws RemoteException if RMI export fails
 */
public LucenService() throws RemoteException{
    Properties properties=new Properties();
    int prodeep=5;
    int luceneport=8999;
    String lucenename="lucenesearchs";
    File dirc=null;
    try {
        properties.load(LucenService.class.getClassLoader().
                getResourceAsStream("paoding-dic-names.properties"));
        dirc=new File(properties.getProperty("dirc"));
        directory=FSDirectory.getDirectory(dirc);
        //clear a stale write lock possibly left by a previous crash
        IndexWriter.unlock(directory);
        indexWriter=new IndexWriter(directory,analyzer,MaxFieldLength.UNLIMITED);
        prodeep=Integer.parseInt(properties.getProperty("deep"));
        luceneport=Integer.parseInt(properties.getProperty("luceneport"));
        lucenename=properties.getProperty("lucenename");
    } catch (IOException e1) {
        //fall back to the defaults declared above
        e1.printStackTrace();
    }
    deep=prodeep;
    if(dirc==null)
        dirc=new File("c:/dirc");
    //BUGFIX: the registry port was hard-coded to 8999 while the bind URL used
    //the configured port, so a non-default "luceneport" could never be reached
    LocateRegistry.createRegistry(luceneport);
    try {
        Naming.rebind("rmi://localhost:"+luceneport+"/"+lucenename, this);
    } catch (MalformedURLException e) {
        e.printStackTrace();
    }
}
/**
* 开始工作
*/
public void wok(){
logger.debug("开始工作");
if(websites==null||websites.length==0){
try {
//只从指定的网站开始抓取
websites=halleyCurrencyDao.opensExcuteSQL("select MONITORURL from CONSENSUSINTERNET where rownum<10");
} catch (IOException e) {
e.printStackTrace();
}
}
urls=new String;
int j=0;
if(websites!=null){
for(int i=0;i<websites.length;i++){
if(i%10==0&&i!=0){
j=0;
startWork(urls);
}
urls=websites;
}
if(j!=0){
startWork(urls);
}
}
}
/** Refreshes the yesterday/today timestamps that bound the daily re-crawl window. */
private void initdate(){
    today=new Date();
    Calendar cal=Calendar.getInstance();
    cal.setTime(today);
    cal.add(Calendar.DATE,-1);
    yesday=cal.getTime();
}
/**
 * Crawls breadth-first starting from the given seed URLs.
 * When stackcurt drains, the next layer is pulled from stacknext or from the
 * overflow file; when neither yields URLs the crawl loop ends.
 *
 * @param urls seed URLs for depth 1
 */
public void startWork(String...urls){
    curtdeep=1;
    initdate();
    //BUGFIX: the paste had "push(urls)" (the whole array); push each URL
    for (int i = 0; i < urls.length; i++)
        stackcurt.push(urls[i]);
    controll.fin=false;
    while(true)
    {
        if(stackcurt.isEmpty()){
            //file mode exhausted: reset and remove the consumed overflow file
            if(controll.fin
                    &&controll.sygs<=0
                    &&controll.curtindex>0){
                controll.fin=false;
                //NOTE(review): the duplicated delete() looks like a paste
                //artifact — confirm the intended statement
                if(controll.indata.delete())
                    controll.indata.delete();
            }
            getCurtStack();
        }
        if(stackcurt.isEmpty())break;
        try {
            pageContent(stackcurt.pop(),false);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    curtdeep=1;
}
/**
 * Fetches one page, scores its title/meta/body against the keyword table
 * (via fenZhu), indexes it when its content hash changed, and — while the
 * crawl depth allows — queues relevant links for the next layer.
 *
 * @param url    page to fetch
 * @param shenhe true to mark the page as audited and force re-indexing
 * @throws IOException on fetch or index failure
 */
public void pageContent(String url,boolean shenhe) throws IOException{
    Connection connection=HttpConnection.connect(url);
    connection.ignoreContentType(true);
    connection.ignoreHttpErrors(true);
    connection.userAgent(userAgent);
    org.jsoup.nodes.Document document= connection.get();
    IndexInfo indexInfo=new IndexInfo();
    Elements elements=document.getElementsByTag("meta");
    //score the title first (highest weight in fenZhu)
    indexInfo.string=document.title();
    indexInfo.type=pagetype_title;
    fenZhu(indexInfo);
    //then the Keywords/Description meta tags
    Iterator<Element> iterator=elements.iterator();
    indexInfo.type=pagetype_meta;
    while(iterator.hasNext()){
        Element element=iterator.next();
        if("Keywords".equalsIgnoreCase(element.attr("name"))
                ||"Description".equalsIgnoreCase(element.attr("name"))){
            indexInfo.string=element.attr("content");
            fenZhu(indexInfo);
        }
    }
    //finally the body text (lowest weight)
    elements=document.getElementsByTag("body");
    if(elements.size()>0)
    {
        indexInfo.type=pagetype_body;
        indexInfo.string=elements.get(0).text();
        fenZhu(indexInfo);
    }
    indexInfo.url=url;
    if(indexInfo.group_keyword!=null){
        indexInfo.store=true;
        //BUGFIX(review): subscripts lost in the paste — group_keyword[0]
        //accumulates the matched keywords, [1] the groups (see fenZhu); verify
        indexInfo.group=indexInfo.group_keyword[1];
        indexInfo.keyword=indexInfo.group_keyword[0];
        indexInfo.string=document.text();
        indexInfo.shenhe=shenhe;
        indexInfo.title=document.title();
    }else
        indexInfo.store=false;
    //index only when the content hash changed, unless auditing forces it
    if(isUpdatePage(md5(indexInfo.url), md5(indexInfo.string))||shenhe)
        indexPageContent(indexInfo);
    //queue relevant, not-yet-indexed links for the next layer
    if(curtdeep<this.deep){
        elements=document.getElementsByTag("a");
        iterator=elements.iterator();
        while(iterator.hasNext()){
            Element element=iterator.next();
            String attrurl=element.attr("href");
            attrurl=processUrl(attrurl, url);
            try {
                if((!isIndexed(attrurl))&&
                        linkXg(element))
                    if(!stacknext.contains(attrurl))
                    {
                        stacknext.push(attrurl);
                        System.out.println(attrurl+"添加到队列");
                        //spill to file when the in-memory queue grows too large
                        if(stacknext.size()>max_to_writefile)
                            controll.filecount+=writeToFile();
                    }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
/**
 * Decides whether a link looks relevant: its own text — or, failing that,
 * its parent element's text (unless the parent is the body) — must contain
 * a configured keyword.
 *
 * @param link anchor element under test
 * @return true when the link or its parent text contains a keyword
 */
public boolean linkXg(Element link){
    if(constainsKeyWord(link.text()))
        return true;
    Element parent=link.parent();
    if("body".equalsIgnoreCase(parent.tagName()))
        return false;
    return constainsKeyWord(parent.text());
}
/**
 * Refills stackcurt when the current layer is exhausted: either promotes
 * stacknext to be the new layer (incrementing the depth), or — once file
 * mode ("fin") is active — reads the next batch of URLs from the overflow
 * file and advances the read cursor.
 */
private void getCurtStack(){
//start of a new crawl layer
if(!controll.fin)
{
logger.debug("从stacknext获得url");
curtdeep++;
stackcurt=stacknext;
stacknext=new Stack<String>();
//if many URLs were spilled to disk, switch to reading them from the file
if(controll.filecount>max_to_writefile)
{
controll.fin=true;
controll.swapInOut();
}
}
else
{
logger.debug("从文件"+controll.indata.getName()+"获得url");
//read the next batch and advance the cursor by however many we got
stackcurt=readStackFile(controll.curtindex,max_to_writefile);
controll.curtindex+=stackcurt.size();
}
}
/**
 * Spills the in-memory stacknext queue to the overflow file, then clears the
 * queue on success.
 *
 * NOTE(review): controll.outdata is passed twice to writeStringArr — this
 * looks like a paste artifact or a bug; confirm the intended second argument.
 *
 * @return number of URLs written (0 on failure)
 */
private int writeToFile(){
int count=0;
try {
count=fileControll.writeStringArr(stacknext.toArray(new String[]{}),
controll.outdata,
controll.outdata);
logger.debug("将多余的url写入到文件"+controll.outdata.getName());
} catch (IOException e) {
e.printStackTrace();
}
//only discard the in-memory queue when the write actually succeeded
if(count>0)
stacknext=new Stack<String>();
return count;
}
/**
 * Reads a batch of URLs from the overflow file into a fresh stack.
 *
 * @param n     index of the first entry to read
 * @param total maximum number of entries to read
 * @return stack of URLs, or null when the read failed
 */
public Stack<String> readStackFile(int n,int total){
    String[] strings=null;
    Stack<String> stack=null;
    try {
        strings=fileControll.readStringArr(
                controll,
                n, total);
    } catch (IOException e) {
        e.printStackTrace();
    }
    if(strings!=null){
        stack=new Stack<String>();
        for(int i=0;i<strings.length;i++){
            //BUGFIX: subscript lost in paste — push each entry, not the array
            stack.push(strings[i]);
        }
    }
    return stack;
}
/**
 * Writes (or replaces, keyed by the URL's md5) one Lucene document.
 * Pages without keyword hits are stored as a stub with store="f" so they can
 * be recognized later without re-storing their content.
 *
 * NOTE(review): commit()+optimize() after every single document is very
 * expensive; consider batching commits — left unchanged to preserve behavior.
 *
 * @param indexInfo page data collected by pageContent/fenZhu
 * @throws IOException on index failure
 */
private void indexPageContent(IndexInfo indexInfo) throws IOException{
Document document=new Document();
document.add(new Field("url",indexInfo.url,Store.YES,Index.ANALYZED));
//md5 of the URL is the update key; "id" adds a timestamp for uniqueness
document.add(new Field("md5code", md5(indexInfo.url),Store.YES,Index.ANALYZED));
document.add(new Field("id", md5(indexInfo.url)+System.currentTimeMillis(),Store.YES,Index.ANALYZED));
if(indexInfo.store){
if(today==null)
initdate();
System.out.println("存储"+indexInfo.url+"关键字:"+indexInfo.keyword+"\t"+indexInfo.title);
logger.debug("存储url"+indexInfo.url);
document.add(new Field("title", indexInfo.title,Store.YES,Index.ANALYZED));
document.add(new Field("store", "t",Store.YES,Index.ANALYZED));
//content is indexed for search but not stored
document.add(new Field("content",indexInfo.string,Store.NO,Index.ANALYZED));
document.add(new Field("shenhe",indexInfo.shenhe?"1":"0",Store.YES,Index.ANALYZED));
document.add(new Field("group", indexInfo.group, Store.YES,Index.ANALYZED));
document.add(new Field("keyword", indexInfo.keyword, Store.YES,Index.ANALYZED));
document.add(new Field("score", ""+indexInfo.score, Store.YES,Index.ANALYZED));
document.add(new Field("ctime",today.getTime()+"" ,Store.YES,Index.ANALYZED));
//content hash used by isUpdatePage to detect changes
document.add(new Field("md5contentcode", md5(indexInfo.string),Store.YES,Index.ANALYZED));
}else
document.add(new Field("store", "f",Store.YES,Index.ANALYZED));
indexWriter.updateDocument(new Term("md5code", md5(indexInfo.url)),document);
indexWriter.commit();
indexWriter.optimize();
}
/**
* 关键字分组
* @param content
* @return
* @throws IOException
*/
private void fenZhu(IndexInfo indexInfo) throws IOException{
indexInfo.string=indexInfo.string.replace("\\s", "");
if(rs==null)
//根据页面的信息判断是不是应该
rs= halleyCurrencyDao.opensExcuteSQL("select t2.keyword,t3.keyword from CONSENSUSTEAM_KEY t1 LEFT JOINCONSENSUSKEYWORD t2 on t1.keyid=T2.id left join CONSENSUSKEYWORDTEAM t3 on t1.teamid=t3.teamid");
StringBuilder keyword=null;
StringBuilder keywordtype=null;
for(int i=0;i<rs.length;i++){
if(indexInfo.string.contains(rs))
{
switch(indexInfo.type){
case pagetype_title:
indexInfo.score+=10;
break;
case pagetype_meta:
indexInfo.score+=5;
break;
case pagetype_body:
indexInfo.score+=1;
break;
}
if(indexInfo.group_keyword==null)
indexInfo.group_keyword=new String;
else if(indexInfo.group_keyword!=null){
if(indexInfo.group_keyword.contains(rs))
continue;
if(indexInfo.group_keyword.contains(rs))
continue;
}
if(keyword==null){
keyword=new StringBuilder();
keywordtype=new StringBuilder();
}
if(keyword.indexOf(rs)<0)
keyword.append("#"+rs);
if(keywordtype.indexOf(rs)<0)
keywordtype.append("#"+rs);
}
}
if(keyword!=null)
{
indexInfo.group_keyword=(indexInfo.group_keyword==null?
"":indexInfo.group_keyword)+keyword.toString();
indexInfo.group_keyword=(
indexInfo.group_keyword==null?"":indexInfo.group_keyword
)+keywordtype.toString();
}
}
/**
 * Checks whether the given link text contains any configured keyword.
 * The keyword table is lazily loaded from the database on first use.
 *
 * @param content link (or parent-element) text
 * @return true when a keyword occurs in the whitespace-stripped text
 */
private boolean constainsKeyWord(String content){
    //BUGFIX: replace() treats "\\s" literally; replaceAll() applies the regex
    content=content.replaceAll("\\s", "");
    if(rs==null)
        try {
            //BUGFIX: the pasted SQL was missing the space in "LEFT JOIN CONSENSUSKEYWORD"
            rs= halleyCurrencyDao.opensExcuteSQL("select t2.keyword,t3.keyword from CONSENSUSTEAM_KEY t1 LEFT JOIN CONSENSUSKEYWORD t2 on t1.keyid=T2.id left join CONSENSUSKEYWORDTEAM t3 on t1.teamid=t3.teamid");
        } catch (IOException e) {
            e.printStackTrace();
        }
    //ROBUSTNESS: the DB call may have failed, leaving rs null
    if(rs==null)
        return false;
    for(int i=0;i<rs.length;i++){
        //NOTE(review): subscript lost in paste — assuming column 0 (the keyword)
        if(content.contains(rs[i][0]))
            return true;
    }
    return false;
}
/**
 * Audits the given URL: re-fetches it and indexes it with the audit flag set
 * (pageContent forces re-indexing when shenhe is true).
 *
 * @param url page URL to audit
 * @throws IOException on fetch or index failure
 */
public void shenHe(String url) throws IOException{
logger.debug("审核"+url);
pageContent(url,true);
}
/**
 * Audits the page whose stored Lucene "id" field matches, by looking up its
 * URL in the index and delegating to {@link #shenHe(String)}.
 *
 * @param id value of the stored "id" field
 * @throws IOException on index or fetch failure
 */
public void shenHeById(String id) throws IOException{
    IndexSearcher indexSearcher=new IndexSearcher(directory);
    TermQuery query=new TermQuery(new Term("id", id));
    TopDocs docs= indexSearcher.search(query, 1);
    Document document2=null;
    ScoreDoc[] docss=docs.scoreDocs;
    if(docs!=null&&docss.length>0){
        for (int i = 0; i < docss.length; i++) {
            //BUGFIX: subscript lost in paste — read the i-th hit
            document2=indexSearcher.doc(docss[i].doc);
        }
    }
    //RESOURCE FIX: the searcher was never closed
    indexSearcher.close();
    if(document2!=null){
        shenHe(document2.getField("url").stringValue());
    }
}
/**
 * Checks whether this URL was already indexed within the last day, so the
 * same link is fetched at most once per day.
 *
 * @param url candidate link
 * @return true when a document keyed by this URL's md5 exists with a ctime
 *         in [yesterday, today]
 * @throws IOException on index access failure
 */
private boolean isIndexed(String url) throws IOException{
    //no segments file means the index is still empty
    if(!directory.fileExists("segments.gen"))
        return false;
    //ROBUSTNESS: the date window may be unset when called outside startWork()
    if(yesday==null||today==null)
        initdate();
    IndexSearcher indexSearcher=new IndexSearcher(directory);
    TermQuery query=new TermQuery(new Term("md5code", md5(url)));
    BooleanQuery booleanQuery=new BooleanQuery();
    booleanQuery.add(query,Occur.MUST);
    booleanQuery.add(
            new RangeQuery(
                    new Term("ctime",yesday.getTime()+""),
                    new Term("ctime",today.getTime()+""),
                    true),
            Occur.MUST );
    //one hit is enough to prove the link was crawled today
    TopDocs docs= indexSearcher.search(booleanQuery, 1);
    indexSearcher.close();
    if(docs.scoreDocs.length>0){
        logger.debug("已结被索引了"+url);
    }
    //IDIOM: simplified from "length>0?true:false"
    return docs.scoreDocs.length>0;
}
/**
* 判断页面是不是最新的
* @param md5code
* @param contentMD5
* @return
* @throws IOException
*/
private boolean isUpdatePage(String md5code,String contentMD5) throwsIOException{
if(!directory.fileExists("segments.gen"))
return true;
IndexSearcher indexSearcher=new IndexSearcher(directory);
TermQuery query=new TermQuery(new Term("md5code", md5code));
BooleanQuery booleanQuery=new BooleanQuery();
booleanQuery.add(query,Occur.MUST);
TopDocs docs=indexSearcher.search(query, 1);
for(ScoreDoc scoredoc:docs.scoreDocs){
Document document=indexSearcher.doc(scoredoc.doc);
String store=document.get("store");
if("f".equals(store))
return false;
String contenthash=document.get("md5contentcode");
if(contentMD5.equals(contenthash))
{
logger.debug("页面内容没有变化"+document.get("url"));
return false;
}
}
return true;
}
/**
*
* @param datas 关键字时间范围分组
* @return
* @throws IOException
*/
public LucenePagesearch(LucenePage page,String... datas) throws IOException,RemoteException{
IndexSearcher indexSearcher=new IndexSearcher(directory);
BooleanQuery booleanQuery=new BooleanQuery();
Filter filter=null;
int start=0;
int end=10;
//分页
if(page!=null){
start=(page.getPageNo()-1)*page.getPageSize();
end=start+page.getPageSize();
}
if(datas!=null&&"1".equals(datas)){
booleanQuery.add(new TermQuery(new Term("shenhe","1")),Occur.MUST);//查询审核过的
}
if(datas!=null
&&!"".equals(datas.trim())){
char chars[]=datas.toCharArray();
if(datas.length()>2){
BooleanQuery booleanQuery2=new BooleanQuery();
for(int i=0;i<chars.length-1;i++){
booleanQuery2.add(new TermQuery(new Term("content",new String(chars, i,2))), Occur.MUST);
}
booleanQuery.add(booleanQuery2, Occur.MUST);
}else
booleanQuery.add(new FuzzyQuery(new Term("content", datas)),Occur.MUST);
}
if(datas.length>2&&datas!=null&&datas!=null){
if(!"".equals(datas)&&!"".equals(datas))
{
try {
filter=new RangeFilter("ctime",
""+dateFormat.parse(datas).getTime(),
""+dateFormat.parse(datas).getTime(),
true, true);
} catch (java.text.ParseException e) {
e.printStackTrace();
}
}
}
if(datas.length>3&&datas!=null){
try {
booleanQuery.add(new QueryParser("group",analyzer).parse(datas),Occur.MUST);
} catch (ParseException e) {
e.printStackTrace();
}
}
List<String[]>results=new ArrayList<String[]>();
booleanQuery.add(new TermQuery(new Term("store", "t")),Occur.MUST);
Sort sort=new Sort(new SortField[]{
new SortField("ctime",SortField.LONG,true),
new SortField("content",SortField.SCORE),
new SortField("score",SortField.INT,true),
});
TopFieldDocs topFieldDocs=indexSearcher.search(booleanQuery, filter, maxresult, sort);
page.setTotal(topFieldDocs.totalHits);
if(topFieldDocs!=null&&topFieldDocs.scoreDocs.length>0){
for(int i=start;i<end&&i<topFieldDocs.scoreDocs.length;i++){
ScoreDoc scoreDoc=topFieldDocs.scoreDocs;
Document document=indexSearcher.doc(scoreDoc.doc);
String[] one_result=new String;
results.add(one_result);
one_result=document.get("url");
one_result=document.get("title");
Date date=new Date();
date.setTime(Long.parseLong(document.get("ctime")));
one_result=dateFormat.format(date);
one_result=document.get("id");
one_result=document.get("shenhe");
one_result=document.get("group");
one_result=document.get("keyword");
one_result=document.get("score");
}
}
page.setResult(results);
indexSearcher.close();
return page;
}
/**
 * Fetches stored fields (title/url/date) for a comma-separated list of
 * document ids, for display to RMI clients.
 *
 * @param ids comma-separated "id" field values; may be null
 * @return one map per found document (empty list when the index is empty)
 */
@Override
public List<Map<String, Object>> getLuceneData(String ids)
        throws RemoteException,IOException {
    List<Map<String, Object>> dataMap=new ArrayList<Map<String,Object>>();
    //no segments file means the index is still empty
    if(!directory.fileExists("segments.gen"))
        return dataMap;
    String id_s[]=null;
    if(ids!=null){
        id_s=ids.split(",");
    }
    if(id_s!=null){
        BooleanQuery booleanQuery=new BooleanQuery();
        IndexSearcher indexSearcher=new IndexSearcher(directory);
        for(int i=0;i<id_s.length;i++){
            //BUGFIX: subscript lost in paste — add one clause per id
            booleanQuery.add(new TermQuery(new Term("id",id_s[i])),Occur.SHOULD);
        }
        TopDocs docs=indexSearcher.search(booleanQuery,id_s.length);
        for(ScoreDoc scoredoc:docs.scoreDocs){
            Document document=indexSearcher.doc(scoredoc.doc);
            Map<String, Object> map=new HashMap<String, Object>();
            dataMap.add(map);
            map.put("atitle", document.get("title"));
            map.put("url",document.get("url"));
            Date date=new Date();
            long t=Long.parseLong(document.get("ctime"));
            date.setTime(t);
            map.put("adate", dateFormat.format(date) );
            //NOTE(review): "fromarea" is filled with the title — confirm intent
            map.put("fromarea",document.get("title"));
        }
        indexSearcher.close();
    }
    return dataMap;
}
/**
 * Resolves a (possibly relative) href against its parent page URL.
 * Handles host-absolute ("/x"), same-directory ("./x") and parent-directory
 * ("../x") forms; anchors and javascript: pseudo-links resolve to the parent.
 *
 * @param url       raw href attribute
 * @param parenturl URL of the page the link was found on
 * @return absolute URL (best effort; returns url unchanged when no host can
 *         be extracted from parenturl)
 */
public String processUrl(String url,String parenturl){
    //extract the host part of the parent URL
    Pattern pattern_host=Pattern.compile("http://\\w+\\.\\w+\\.\\w+");
    Pattern pattern=Pattern.compile("^\\w+/\\w+/");
    Pattern pattern2=Pattern.compile("^\\.\\./\\w+");
    Matcher matcher_host=pattern_host.matcher(parenturl);
    Matcher matcher=pattern.matcher(url);
    String host=null;
    if(parenturl.endsWith("/"))
        parenturl=parenturl.substring(0,parenturl.length()-1);
    if(url.endsWith("/"))
        url=url.substring(0,url.length()-1);
    if(matcher_host.find())
        host=matcher_host.group();
    if(host==null)
        return url;
    if(url.startsWith("/"))
        //BUGFIX: was host+url.substring(1), which glued the path onto the
        //host without a separator ("http://a.b.coma/x")
        return host+url;
    else if(url.startsWith("./"))
        //BUGFIX: was parenturl+"/"+url.substring(1) — substring(1) keeps the
        //"/" of "./", which produced a double slash
        return parenturl+url.substring(1);
    else if(matcher.find()||
            url.startsWith("../")){
        //BUGFIX: the old single matcher was ^-anchored, so find() could never
        //match a second time and only one "../" level was ever stripped
        while(pattern2.matcher(url).find()){
            url=url.substring(3);
            //NOTE(review): this strips two path segments per "../" — verify
            parenturl=parenturl.substring(0, parenturl.lastIndexOf("/")-1);
            parenturl=parenturl.substring(0, parenturl.lastIndexOf("/"));
        }
        return parenturl+"/"+url;
    }
    if(url.startsWith("#")||url.startsWith("javascript"))
        return parenturl;
    return url;
}
/**
* MD5
* @param plainText
* @return
*/
public static String md5(String plainText) {
try {
MessageDigest md = MessageDigest.getInstance("MD5");
md.update(plainText.getBytes());
byte b[] = md.digest();
int i;
StringBuffer buf = new StringBuffer();
for (int offset = 0; offset < b.length; offset++) {
i = b;
if (i < 0)
i += 256;
if (i < 16)
buf.append("0");
buf.append(Integer.toHexString(i));
}
return buf.toString().substring(8, 24);
} catch (NoSuchAlgorithmException e) {
e.printStackTrace();
}
returnnull;
}
/** @return the DAO used for seed-URL and keyword queries */
public IListInfoDAO getHalleyCurrencyDao() {
return halleyCurrencyDao;
}
/** Setter for the DAO — presumably injected by the container; confirm wiring. */
public void setHalleyCurrencyDao(IListInfoDAO halleyCurrencyDao) {
this.halleyCurrencyDao = halleyCurrencyDao;
}
/** Mutable carrier for one page's scoring and indexing state. */
public static class IndexInfo{
//accumulated keyword-match score (title hits weigh most)
private int score;
//where the current text came from: pagetype_title / pagetype_meta / pagetype_body
private int type;
//text currently being matched (later reused for the full page text)
private String string;
private String url;
private String title;
private String group;
private String keyword;
//audit flag
private boolean shenhe;
//true when at least one keyword matched and the page should be stored
private boolean store;
//matched keyword/group accumulation; presumably [0]=keywords, [1]=groups — TODO confirm
private String[] group_keyword;
}
/** Tiny manual smoke test for the log4j configuration. */
public static void main(String[] args){
    Logger rootLogger=Logger.getLogger("");
    rootLogger.info("nihao");
}
/**
 * RMI-facing audit entry point; delegates to {@link #shenHeById(String)}.
 *
 * @param id stored Lucene "id" field value of the page to audit
 */
@Override
public void shenhe(String id) throws IOException, RemoteException {
shenHeById(id);
}
}

楼主想要抓取什么?
页: [1]