- 论坛徽章:
- 0
|
这完完全全是我自己一个人写的,整个都写在一个类里。对获取最新页面的判断使用的是 hash 对比,这个有时候准确,有时候就不行。一般情况下效率还是很好的。
代码:- package com.huntto.nbinf.dao.impl;
- import java.io.File;
- import java.io.IOException;
- import java.lang.ref.SoftReference;
- import java.net.MalformedURLException;
- import java.rmi.Naming;
- import java.rmi.RemoteException;
- import java.rmi.registry.LocateRegistry;
- import java.rmi.server.UnicastRemoteObject;
- import java.security.MessageDigest;
- import java.security.NoSuchAlgorithmException;
- import java.text.SimpleDateFormat;
- import java.util.ArrayList;
- import java.util.Calendar;
- import java.util.Date;
- import java.util.HashMap;
- import java.util.Iterator;
- import java.util.List;
- import java.util.Map;
- import java.util.Properties;
- import java.util.Stack;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- import net.paoding.analysis.analyzer.PaodingAnalyzer;
- import org.apache.log4j.Logger;
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.document.Document;
- import org.apache.lucene.document.Field;
- import org.apache.lucene.document.Field.Index;
- import org.apache.lucene.document.Field.Store;
- import org.apache.lucene.index.IndexWriter;
- import org.apache.lucene.index.IndexWriter.MaxFieldLength;
- import org.apache.lucene.index.Term;
- import org.apache.lucene.queryParser.ParseException;
- import org.apache.lucene.queryParser.QueryParser;
- import org.apache.lucene.search.BooleanClause.Occur;
- import org.apache.lucene.search.BooleanQuery;
- import org.apache.lucene.search.Filter;
- import org.apache.lucene.search.FuzzyQuery;
- import org.apache.lucene.search.IndexSearcher;
- import org.apache.lucene.search.RangeFilter;
- import org.apache.lucene.search.RangeQuery;
- import org.apache.lucene.search.ScoreDoc;
- import org.apache.lucene.search.Sort;
- import org.apache.lucene.search.SortField;
- import org.apache.lucene.search.TermQuery;
- import org.apache.lucene.search.TopDocs;
- import org.apache.lucene.search.TopFieldDocs;
- import org.apache.lucene.store.FSDirectory;
- import org.jsoup.Connection;
- import org.jsoup.helper.HttpConnection;
- import org.jsoup.nodes.Element;
- import org.jsoup.select.Elements;
- import com.huntto.nbinf.dao.IForQtSearch;
- import com.huntto.nbinf.dao.IListInfoDAO;
- import com.huntto.nbinf.pojo.LucenePage;
-
-
/**
 * Single-class crawler + Lucene indexing service, exported over RMI.
 * Crawls configured sites layer by layer, indexes pages whose text matches
 * configured keywords, and serves search queries to remote clients.
 */
public class LucenService extends UnicastRemoteObject implements IForQtSearch {

    // Work files for the on-disk URL queue (managed by FileControll).
    File filestack, fileindex;
    // URLs of the layer currently being crawled.
    Stack<String> stackcurt = new Stack<String>();
    // URLs discovered for the next layer.
    Stack<String> stacknext = new Stack<String>();

    Logger logger = Logger.getLogger("lucene");
    // Maximum crawl depth (loaded from properties in the constructor).
    final int deep;

    // Depth of the layer currently being crawled.
    private int curtdeep = 1;

    // Maximum number of hits fetched per search.
    final int maxresult = 1024;

    // Keyword matched in the page title.
    final int pagetype_title = 1;

    // Keyword matched in a meta tag.
    final int pagetype_meta = 2;

    // Keyword matched in the page body.
    final int pagetype_body = 3;

    // When the next-layer stack grows past this size it is spilled to file.
    final int max_to_writefile = 10;

    // Date helper. NOTE(review): SimpleDateFormat is not thread-safe;
    // this is only safe while the service runs single-threaded — confirm.
    final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");

    // Paoding Chinese analyzer, used for both indexing and query parsing.
    private Analyzer analyzer = new PaodingAnalyzer();

    // Bookkeeping for the on-disk URL queue (counts, swap state).
    private LuceneStackControll controll = new LuceneStackControll();

    // File helper: the pending-URL queue may grow very large, so the
    // overflow is kept on disk instead of in memory.
    private FileControll fileControll = new FileControll();

    // Lucene index directory.
    private FSDirectory directory;

    // Lucene index writer, kept open for the service's lifetime.
    IndexWriter indexWriter;

    // One-day window used to avoid re-indexing a URL twice in a day.
    Date yesday, today;
    // urls: current batch; websites: seed rows from DB; rs: keyword/group rows.
    String urls[], websites[][], rs[][];

    // NOTE(review): wraps `websites` while it is still null, so this
    // reference never holds anything useful — looks like dead code.
    SoftReference<String[][]> softReference = new SoftReference<String[][]>(websites);
    final String userAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) "
            + "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.101 Safari/537.36";

    // DAO used to load seed URLs and the keyword table from the database.
    protected IListInfoDAO halleyCurrencyDao;
-
- public LucenService() throws RemoteException{
-
- Properties properties=new Properties();
- int prodeep=5;
- int luceneport=8999;
- String lucenename="lucenesearchs";
- File dirc=null;
-
- try {
- properties.load(LucenService.class.getClassLoader().
- getResourceAsStream("paoding-dic-names.properties"));
- dirc=new File(properties.getProperty("dirc"));
- directory=FSDirectory.getDirectory(dirc);
- IndexWriter.unlock(directory);
- indexWriter=new IndexWriter(directory,analyzer,MaxFieldLength.UNLIMITED);
-
- prodeep=Integer.parseInt(properties.getProperty("deep"));
- luceneport=Integer.parseInt(properties.getProperty("luceneport"));
- lucenename=properties.getProperty("lucenename");
- } catch (IOException e1) {
- e1.printStackTrace();
- }
-
- deep=prodeep;
-
- if(dirc==null)
- dirc=new File("c:/dirc");
- LocateRegistry.createRegistry(8999);
- try {
- Naming.rebind("rmi://localhost:"+luceneport+"/"+lucenename+"", this);
- } catch (MalformedURLException e) {
- e.printStackTrace();
- }
-
- }
-
- /**
- * 开始工作
- */
- public void wok(){
-
-
- logger.debug("开始工作");
-
- if(websites==null||websites.length==0){
- try {
- //只从指定的网站开始抓取
- websites=halleyCurrencyDao.opensExcuteSQL("select MONITORURL from CONSENSUSINTERNET where rownum<10");
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- urls=new String[10];
- int j=0;
- if(websites!=null){
- for(int i=0;i<websites.length;i++){
- if(i%10==0&&i!=0){
- j=0;
- startWork(urls);
- }
- urls[j++]=websites[i][0];
- }
-
- if(j!=0){
- startWork(urls);
- }
- }
-
-
- }
- private void initdate(){
- Calendar calendar=Calendar.getInstance();
- calendar.add(Calendar.DATE,-1);
- yesday=calendar.getTime();
- today=new Date();
- }
-
/**
 * Crawls the given URLs layer by layer.
 *
 * All seed URLs are pushed onto stackcurt. Whenever stackcurt drains,
 * getCurtStack() refills it from either stacknext (the next layer) or
 * from the overflow file; when neither can supply more URLs the loop
 * ends.
 *
 * @param urls the seed URLs of this batch
 */
public void startWork(String...urls){

    curtdeep=1;
    initdate();
    for (int i = 0; i < urls.length; i++)
        stackcurt.push(urls[i]);

    controll.fin=false;
    while(true)
    {
        if(stackcurt.isEmpty()){
            // The overflow file has been fully consumed: reset the
            // file-reading state and delete the spent files.
            if(controll.fin
                &&controll.sygs<=0
                &&controll.curtindex>0){
                controll.fin=false;
                if(controll.indata[0].delete())
                    controll.indata[1].delete();
            }
            getCurtStack();
        }
        // Still empty after a refill attempt: the crawl is finished.
        if(stackcurt.isEmpty()) break;
        try {
            pageContent(stackcurt.pop(),false);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    curtdeep=1;
}
/**
 * Fetches a page, scores it against the keyword table, (re)indexes it
 * when its content changed, and — below the depth limit — queues
 * relevant outgoing links for the next crawl layer.
 *
 * @param url    the page to fetch
 * @param shenhe true forces re-indexing regardless of the stored
 *               content hash (audit mode)
 * @throws IOException on fetch or index failure
 */
public void pageContent(String url,boolean shenhe) throws IOException{

    Connection connection=HttpConnection.connect(url);
    connection.ignoreContentType(true);
    connection.ignoreHttpErrors(true);
    connection.userAgent(userAgent);

    org.jsoup.nodes.Document document= connection.get();
    IndexInfo indexInfo=new IndexInfo();

    // Score the title (weight 10 per keyword hit).
    Elements elements=document.getElementsByTag("meta");
    indexInfo.string=document.title();
    indexInfo.type=pagetype_title;
    fenZhu(indexInfo);

    // Score the Keywords/Description meta tags (weight 5).
    Iterator<Element> iterator=elements.iterator();
    indexInfo.type=pagetype_meta;
    while(iterator.hasNext()){
        Element element=iterator.next();
        if("Keywords".equalsIgnoreCase(element.attr("name"))
            ||"Description".equalsIgnoreCase(element.attr("name"))){
            indexInfo.string=element.attr("content");
            fenZhu(indexInfo);
        }
    }

    // Score the body text (weight 1).
    elements=document.getElementsByTag("body");
    if(elements.size()>0)
    {
        indexInfo.type=pagetype_body;
        indexInfo.string=elements.get(0).text();
        fenZhu(indexInfo);
    }
    indexInfo.url=url;
    if(indexInfo.group_keyword!=null){
        // At least one keyword hit: store the full page text.
        indexInfo.store=true;
        indexInfo.group=indexInfo.group_keyword[1];
        indexInfo.keyword=indexInfo.group_keyword[0];
        indexInfo.string=document.text();
        indexInfo.shenhe=shenhe;
        indexInfo.title=document.title();
    }else
        indexInfo.store=false;

    // Index when the content hash changed, or unconditionally in audit mode.
    if(isUpdatePage(md5(indexInfo.url), md5(indexInfo.string))||shenhe)
        indexPageContent(indexInfo);

    // Queue relevant, not-yet-indexed links for the next layer.
    if(curtdeep<this.deep){
        elements=document.getElementsByTag("a");
        iterator=elements.iterator();
        while(iterator.hasNext()){
            Element element=iterator.next();
            String attrurl=element.attr("href");
            attrurl=processUrl(attrurl, url);
            try {
                if((!isIndexed(attrurl))&&
                    linkXg(element))
                    if(!stacknext.contains(attrurl))
                    {
                        stacknext.push(attrurl);
                        System.out.println(attrurl+"添加到队列");
                        // Spill to file when the in-memory queue grows too big.
                        if(stacknext.size()>max_to_writefile)
                            controll.filecount+=writeToFile();
                    }

            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

}
-
- /**
- * 判断这个链接是不是先关
- * @param link
- * @return
- */
- public boolean linkXg(Element link){
- if( !constainsKeyWord(link.text())){
- link=link.parent();
- if(link.tagName().equalsIgnoreCase("body"))
- return false;
- if( !constainsKeyWord(link.text()))
- return false;
- }
- return true;
- }
-
/**
 * Refills stackcurt when the current layer is exhausted: first from the
 * in-memory next-layer stack (advancing the depth), then — once the next
 * layer has been spilled to disk — from the overflow file in chunks of
 * max_to_writefile.
 */
private void getCurtStack(){

    // Start of a new layer.
    if(!controll.fin)
    {
        logger.debug("从stacknext获得url");
        curtdeep++;
        stackcurt=stacknext;
        stacknext=new Stack<String>();
        if(controll.filecount>max_to_writefile)
        {
            // Part of this layer lives on disk: switch to file reading.
            controll.fin=true;
            controll.swapInOut();
        }
    }
    else
    {
        logger.debug("从文件"+controll.indata[0].getName()+"获得url");
        stackcurt=readStackFile(controll.curtindex,max_to_writefile);
        controll.curtindex+=stackcurt.size();
    }
}
-
-
- /**
- *
- * 把next写到文件
- *
- * @return 写入的个数
- */
- private int writeToFile(){
- int count=0;
- try {
- count=fileControll.writeStringArr(stacknext.toArray(new String[]{}),
- controll.outdata[0],
- controll.outdata[1]);
-
- logger.debug("将多余的url写入到文件"+controll.outdata[0].getName());
- } catch (IOException e) {
- e.printStackTrace();
- }
- if(count>0)
- stacknext=new Stack<String>();
-
- return count;
- }
- /**
- * 读取
- * @param n
- * @param total
- * @return
- */
- public Stack<String> readStackFile(int n,int total){
- String[] strings=null;
- Stack<String> stack=null;
- try {
- strings=fileControll.readStringArr(
- controll,
- n, total);
- } catch (IOException e) {
- e.printStackTrace();
- }
- if(strings!=null){
- stack=new Stack<String>();
- for(int i=0;i<strings.length;i++){
- stack.push(strings[i]);
- }
- }
- return stack;
- }
/**
 * Writes (or rewrites) the Lucene document for a crawled page.
 *
 * Always records url/md5code/id; when indexInfo.store is set, it also
 * records title, content, audit flag, keyword group, score, crawl time
 * and the content hash used by isUpdatePage.
 *
 * @param indexInfo the page data collected by pageContent/fenZhu
 * @throws IOException on index write failure
 */
private void indexPageContent(IndexInfo indexInfo) throws IOException{

    Document document=new Document();
    document.add(new Field("url",indexInfo.url,Store.YES,Index.ANALYZED));
    // md5code is the update key: at most one document per url.
    document.add(new Field("md5code", md5(indexInfo.url),Store.YES,Index.ANALYZED));
    document.add(new Field("id", md5(indexInfo.url)+System.currentTimeMillis(),Store.YES,Index.ANALYZED));

    if(indexInfo.store){

        if(today==null)
            initdate();

        System.out.println("存储"+indexInfo.url+"关键字:"+indexInfo.keyword+"\t"+indexInfo.title);

        logger.debug("存储url"+indexInfo.url);
        document.add(new Field("title", indexInfo.title,Store.YES,Index.ANALYZED));
        document.add(new Field("store", "t",Store.YES,Index.ANALYZED));
        // Full page text is searchable but not stored.
        document.add(new Field("content",indexInfo.string,Store.NO,Index.ANALYZED));
        document.add(new Field("shenhe",indexInfo.shenhe?"1":"0",Store.YES,Index.ANALYZED));
        document.add(new Field("group", indexInfo.group, Store.YES,Index.ANALYZED));
        document.add(new Field("keyword", indexInfo.keyword, Store.YES,Index.ANALYZED));
        document.add(new Field("score", ""+indexInfo.score, Store.YES,Index.ANALYZED));
        document.add(new Field("ctime",today.getTime()+"" ,Store.YES,Index.ANALYZED));
        document.add(new Field("md5contentcode", md5(indexInfo.string),Store.YES,Index.ANALYZED));


    }else
        document.add(new Field("store", "f",Store.YES,Index.ANALYZED));

    // Replace any previous version of this url's document.
    indexWriter.updateDocument(new Term("md5code", md5(indexInfo.url)),document);
    indexWriter.commit();
    // NOTE(review): optimize() after every single document is very
    // expensive; consider optimizing once per crawl instead.
    indexWriter.optimize();

}
- /**
- * 关键字分组
- * @param content
- * @return
- * @throws IOException
- */
- private void fenZhu(IndexInfo indexInfo) throws IOException{
- indexInfo.string=indexInfo.string.replace("\\s", "");
- if(rs==null)
- //根据页面的信息判断是不是应该
- rs= halleyCurrencyDao.opensExcuteSQL("select t2.keyword,t3.keyword from CONSENSUSTEAM_KEY t1 LEFT JOIN CONSENSUSKEYWORD t2 on t1.keyid=T2.id left join CONSENSUSKEYWORDTEAM t3 on t1.teamid=t3.teamid");
- StringBuilder keyword=null;
- StringBuilder keywordtype=null;
- for(int i=0;i<rs.length;i++){
- if(indexInfo.string.contains(rs[i][0]))
- {
- switch(indexInfo.type){
- case pagetype_title:
- indexInfo.score+=10;
- break;
- case pagetype_meta:
- indexInfo.score+=5;
- break;
- case pagetype_body:
- indexInfo.score+=1;
- break;
- }
-
- if(indexInfo.group_keyword==null)
- indexInfo.group_keyword=new String[2];
- else if(indexInfo.group_keyword[0]!=null){
- if(indexInfo.group_keyword[0].contains(rs[i][0]))
- continue;
- if(indexInfo.group_keyword[1].contains(rs[i][1]))
- continue;
- }
-
- if(keyword==null){
- keyword=new StringBuilder();
- keywordtype=new StringBuilder();
- }
- if(keyword.indexOf(rs[i][0])<0)
- keyword.append("#"+rs[i][0]);
- if(keywordtype.indexOf(rs[i][1])<0)
- keywordtype.append("#"+rs[i][1]);
- }
- }
-
- if(keyword!=null)
- {
- indexInfo.group_keyword[0]=(indexInfo.group_keyword[0]==null?
- "":indexInfo.group_keyword[0])+keyword.toString();
- indexInfo.group_keyword[1]=(
- indexInfo.group_keyword[1]==null?"":indexInfo.group_keyword[1]
- )+keywordtype.toString();
- }
-
- }
- /**
- * 查看这个链接是否包含关键字
- * @param content
- * @return
- */
- private boolean constainsKeyWord(String content){
- content=content.replace("\\s", "");
- if(rs==null)
- try {
- rs= halleyCurrencyDao.opensExcuteSQL("select t2.keyword,t3.keyword from CONSENSUSTEAM_KEY t1 LEFT JOIN CONSENSUSKEYWORD t2 on t1.keyid=T2.id left join CONSENSUSKEYWORDTEAM t3 on t1.teamid=t3.teamid");
- } catch (IOException e) {
- e.printStackTrace();
- }
- for(int i=0;i<rs.length;i++){
- if(content.contains(rs[i][0]))
- return true;
- }
- return false;
- }
-
/**
 * Audits a URL: re-fetches it and force-indexes it with the audit flag
 * set (bypassing the content-hash check).
 *
 * @param url page to audit
 * @throws IOException on fetch or index failure
 */
public void shenHe(String url) throws IOException{

    logger.debug("审核"+url);
    pageContent(url,true);
}
- /**
- * 根据ID审核
- * @param id
- * @throws IOException
- */
- public void shenHeById(String id) throws IOException{
-
- IndexSearcher indexSearcher=new IndexSearcher(directory);
- TermQuery query=new TermQuery(new Term("id", id));
-
- TopDocs docs= indexSearcher.search(query, 1);
-
- Document document2=null;
- ScoreDoc[] docss=docs.scoreDocs;
- if(docs!=null&&docss.length>0){
- for (int i = 0; i < docss.length; i++) {
- document2=indexSearcher.doc(docss[i].doc);
- }
- }
- if(document2!=null){
- shenHe(document2.getField("url").stringValue());
-
- }
-
- }
-
- /**
- * 查询 url是否已经被索引了
- * 只能保证一天内的链接不被重复索引
- * @param url
- * @return
- * @throws IOException
- */
- private boolean isIndexed(String url) throws IOException{
-
-
- if(!directory.fileExists("segments.gen"))
- return false;
-
- IndexSearcher indexSearcher=new IndexSearcher(directory);
- TermQuery query=new TermQuery(new Term("md5code", md5(url)));
- BooleanQuery booleanQuery=new BooleanQuery();
- booleanQuery.add(query,Occur.MUST);
- booleanQuery.add(
- new RangeQuery(
- new Term("ctime",yesday.getTime()+""),
- new Term("ctime",today.getTime()+""),
- true),
- Occur.MUST );
- //保证相同的链接一天只能抓取一次
- TopDocs docs= indexSearcher.search(booleanQuery, 1);
- indexSearcher.close();
-
- if(docs.scoreDocs.length>0){
- logger.debug("已结被索引了"+url);
- }
-
- return docs.scoreDocs.length>0?true:false;
- }
-
- /**
- * 判断页面是不是最新的
- * @param md5code
- * @param contentMD5
- * @return
- * @throws IOException
- */
- private boolean isUpdatePage(String md5code,String contentMD5) throws IOException{
-
- if(!directory.fileExists("segments.gen"))
- return true;
-
- IndexSearcher indexSearcher=new IndexSearcher(directory);
- TermQuery query=new TermQuery(new Term("md5code", md5code));
- BooleanQuery booleanQuery=new BooleanQuery();
- booleanQuery.add(query,Occur.MUST);
-
- TopDocs docs=indexSearcher.search(query, 1);
- for(ScoreDoc scoredoc:docs.scoreDocs){
- Document document=indexSearcher.doc(scoredoc.doc);
-
- String store=document.get("store");
- if("f".equals(store))
- return false;
-
- String contenthash=document.get("md5contentcode");
- if(contentMD5.equals(contenthash))
- {
- logger.debug("页面内容没有变化"+document.get("url"));
- return false;
- }
- }
-
- return true;
- }
-
- /**
- *
- * @param datas 关键字 时间范围 分组
- * @return
- * @throws IOException
- */
- public LucenePage search(LucenePage page,String... datas) throws IOException,RemoteException{
-
- IndexSearcher indexSearcher=new IndexSearcher(directory);
-
- BooleanQuery booleanQuery=new BooleanQuery();
-
- Filter filter=null;
-
- int start=0;
- int end=10;
-
- //分页
- if(page!=null){
- start=(page.getPageNo()-1)*page.getPageSize();
- end=start+page.getPageSize();
- }
- if(datas[0]!=null&&"1".equals(datas[0])){
- booleanQuery.add(new TermQuery(new Term("shenhe","1")),Occur.MUST);//查询审核过的
- }
-
- if(datas[1]!=null
- &&!"".equals(datas[1].trim())){
-
- char chars[]=datas[1].toCharArray();
- if(datas[1].length()>2){
- BooleanQuery booleanQuery2=new BooleanQuery();
- for(int i=0;i<chars.length-1;i++){
- booleanQuery2.add(new TermQuery(new Term("content",new String(chars, i,2))), Occur.MUST);
- }
- booleanQuery.add(booleanQuery2, Occur.MUST);
- }else
- booleanQuery.add(new FuzzyQuery(new Term("content", datas[1])),Occur.MUST);
- }
- if(datas.length>2&&datas[2]!=null&&datas[3]!=null){
-
- if(!"".equals(datas[2])&&!"".equals(datas[3]))
- {
- try {
- filter=new RangeFilter("ctime",
- ""+dateFormat.parse(datas[2]).getTime(),
- ""+dateFormat.parse(datas[3]).getTime(),
- true, true);
- } catch (java.text.ParseException e) {
- e.printStackTrace();
- }
- }
-
- }
- if(datas.length>3&&datas[4]!=null){
- try {
- booleanQuery.add(new QueryParser("group",analyzer).parse(datas[4]),Occur.MUST);
- } catch (ParseException e) {
- e.printStackTrace();
- }
- }
-
- List<String[]> results=new ArrayList<String[]>();
-
- booleanQuery.add(new TermQuery(new Term("store", "t")),Occur.MUST);
-
- Sort sort=new Sort(new SortField[]{
- new SortField("ctime",SortField.LONG,true),
- new SortField("content",SortField.SCORE),
- new SortField("score",SortField.INT,true),
- });
- TopFieldDocs topFieldDocs=indexSearcher.search(booleanQuery, filter, maxresult, sort);
- page.setTotal(topFieldDocs.totalHits);
- if(topFieldDocs!=null&&topFieldDocs.scoreDocs.length>0){
- for(int i=start;i<end&&i<topFieldDocs.scoreDocs.length;i++){
-
- ScoreDoc scoreDoc=topFieldDocs.scoreDocs[i];
- Document document=indexSearcher.doc(scoreDoc.doc);
-
- String[] one_result=new String[8];
- results.add(one_result);
- one_result[0]=document.get("url");
- one_result[1]=document.get("title");
- Date date=new Date();
- date.setTime(Long.parseLong(document.get("ctime")));
- one_result[2]=dateFormat.format(date);
- one_result[3]=document.get("id");
- one_result[4]=document.get("shenhe");
- one_result[5]=document.get("group");
- one_result[6]=document.get("keyword");
- one_result[7]=document.get("score");
- }
-
- }
- page.setResult(results);
- indexSearcher.close();
- return page;
- }
-
- @Override
- public List<Map<String, Object>> getLuceneData(String ids)
- throws RemoteException,IOException {
-
- List<Map<String, Object>> dataMap=new ArrayList<Map<String,Object>>();
- if(!directory.fileExists("segments.gen"))
- return dataMap;
-
- String id_s[]=null;
- if(ids!=null){
- id_s=ids.split(",");
- }
-
- if(id_s!=null){
-
- BooleanQuery booleanQuery=new BooleanQuery();
- IndexSearcher indexSearcher=new IndexSearcher(directory);
- for(int i=0;i<id_s.length;i++){
- booleanQuery.add(new TermQuery(new Term("id",id_s[i])),Occur.SHOULD);
- }
- TopDocs docs=indexSearcher.search(booleanQuery,id_s.length);
- for(ScoreDoc scoredoc:docs.scoreDocs){
- Document document=indexSearcher.doc(scoredoc.doc);
- Map<String, Object> map=new HashMap<String, Object>();
- dataMap.add(map);
- map.put("atitle", document.get("title"));
- map.put("url", document.get("url"));
- Date date=new Date();
- long t=Long.parseLong(document.get("ctime"));
- date.setTime(t);
- map.put("adate", dateFormat.format(date) );
- map.put("fromarea", document.get("title"));
-
- }
- indexSearcher.close();
- }
- return dataMap;
-
- }
-
- public String processUrl(String url,String parenturl){
-
- //取出主机地址
- Pattern pattern_host=Pattern.compile("http://\\w+\\.\\w+\\.\\w+");
-
- Pattern pattern=Pattern.compile("^\\w+/\\w+/");
- Pattern pattern2=Pattern.compile("^\\.\\./\\w+");
- Matcher matcher_host=pattern_host.matcher(parenturl);
- Matcher matcher=pattern.matcher(url);
- String host=null;
-
- if(parenturl.endsWith("/"))
- parenturl=parenturl.substring(0,parenturl.length()-1);
- if(url.endsWith("/"))
- url=url.substring(0,url.length()-1);
- if(matcher_host.find())
- host=matcher_host.group();
-
- if(host==null)
- return url;
-
- if(url.startsWith("/"))
- return host+url.substring(1);
-
- else if(url.startsWith("./"))
- return parenturl+"/"+url.substring(1);
-
- else if(matcher.find()||
- url.startsWith("../")){
-
- Matcher matcher2=pattern2.matcher(url);
- while(matcher2.find()){
- url=url.substring(3);
- parenturl=parenturl.substring(0, parenturl.lastIndexOf("/")-1);
- parenturl=parenturl.substring(0, parenturl.lastIndexOf("/"));
- }
-
- return parenturl+"/"+url;
- }
- if(url.startsWith("#")||url.startsWith("javascript"))
- return parenturl;
- return url;
- }
-
- /**
- * MD5
- * @param plainText
- * @return
- */
- public static String md5(String plainText) {
- try {
- MessageDigest md = MessageDigest.getInstance("MD5");
- md.update(plainText.getBytes());
- byte b[] = md.digest();
-
- int i;
-
- StringBuffer buf = new StringBuffer();
- for (int offset = 0; offset < b.length; offset++) {
- i = b[offset];
- if (i < 0)
- i += 256;
- if (i < 16)
- buf.append("0");
- buf.append(Integer.toHexString(i));
- }
-
- return buf.toString().substring(8, 24);
- } catch (NoSuchAlgorithmException e) {
- e.printStackTrace();
- }
- return null;
- }
-
/** @return the DAO used for database access (seed URLs, keyword table) */
public IListInfoDAO getHalleyCurrencyDao() {
    return halleyCurrencyDao;
}

/** Injects the DAO used for database access. */
public void setHalleyCurrencyDao(IListInfoDAO halleyCurrencyDao) {
    this.halleyCurrencyDao = halleyCurrencyDao;
}
/**
 * Mutable carrier for everything collected about one crawled page.
 */
public static class IndexInfo{
    // Accumulated relevance score (title hit 10, meta 5, body 1).
    private int score;
    // Which part of the page `string` currently holds (pagetype_* constant).
    private int type;
    // Text currently being scanned; later reused for the full page text.
    private String string;
    private String url;
    private String title;
    // '#'-separated keyword groups matched on this page.
    private String group;
    // '#'-separated keywords matched on this page.
    private String keyword;
    // Audit flag: true when indexing was forced by shenHe().
    private boolean shenhe;
    // Whether the page matched any keyword and should be fully stored.
    private boolean store;
    // [0] = matched keywords, [1] = their groups (both '#'-separated).
    private String[] group_keyword;

}
-
- public static void main(String[] args){
- Logger logger=Logger.getLogger("");
- logger.info("nihao");
- }
/**
 * Remote audit entry point; delegates to {@link #shenHeById(String)}.
 *
 * @param id the stored document id to re-audit
 */
@Override
public void shenhe(String id) throws IOException, RemoteException {
    shenHeById(id);
}
-
- }
复制代码 |
|