医疗相关信息的抓取
这完完全全是我自己一个人写的,整个都写在一个类里。对获取最新页面的方式使用的是 hash 对比,这个有时候准确,有时候就不行。一般情况效率还是很好的。代码:

package com.huntto.nbinf.dao.impl;
import java.io.File;
import java.io.IOException;
import java.lang.ref.SoftReference;
import java.net.MalformedURLException;
import java.rmi.Naming;
import java.rmi.RemoteException;
import java.rmi.registry.LocateRegistry;
import java.rmi.server.UnicastRemoteObject;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.paoding.analysis.analyzer.PaodingAnalyzer;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.RangeFilter;
import org.apache.lucene.search.RangeQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopFieldDocs;
import org.apache.lucene.store.FSDirectory;
import org.jsoup.Connection;
import org.jsoup.helper.HttpConnection;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.huntto.nbinf.dao.IForQtSearch;
import com.huntto.nbinf.dao.IListInfoDAO;
import com.huntto.nbinf.pojo.LucenePage;
/**
 * Crawls pages from configured seed sites, scores them against a DB-provided
 * keyword table, and maintains a Lucene full-text index. Published to clients
 * over RMI (see constructor).
 *
 * NOTE(review): this file appears to be recovered from a forum paste; array
 * subscripts (e.g. "rs[i][0]") were stripped in several methods — verify
 * reconstructions against version control.
 */
public class LucenService extends UnicastRemoteObject implements IForQtSearch {
File filestack,fileindex;
//URLs of the layer currently being crawled
Stack<String> stackcurt=new Stack<String>();
//URLs discovered for the next layer
Stack<String> stacknext=new Stack<String>();
Logger logger=Logger.getLogger("lucene");
//maximum crawl depth (loaded from properties in the constructor)
final int deep;
//depth of the layer currently being processed
private int curtdeep=1;
//maximum number of hits returned by a search
final int maxresult=1024;
//keyword matched in the page title
final int pagetype_title=1;
//keyword matched in a meta tag
final int pagetype_meta=2;
//keyword matched in the body text
final int pagetype_body=3;
//spill the pending-URL stack to a file once it exceeds this size
final int max_to_writefile=10;
//date helper; NOTE(review): SimpleDateFormat is not thread-safe — confirm single-threaded use
final SimpleDateFormat dateFormat=new SimpleDateFormat("yyyy-MM-dd");
//Paoding Chinese analyzer, used for both indexing and query parsing
private Analyzer analyzer=new PaodingAnalyzer();
//bookkeeping for swapping the URL queue between memory and file
private LuceneStackControll controll=new LuceneStackControll();
//file helper: the URL queue may grow very large, so overflow is kept on disk
private FileControll fileControll=new FileControll();
//lucene index directory
private FSDirectory directory;
//lucene index writer
IndexWriter indexWriter;
Date yesday,today;
//rs: (keyword, group) rows loaded lazily from the database
String urls[],websites[][],rs[][];
SoftReference<String[][]> softReference=new SoftReference<String[][]>(websites);
final String userAgent="Mozilla/5.0 (Windows NT 6.1; WOW64) "
+ "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.101 Safari/537.36";
protected IListInfoDAO halleyCurrencyDao;
/**
 * Loads configuration from paoding-dic-names.properties, opens the Lucene
 * index directory/writer, and publishes this service in an RMI registry.
 *
 * @throws RemoteException if RMI export fails
 */
public LucenService() throws RemoteException{
    Properties properties=new Properties();
    int prodeep=5;
    int luceneport=8999;
    String lucenename="lucenesearchs";
    File dirc=null;
    try {
        properties.load(LucenService.class.getClassLoader().
                getResourceAsStream("paoding-dic-names.properties"));
        dirc=new File(properties.getProperty("dirc"));
        directory=FSDirectory.getDirectory(dirc);
        //clear a stale write lock possibly left by a previous crash
        IndexWriter.unlock(directory);
        indexWriter=new IndexWriter(directory,analyzer,MaxFieldLength.UNLIMITED);
        prodeep=Integer.parseInt(properties.getProperty("deep"));
        luceneport=Integer.parseInt(properties.getProperty("luceneport"));
        lucenename=properties.getProperty("lucenename");
    } catch (IOException e1) {
        //fall back to the defaults declared above
        e1.printStackTrace();
    }
    deep=prodeep;
    if(dirc==null)
        dirc=new File("c:/dirc");
    //BUGFIX: the registry port was hard-coded to 8999 while the bind URL used
    //the configured port, so a non-default "luceneport" could never be reached
    LocateRegistry.createRegistry(luceneport);
    try {
        Naming.rebind("rmi://localhost:"+luceneport+"/"+lucenename, this);
    } catch (MalformedURLException e) {
        e.printStackTrace();
    }
}
/**
* 开始工作
*/
public void wok(){
logger.debug("开始工作");
if(websites==null||websites.length==0){
try {
//只从指定的网站开始抓取
websites=halleyCurrencyDao.opensExcuteSQL("select MONITORURL from CONSENSUSINTERNET where rownum<10");
} catch (IOException e) {
e.printStackTrace();
}
}
urls=new String;
int j=0;
if(websites!=null){
for(int i=0;i<websites.length;i++){
if(i%10==0&&i!=0){
j=0;
startWork(urls);
}
urls=websites;
}
if(j!=0){
startWork(urls);
}
}
}
/** Refreshes the yesterday/today timestamps that bound the daily re-crawl window. */
private void initdate(){
    today=new Date();
    Calendar cal=Calendar.getInstance();
    cal.setTime(today);
    cal.add(Calendar.DATE,-1);
    yesday=cal.getTime();
}
/**
 * Crawls breadth-first starting from the given seed URLs.
 * When stackcurt drains, the next layer is pulled from stacknext or from the
 * overflow file; when neither yields URLs the crawl loop ends.
 *
 * @param urls seed URLs for depth 1
 */
public void startWork(String...urls){
    curtdeep=1;
    initdate();
    //BUGFIX: the paste had "push(urls)" (the whole array); push each URL
    for (int i = 0; i < urls.length; i++)
        stackcurt.push(urls[i]);
    controll.fin=false;
    while(true)
    {
        if(stackcurt.isEmpty()){
            //file mode exhausted: reset and remove the consumed overflow file
            if(controll.fin
                    &&controll.sygs<=0
                    &&controll.curtindex>0){
                controll.fin=false;
                //NOTE(review): the duplicated delete() looks like a paste
                //artifact — confirm the intended statement
                if(controll.indata.delete())
                    controll.indata.delete();
            }
            getCurtStack();
        }
        if(stackcurt.isEmpty())break;
        try {
            pageContent(stackcurt.pop(),false);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    curtdeep=1;
}
/**
 * Fetches one page, scores its title/meta/body against the keyword table
 * (via fenZhu), indexes it when its content hash changed, and — while the
 * crawl depth allows — queues relevant links for the next layer.
 *
 * @param url    page to fetch
 * @param shenhe true to mark the page as audited and force re-indexing
 * @throws IOException on fetch or index failure
 */
public void pageContent(String url,boolean shenhe) throws IOException{
    Connection connection=HttpConnection.connect(url);
    connection.ignoreContentType(true);
    connection.ignoreHttpErrors(true);
    connection.userAgent(userAgent);
    org.jsoup.nodes.Document document= connection.get();
    IndexInfo indexInfo=new IndexInfo();
    Elements elements=document.getElementsByTag("meta");
    //score the title first (highest weight in fenZhu)
    indexInfo.string=document.title();
    indexInfo.type=pagetype_title;
    fenZhu(indexInfo);
    //then the Keywords/Description meta tags
    Iterator<Element> iterator=elements.iterator();
    indexInfo.type=pagetype_meta;
    while(iterator.hasNext()){
        Element element=iterator.next();
        if("Keywords".equalsIgnoreCase(element.attr("name"))
                ||"Description".equalsIgnoreCase(element.attr("name"))){
            indexInfo.string=element.attr("content");
            fenZhu(indexInfo);
        }
    }
    //finally the body text (lowest weight)
    elements=document.getElementsByTag("body");
    if(elements.size()>0)
    {
        indexInfo.type=pagetype_body;
        indexInfo.string=elements.get(0).text();
        fenZhu(indexInfo);
    }
    indexInfo.url=url;
    if(indexInfo.group_keyword!=null){
        indexInfo.store=true;
        //BUGFIX(review): subscripts lost in the paste — group_keyword[0]
        //accumulates the matched keywords, [1] the groups (see fenZhu); verify
        indexInfo.group=indexInfo.group_keyword[1];
        indexInfo.keyword=indexInfo.group_keyword[0];
        indexInfo.string=document.text();
        indexInfo.shenhe=shenhe;
        indexInfo.title=document.title();
    }else
        indexInfo.store=false;
    //index only when the content hash changed, unless auditing forces it
    if(isUpdatePage(md5(indexInfo.url), md5(indexInfo.string))||shenhe)
        indexPageContent(indexInfo);
    //queue relevant, not-yet-indexed links for the next layer
    if(curtdeep<this.deep){
        elements=document.getElementsByTag("a");
        iterator=elements.iterator();
        while(iterator.hasNext()){
            Element element=iterator.next();
            String attrurl=element.attr("href");
            attrurl=processUrl(attrurl, url);
            try {
                if((!isIndexed(attrurl))&&
                        linkXg(element))
                    if(!stacknext.contains(attrurl))
                    {
                        stacknext.push(attrurl);
                        System.out.println(attrurl+"添加到队列");
                        //spill to file when the in-memory queue grows too large
                        if(stacknext.size()>max_to_writefile)
                            controll.filecount+=writeToFile();
                    }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
/**
 * Decides whether a link looks relevant: its own text — or, failing that,
 * its parent element's text (unless the parent is the body) — must contain
 * a configured keyword.
 *
 * @param link anchor element under test
 * @return true when the link or its parent text contains a keyword
 */
public boolean linkXg(Element link){
    if(constainsKeyWord(link.text()))
        return true;
    Element parent=link.parent();
    if("body".equalsIgnoreCase(parent.tagName()))
        return false;
    return constainsKeyWord(parent.text());
}
/**
 * Refills stackcurt when the current layer is exhausted: either promotes
 * stacknext to be the new layer (incrementing the depth), or — once file
 * mode ("fin") is active — reads the next batch of URLs from the overflow
 * file and advances the read cursor.
 */
private void getCurtStack(){
//start of a new crawl layer
if(!controll.fin)
{
logger.debug("从stacknext获得url");
curtdeep++;
stackcurt=stacknext;
stacknext=new Stack<String>();
//if many URLs were spilled to disk, switch to reading them from the file
if(controll.filecount>max_to_writefile)
{
controll.fin=true;
controll.swapInOut();
}
}
else
{
logger.debug("从文件"+controll.indata.getName()+"获得url");
//read the next batch and advance the cursor by however many we got
stackcurt=readStackFile(controll.curtindex,max_to_writefile);
controll.curtindex+=stackcurt.size();
}
}
/**
 * Spills the in-memory stacknext queue to the overflow file, then clears the
 * queue on success.
 *
 * NOTE(review): controll.outdata is passed twice to writeStringArr — this
 * looks like a paste artifact or a bug; confirm the intended second argument.
 *
 * @return number of URLs written (0 on failure)
 */
private int writeToFile(){
int count=0;
try {
count=fileControll.writeStringArr(stacknext.toArray(new String[]{}),
controll.outdata,
controll.outdata);
logger.debug("将多余的url写入到文件"+controll.outdata.getName());
} catch (IOException e) {
e.printStackTrace();
}
//only discard the in-memory queue when the write actually succeeded
if(count>0)
stacknext=new Stack<String>();
return count;
}
/**
 * Reads a batch of URLs from the overflow file into a fresh stack.
 *
 * @param n     index of the first entry to read
 * @param total maximum number of entries to read
 * @return stack of URLs, or null when the read failed
 */
public Stack<String> readStackFile(int n,int total){
    String[] strings=null;
    Stack<String> stack=null;
    try {
        strings=fileControll.readStringArr(
                controll,
                n, total);
    } catch (IOException e) {
        e.printStackTrace();
    }
    if(strings!=null){
        stack=new Stack<String>();
        for(int i=0;i<strings.length;i++){
            //BUGFIX: subscript lost in paste — push each entry, not the array
            stack.push(strings[i]);
        }
    }
    return stack;
}
/**
 * Writes (or replaces, keyed by the URL's md5) one Lucene document.
 * Pages without keyword hits are stored as a stub with store="f" so they can
 * be recognized later without re-storing their content.
 *
 * NOTE(review): commit()+optimize() after every single document is very
 * expensive; consider batching commits — left unchanged to preserve behavior.
 *
 * @param indexInfo page data collected by pageContent/fenZhu
 * @throws IOException on index failure
 */
private void indexPageContent(IndexInfo indexInfo) throws IOException{
Document document=new Document();
document.add(new Field("url",indexInfo.url,Store.YES,Index.ANALYZED));
//md5 of the URL is the update key; "id" adds a timestamp for uniqueness
document.add(new Field("md5code", md5(indexInfo.url),Store.YES,Index.ANALYZED));
document.add(new Field("id", md5(indexInfo.url)+System.currentTimeMillis(),Store.YES,Index.ANALYZED));
if(indexInfo.store){
if(today==null)
initdate();
System.out.println("存储"+indexInfo.url+"关键字:"+indexInfo.keyword+"\t"+indexInfo.title);
logger.debug("存储url"+indexInfo.url);
document.add(new Field("title", indexInfo.title,Store.YES,Index.ANALYZED));
document.add(new Field("store", "t",Store.YES,Index.ANALYZED));
//content is indexed for search but not stored
document.add(new Field("content",indexInfo.string,Store.NO,Index.ANALYZED));
document.add(new Field("shenhe",indexInfo.shenhe?"1":"0",Store.YES,Index.ANALYZED));
document.add(new Field("group", indexInfo.group, Store.YES,Index.ANALYZED));
document.add(new Field("keyword", indexInfo.keyword, Store.YES,Index.ANALYZED));
document.add(new Field("score", ""+indexInfo.score, Store.YES,Index.ANALYZED));
document.add(new Field("ctime",today.getTime()+"" ,Store.YES,Index.ANALYZED));
//content hash used by isUpdatePage to detect changes
document.add(new Field("md5contentcode", md5(indexInfo.string),Store.YES,Index.ANALYZED));
}else
document.add(new Field("store", "f",Store.YES,Index.ANALYZED));
indexWriter.updateDocument(new Term("md5code", md5(indexInfo.url)),document);
indexWriter.commit();
indexWriter.optimize();
}
/**
* 关键字分组
* @param content
* @return
* @throws IOException
*/
private void fenZhu(IndexInfo indexInfo) throws IOException{
indexInfo.string=indexInfo.string.replace("\\s", "");
if(rs==null)
//根据页面的信息判断是不是应该
rs= halleyCurrencyDao.opensExcuteSQL("select t2.keyword,t3.keyword from CONSENSUSTEAM_KEY t1 LEFT JOINCONSENSUSKEYWORD t2 on t1.keyid=T2.id left join CONSENSUSKEYWORDTEAM t3 on t1.teamid=t3.teamid");
StringBuilder keyword=null;
StringBuilder keywordtype=null;
for(int i=0;i<rs.length;i++){
if(indexInfo.string.contains(rs))
{
switch(indexInfo.type){
case pagetype_title:
indexInfo.score+=10;
break;
case pagetype_meta:
indexInfo.score+=5;
break;
case pagetype_body:
indexInfo.score+=1;
break;
}
if(indexInfo.group_keyword==null)
indexInfo.group_keyword=new String;
else if(indexInfo.group_keyword!=null){
if(indexInfo.group_keyword.contains(rs))
continue;
if(indexInfo.group_keyword.contains(rs))
continue;
}
if(keyword==null){
keyword=new StringBuilder();
keywordtype=new StringBuilder();
}
if(keyword.indexOf(rs)<0)
keyword.append("#"+rs);
if(keywordtype.indexOf(rs)<0)
keywordtype.append("#"+rs);
}
}
if(keyword!=null)
{
indexInfo.group_keyword=(indexInfo.group_keyword==null?
"":indexInfo.group_keyword)+keyword.toString();
indexInfo.group_keyword=(
indexInfo.group_keyword==null?"":indexInfo.group_keyword
)+keywordtype.toString();
}
}
/**
 * Checks whether the given link text contains any configured keyword.
 * The keyword table is lazily loaded from the database on first use.
 *
 * @param content link (or parent-element) text
 * @return true when a keyword occurs in the whitespace-stripped text
 */
private boolean constainsKeyWord(String content){
    //BUGFIX: replace() treats "\\s" literally; replaceAll() applies the regex
    content=content.replaceAll("\\s", "");
    if(rs==null)
        try {
            //BUGFIX: the pasted SQL was missing the space in "LEFT JOIN CONSENSUSKEYWORD"
            rs= halleyCurrencyDao.opensExcuteSQL("select t2.keyword,t3.keyword from CONSENSUSTEAM_KEY t1 LEFT JOIN CONSENSUSKEYWORD t2 on t1.keyid=T2.id left join CONSENSUSKEYWORDTEAM t3 on t1.teamid=t3.teamid");
        } catch (IOException e) {
            e.printStackTrace();
        }
    //ROBUSTNESS: the DB call may have failed, leaving rs null
    if(rs==null)
        return false;
    for(int i=0;i<rs.length;i++){
        //NOTE(review): subscript lost in paste — assuming column 0 (the keyword)
        if(content.contains(rs[i][0]))
            return true;
    }
    return false;
}
/**
 * Audits the given URL: re-fetches it and indexes it with the audit flag set
 * (pageContent forces re-indexing when shenhe is true).
 *
 * @param url page URL to audit
 * @throws IOException on fetch or index failure
 */
public void shenHe(String url) throws IOException{
logger.debug("审核"+url);
pageContent(url,true);
}
/**
 * Audits the page whose stored Lucene "id" field matches, by looking up its
 * URL in the index and delegating to {@link #shenHe(String)}.
 *
 * @param id value of the stored "id" field
 * @throws IOException on index or fetch failure
 */
public void shenHeById(String id) throws IOException{
    IndexSearcher indexSearcher=new IndexSearcher(directory);
    TermQuery query=new TermQuery(new Term("id", id));
    TopDocs docs= indexSearcher.search(query, 1);
    Document document2=null;
    ScoreDoc[] docss=docs.scoreDocs;
    if(docs!=null&&docss.length>0){
        for (int i = 0; i < docss.length; i++) {
            //BUGFIX: subscript lost in paste — read the i-th hit
            document2=indexSearcher.doc(docss[i].doc);
        }
    }
    //RESOURCE FIX: the searcher was never closed
    indexSearcher.close();
    if(document2!=null){
        shenHe(document2.getField("url").stringValue());
    }
}
/**
 * Checks whether this URL was already indexed within the last day, so the
 * same link is fetched at most once per day.
 *
 * @param url candidate link
 * @return true when a document keyed by this URL's md5 exists with a ctime
 *         in [yesterday, today]
 * @throws IOException on index access failure
 */
private boolean isIndexed(String url) throws IOException{
    //no segments file means the index is still empty
    if(!directory.fileExists("segments.gen"))
        return false;
    //ROBUSTNESS: the date window may be unset when called outside startWork()
    if(yesday==null||today==null)
        initdate();
    IndexSearcher indexSearcher=new IndexSearcher(directory);
    TermQuery query=new TermQuery(new Term("md5code", md5(url)));
    BooleanQuery booleanQuery=new BooleanQuery();
    booleanQuery.add(query,Occur.MUST);
    booleanQuery.add(
            new RangeQuery(
                    new Term("ctime",yesday.getTime()+""),
                    new Term("ctime",today.getTime()+""),
                    true),
            Occur.MUST );
    //one hit is enough to prove the link was crawled today
    TopDocs docs= indexSearcher.search(booleanQuery, 1);
    indexSearcher.close();
    if(docs.scoreDocs.length>0){
        logger.debug("已结被索引了"+url);
    }
    //IDIOM: simplified from "length>0?true:false"
    return docs.scoreDocs.length>0;
}
/**
* 判断页面是不是最新的
* @param md5code
* @param contentMD5
* @return
* @throws IOException
*/
private boolean isUpdatePage(String md5code,String contentMD5) throwsIOException{
if(!directory.fileExists("segments.gen"))
return true;
IndexSearcher indexSearcher=new IndexSearcher(directory);
TermQuery query=new TermQuery(new Term("md5code", md5code));
BooleanQuery booleanQuery=new BooleanQuery();
booleanQuery.add(query,Occur.MUST);
TopDocs docs=indexSearcher.search(query, 1);
for(ScoreDoc scoredoc:docs.scoreDocs){
Document document=indexSearcher.doc(scoredoc.doc);
String store=document.get("store");
if("f".equals(store))
return false;
String contenthash=document.get("md5contentcode");
if(contentMD5.equals(contenthash))
{
logger.debug("页面内容没有变化"+document.get("url"));
return false;
}
}
return true;
}
/**
*
* @param datas 关键字时间范围分组
* @return
* @throws IOException
*/
public LucenePagesearch(LucenePage page,String... datas) throws IOException,RemoteException{
IndexSearcher indexSearcher=new IndexSearcher(directory);
BooleanQuery booleanQuery=new BooleanQuery();
Filter filter=null;
int start=0;
int end=10;
//分页
if(page!=null){
start=(page.getPageNo()-1)*page.getPageSize();
end=start+page.getPageSize();
}
if(datas!=null&&"1".equals(datas)){
booleanQuery.add(new TermQuery(new Term("shenhe","1")),Occur.MUST);//查询审核过的
}
if(datas!=null
&&!"".equals(datas.trim())){
char chars[]=datas.toCharArray();
if(datas.length()>2){
BooleanQuery booleanQuery2=new BooleanQuery();
for(int i=0;i<chars.length-1;i++){
booleanQuery2.add(new TermQuery(new Term("content",new String(chars, i,2))), Occur.MUST);
}
booleanQuery.add(booleanQuery2, Occur.MUST);
}else
booleanQuery.add(new FuzzyQuery(new Term("content", datas)),Occur.MUST);
}
if(datas.length>2&&datas!=null&&datas!=null){
if(!"".equals(datas)&&!"".equals(datas))
{
try {
filter=new RangeFilter("ctime",
""+dateFormat.parse(datas).getTime(),
""+dateFormat.parse(datas).getTime(),
true, true);
} catch (java.text.ParseException e) {
e.printStackTrace();
}
}
}
if(datas.length>3&&datas!=null){
try {
booleanQuery.add(new QueryParser("group",analyzer).parse(datas),Occur.MUST);
} catch (ParseException e) {
e.printStackTrace();
}
}
List<String[]>results=new ArrayList<String[]>();
booleanQuery.add(new TermQuery(new Term("store", "t")),Occur.MUST);
Sort sort=new Sort(new SortField[]{
new SortField("ctime",SortField.LONG,true),
new SortField("content",SortField.SCORE),
new SortField("score",SortField.INT,true),
});
TopFieldDocs topFieldDocs=indexSearcher.search(booleanQuery, filter, maxresult, sort);
page.setTotal(topFieldDocs.totalHits);
if(topFieldDocs!=null&&topFieldDocs.scoreDocs.length>0){
for(int i=start;i<end&&i<topFieldDocs.scoreDocs.length;i++){
ScoreDoc scoreDoc=topFieldDocs.scoreDocs;
Document document=indexSearcher.doc(scoreDoc.doc);
String[] one_result=new String;
results.add(one_result);
one_result=document.get("url");
one_result=document.get("title");
Date date=new Date();
date.setTime(Long.parseLong(document.get("ctime")));
one_result=dateFormat.format(date);
one_result=document.get("id");
one_result=document.get("shenhe");
one_result=document.get("group");
one_result=document.get("keyword");
one_result=document.get("score");
}
}
page.setResult(results);
indexSearcher.close();
return page;
}
/**
 * Fetches stored fields (title/url/date) for a comma-separated list of
 * document ids, for display to RMI clients.
 *
 * @param ids comma-separated "id" field values; may be null
 * @return one map per found document (empty list when the index is empty)
 */
@Override
public List<Map<String, Object>> getLuceneData(String ids)
        throws RemoteException,IOException {
    List<Map<String, Object>> dataMap=new ArrayList<Map<String,Object>>();
    //no segments file means the index is still empty
    if(!directory.fileExists("segments.gen"))
        return dataMap;
    String id_s[]=null;
    if(ids!=null){
        id_s=ids.split(",");
    }
    if(id_s!=null){
        BooleanQuery booleanQuery=new BooleanQuery();
        IndexSearcher indexSearcher=new IndexSearcher(directory);
        for(int i=0;i<id_s.length;i++){
            //BUGFIX: subscript lost in paste — add one clause per id
            booleanQuery.add(new TermQuery(new Term("id",id_s[i])),Occur.SHOULD);
        }
        TopDocs docs=indexSearcher.search(booleanQuery,id_s.length);
        for(ScoreDoc scoredoc:docs.scoreDocs){
            Document document=indexSearcher.doc(scoredoc.doc);
            Map<String, Object> map=new HashMap<String, Object>();
            dataMap.add(map);
            map.put("atitle", document.get("title"));
            map.put("url",document.get("url"));
            Date date=new Date();
            long t=Long.parseLong(document.get("ctime"));
            date.setTime(t);
            map.put("adate", dateFormat.format(date) );
            //NOTE(review): "fromarea" is filled with the title — confirm intent
            map.put("fromarea",document.get("title"));
        }
        indexSearcher.close();
    }
    return dataMap;
}
/**
 * Resolves a (possibly relative) href against its parent page URL.
 * Handles host-absolute ("/x"), same-directory ("./x") and parent-directory
 * ("../x") forms; anchors and javascript: pseudo-links resolve to the parent.
 *
 * @param url       raw href attribute
 * @param parenturl URL of the page the link was found on
 * @return absolute URL (best effort; returns url unchanged when no host can
 *         be extracted from parenturl)
 */
public String processUrl(String url,String parenturl){
    //extract the host part of the parent URL
    Pattern pattern_host=Pattern.compile("http://\\w+\\.\\w+\\.\\w+");
    Pattern pattern=Pattern.compile("^\\w+/\\w+/");
    Pattern pattern2=Pattern.compile("^\\.\\./\\w+");
    Matcher matcher_host=pattern_host.matcher(parenturl);
    Matcher matcher=pattern.matcher(url);
    String host=null;
    if(parenturl.endsWith("/"))
        parenturl=parenturl.substring(0,parenturl.length()-1);
    if(url.endsWith("/"))
        url=url.substring(0,url.length()-1);
    if(matcher_host.find())
        host=matcher_host.group();
    if(host==null)
        return url;
    if(url.startsWith("/"))
        //BUGFIX: was host+url.substring(1), which glued the path onto the
        //host without a separator ("http://a.b.coma/x")
        return host+url;
    else if(url.startsWith("./"))
        //BUGFIX: was parenturl+"/"+url.substring(1) — substring(1) keeps the
        //"/" of "./", which produced a double slash
        return parenturl+url.substring(1);
    else if(matcher.find()||
            url.startsWith("../")){
        //BUGFIX: the old single matcher was ^-anchored, so find() could never
        //match a second time and only one "../" level was ever stripped
        while(pattern2.matcher(url).find()){
            url=url.substring(3);
            //NOTE(review): this strips two path segments per "../" — verify
            parenturl=parenturl.substring(0, parenturl.lastIndexOf("/")-1);
            parenturl=parenturl.substring(0, parenturl.lastIndexOf("/"));
        }
        return parenturl+"/"+url;
    }
    if(url.startsWith("#")||url.startsWith("javascript"))
        return parenturl;
    return url;
}
/**
* MD5
* @param plainText
* @return
*/
public static String md5(String plainText) {
try {
MessageDigest md = MessageDigest.getInstance("MD5");
md.update(plainText.getBytes());
byte b[] = md.digest();
int i;
StringBuffer buf = new StringBuffer();
for (int offset = 0; offset < b.length; offset++) {
i = b;
if (i < 0)
i += 256;
if (i < 16)
buf.append("0");
buf.append(Integer.toHexString(i));
}
return buf.toString().substring(8, 24);
} catch (NoSuchAlgorithmException e) {
e.printStackTrace();
}
returnnull;
}
/** @return the DAO used for seed-URL and keyword queries */
public IListInfoDAO getHalleyCurrencyDao() {
return halleyCurrencyDao;
}
/** Setter for the DAO — presumably injected by the container; confirm wiring. */
public void setHalleyCurrencyDao(IListInfoDAO halleyCurrencyDao) {
this.halleyCurrencyDao = halleyCurrencyDao;
}
/** Mutable carrier for one page's scoring and indexing state. */
public static class IndexInfo{
//accumulated keyword-match score (title hits weigh most)
private int score;
//where the current text came from: pagetype_title / pagetype_meta / pagetype_body
private int type;
//text currently being matched (later reused for the full page text)
private String string;
private String url;
private String title;
private String group;
private String keyword;
//audit flag
private boolean shenhe;
//true when at least one keyword matched and the page should be stored
private boolean store;
//matched keyword/group accumulation; presumably [0]=keywords, [1]=groups — TODO confirm
private String[] group_keyword;
}
/** Tiny manual smoke test for the log4j configuration. */
public static void main(String[] args){
    Logger rootLogger=Logger.getLogger("");
    rootLogger.info("nihao");
}
/**
 * RMI-facing audit entry point; delegates to {@link #shenHeById(String)}.
 *
 * @param id stored Lucene "id" field value of the page to audit
 */
@Override
public void shenhe(String id) throws IOException, RemoteException {
shenHeById(id);
}
}

楼主想要抓取什么?
页: [1]