- 论坛徽章:
- 0
|
问题抽象描述:现有n个文本,已用CLucene建立索引;现不断有新的待查询文本到来,需要判断每个待查询文本是否为这n个已知文本中某一个的拷贝。
问题解决方法描述:循环用CLucene对每一个待查询文本建立索引(覆盖原有索引),从每个索引中取出每个项及其出现频率,利用直方图相交的方法计算查询文本与所有n个文本的相似值。
实现方法:
// Reader over the index of the known texts; assigned in init() and used by
// Detect() to look up postings and stored documents.
static IndexReader* r;
int DocNum;// number of known texts (n); set in init() and used as the element count of score
double *score;// similarity of the query text to each known text, indexed by document id
bool init()
{
Directory *dir,*dirq;
char directory[250]="E:\\863 Funding\\clucene-core-0.9.21\\indexfile";//已知的n个文本的索引
char directoryQ[250]="E:\\863 Funding\\clucene-core-0.9.21\\indexQfile";//待查询文本的索引
dir= lucene::store::FSDirectory::getDirectory(directory, false);
dirq= lucene::store::FSDirectory::getDirectory(directoryQ, false);
if(IndexReader::isLocked(directory))
IndexReader::unlock(dir);
if(IndexReader::isLocked(directoryQ))
IndexReader::unlock(dirq);
r = IndexReader: pen(directory);
DocNum=r->numDocs();
score=(double *)calloc(DocNum,sizeof(double));
return true;
}
void Detect(const char* directoryQ)//参数为待查询索引文件所在目录
{
IndexReader* q = IndexReader: pen(directoryQ);
//获取待查询索引文件的词频向量,该文件在索引时已有域:
//Field(_T("contents" ,"文本内容",Field::STORE_YES | Field::INDEX_TOKENIZED| Field::TERMVECTOR_YES),第三个参数表示为该域需要存储、分词和保存词向量
TermFreqVector *termFreqVector =q->getTermFreqVector(0,_T("contents" );
//assert(termFreqVector);
int32_t s=termFreqVector->size();
const TCHAR **tchar=termFreqVector->getTerms();
const Array<int32_t> *a=termFreqVector->getTermFrequencies();
int32_t nterms;
Document *docq=q->document(0);
int featNumQ=_ttoi(docq->get(_T("feaNum" ));
for (nterms = 0;nterms<s; nterms++)
{
Term *t=new Term(_T("contents" ,tchar[nterms]);
const int32_t freq=(*a)[nterms];
TermDocs *termDocs = r->termDocs(t);
delete t;
while(termDocs&&termDocs->next())
{
int DocId=termDocs->doc();
Document *doc=r->document(DocId);
const TCHAR *chFeat=doc->get(_T("feaNum" );
int featNum=_ttoi(chFeat);
int maxFeat=featNumQ>featNum?featNumQ:featNum;
score[DocId]+=((freq>termDocs->freq())?termDocs->freq():freq)/(double(maxFeat));
_CLDELETE(doc);
}
termDocs->close();
_CLDELETE(termDocs);
}
_CLDELETE(q);
_CLDELETE(docq);
memset(score,0,DocNum*sizeof(double));
}
// Entry point: initializes the known-text index once, then repeatedly indexes
// the query data directory and scores it against the known texts.
// Returns 1 if initialization fails. The detection loop has no exit condition,
// so the function does not return otherwise.
int main()
{
    if(!init())
        return 1;
    while(1)
    {
        char filesQ[250];
        char ndxQ[250];
        strcpy(filesQ,"E:\\863 Funding\\clucene-core-0.9.21\\dataQfile");
        strcpy(ndxQ,"E:\\863 Funding\\clucene-core-0.9.21\\indexQfile");
        Index(filesQ,ndxQ,true);// builds the index for the query files; implementation omitted in this listing
        Detect(ndxQ);
    }
}
|
|