运用Luence建索和检索的方法

喜糖

2013-08-24

关注关注

初学者还在进步勿批！

建索：

import java.io.File;

import java.text.SimpleDateFormat;

import java.util.ArrayList;

import java.util.Date;

import java.util.List;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

import com.jpsycn.kfwggl.common.tools.GetRootPath;

import com.jpsycn.kfwggl.common.tools.HandlerSummary;

import com.jpsycn.kfwggl.system.entity.ResultGetInfo;

public class CreateIndex {

//抓取到的页面存放的路径

//String filesPath="F:/kfwlyqTxtList";

//分词

private Analyzer analyzer = new StandardAnalyzer();

public void createIndex(List<ResultGetInfo> lt){

Date d=new Date();

String root=GetRootPath.getIndexesPath();

if(!new File(root).exists()){

new File(root).mkdir();

}

//创建的索引存放路径

String INDEXPATH=root+new SimpleDateFormat("yyyyMMddHHMMSS").format(d)+d.getTime();

System.out.println(INDEXPATH);

if(!new File(INDEXPATH).exists()){

new File(INDEXPATH).mkdir();

}

//获取存放索引的文件夹

try {

SimpleDateFormat ft=new SimpleDateFormat("yyyy-MM-dd");

Directory directory = FSDirectory.getDirectory(INDEXPATH);

IndexWriter indexWriter = new IndexWriter(directory, analyzer ,true, IndexWriter.MaxFieldLength.LIMITED);

long begin = new Date().getTime();

for(ResultGetInfo rg:lt){

//获取一个List<esultGetInfo>遍历里面的值建索

//其中红色titleResult 就是索引如字典中的索引蓝色就是你要建索的字符串

//Field.Store.YES 表示是否存储以后可以检索

//Field.Index.ANALYZED 表示是否分词

Document doc = new Document();

String titleResult=rg.getTitle()==null?"":rg.getTitleResult().trim();

String content =rg.getContent()==null?"":rg.getContent();

String link=rg.getLink()==null?"":rg.getLink().trim();

String releaseDate=rg.getReleaseDate()==null?"":ft.format(rg.getReleaseDate());

doc.add(new Field("titleResult", titleResult, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));

doc.add(new Field("content", content, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));

doc.add(new Field("link", link, Field.Store.YES,Field.Index.ANALYZED,Field.TermVector.YES));

doc.add(new Field("releaseDate", releaseDate, Field.Store.YES,Field.Index.ANALYZED,Field.TermVector.YES));

indexWriter.addDocument(doc);

}

long end = new Date().getTime();

System.out.println(">>> 1.存入索引完毕.. 共花费：" + (end - begin) +"毫秒...");

indexWriter.optimize();

indexWriter.close();

} catch (Exception e) {

e.printStackTrace();

}

检索：

import java.io.File;

import java.util.ArrayList;

import java.util.Date;

import java.util.HashMap;

import java.util.List;

import java.util.Map;

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.queryParser.MultiFieldQueryParser;

import org.apache.lucene.search.BooleanClause;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.search.ScoreDoc;

import org.apache.lucene.search.TopDocs;

public class GetResultInfo {

public static List<String> getResultInfos(String keyName,String INDEXPATH,String titleOrContent){

List<String> list=new ArrayList<String>();

Analyzer analyzer = new StandardAnalyzer();

//String titleResult="titleResult";

String link="link";

//String content=titleOrContent;

//String releaseDate="releaseDate";

//索引存放位置

try {

IndexSearcher indexSearcher = new IndexSearcher(INDEXPATH);

//System.out.println(">>> 2.开始读取索引... ... 通过关键字：【 "+ keyName +" 】");

BooleanClause.Occur[] clauses = { BooleanClause.Occur.SHOULD };

Query queryOBJ = MultiFieldQueryParser.parse(keyName, new String[]{titleOrContent}, clauses, analyzer);//parser.parse(query);

//Filter filter = null;

//################# 搜索相似度最高的记录 ###################

//TopDocs topDocs = indexSearcher.search(queryOBJ, filter, 1000);

TopDocs topDocs = indexSearcher.search(queryOBJ , 10000);

//System.out.println("*** 共匹配：" + topDocs.totalHits + "个 ***");

//ResultGetInfo rg = null;

//输出结果

for (ScoreDoc scoreDoc : topDocs.scoreDocs){

* 这里我就返回一个List<String>集合里面存放路径 url

* 这里的link是需要和建索的时候的 link对应的而且相同

Document targetDoc = indexSearcher.doc(scoreDoc.doc);

list.add(targetDoc.get(link).trim());

/*rg = new ResultGetInfo();

//注释掉的是关于高亮显示的部分获取到的是含有html标签的字符串需要你转换

//设置高亮显示格式

SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<font color='red'><strong>", "</strong></font>");

/* 语法高亮显示设置 */

/*Highlighter highlighter = new Highlighter(simpleHTMLFormatter,new QueryScorer(queryOBJ));

highlighter.setTextFragmenter(new SimpleFragmenter(100));

// 设置高亮设置 title,content 字段

/*String title = targetDoc.get("titleResult");

String contents = targetDoc.get("content");

TokenStream titleTokenStream = analyzer.tokenStream(titleResult,new StringReader(title));

TokenStream contentTokenStream = analyzer.tokenStream(content,new StringReader(contents));

String highLightTitle = highlighter.getBestFragment(titleTokenStream, title);

String highLightContent = highlighter.getBestFragment(contentTokenStream, contents);

if(highLightTitle == null){

highLightTitle = title;

}

if(highLightContent == null) {

highLightContent = content;

}

rg.setLink(targetDoc.get(link));

rg.setTitleResult(highLightTitle);

rg.setContent(highLightContent);

rg.setReleaseDate(new SimpleDateFormat("yyyy-MM-dd hh:mm:ss").parse(targetDoc.get(releaseDate)+" 00:00:00"));

list.add(rg);*/

}

indexSearcher.close();

return list;

} catch (Exception e) {

e.printStackTrace();

return null;

}

public static List<String> getDirPath(String path){

List<String> dirPaths=new ArrayList<String>();

File f=new File(path);

File files[]=f.listFiles();

if(files.length==0){

System.out.println("没有存放索引的文件夹");

}else{

for(int i=0;i<files.length;i++){

//检索每个存放索引的文件夹

dirPaths.add(files[i].getAbsolutePath());

}

return dirPaths;

}

public static Map<String,String> getInfos(String path,String str[],String titleOrContent){

long begin = new Date().getTime();

Map<String,String> map=new HashMap<String,String>();

//获取存放索引的所有文件夹

List<String> dirPaths=GetResultInfo.getDirPath(path);

for(int k=0;k<str.length;k++){

for(int i=0;i<dirPaths.size();i++){

List<String> infoList=GetResultInfo.getResultInfos(str[k],dirPaths.get(i),titleOrContent);

for(int j=0;j<infoList.size();j++){

map.put(infoList.get(j),infoList.get(j));

}

long end = new Date().getTime();

System.out.println(">>> 搜索完毕... ... 共花费：" + (end - begin) +"毫秒...");

System.out.println("一共检索到"+map.size()+"条");

return map;

}

这个例子是我对解析到的网页的路径、标题、内容、发布日期、来源进行建索

然后通过对内容的检索获取该网页的路径

apache simpledateformat lucene

安科网

运用Luence建索和检索的方法

喜糖

喜糖

相关推荐

MAC OS 10.15 Lucene 源码分析环境搭建

.NET Core下使用Kafka的方法步骤

解决PHPstudy Apache无法启动的问题【亲测有效】

Web安全：文件解析漏洞

终于有人把Nginx说清楚了，图文详解！

为什么Java仍将是未来的主流语言？

如何使用Apache Web服务器来安装和配置网站？

CentOS 8 Apache 安装后 SSL 重定向提示证书错误

如何使用 Apache Directory Studio 连接 JumpCloud

初学者和专业技术人员使用的十大机器学习软件

每个Java开发人员都应该知道的10大Github仓库

漫话：应用程序被拖慢？罪魁祸首竟然是Log4j！

JSP动态网页开发原理详解

centos8使用Apache httpd2.4.37安装web服务器的步骤详解

Tomcat启动springboot项目war包报错：启动子级时出错的问题

如何通过Apache在本地配置多个虚拟主机

Apache Shiro 反序列化(CVE-2016-4437)复现

Apache Shiro 反序列化(CVE-2016-4437)复现

Apache DolphinScheduler 诞生记

【Shiro】05 自定义Realm认证实现

喜糖