运用Lucene建立索引和检索的方法
初学者 还在进步 勿批!
建立索引:
import java.io.File;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import com.jpsycn.kfwggl.common.tools.GetRootPath;
import com.jpsycn.kfwggl.common.tools.HandlerSummary;
import com.jpsycn.kfwggl.system.entity.ResultGetInfo;
/**
 * Builds a fresh Lucene index from a batch of crawled pages.
 *
 * <p>Every call to {@link #createIndex(List)} writes into a new, timestamp-named
 * folder underneath the directory returned by {@code GetRootPath.getIndexesPath()}.
 */
public class CreateIndex {

    /** Analyzer used to tokenize every analyzed field. */
    private Analyzer analyzer = new StandardAnalyzer();

    /**
     * Indexes each entry of {@code lt} as one Lucene document with the fields
     * {@code titleResult}, {@code content}, {@code link} and {@code releaseDate}.
     *
     * <p>Null property values are indexed as empty strings. Any failure is printed
     * to standard error and not rethrown; the writer and the directory are closed
     * in every case so the index lock is not left behind.
     *
     * @param lt the pages to index
     */
    public void createIndex(List<ResultGetInfo> lt) {
        Date now = new Date();
        String root = GetRootPath.getIndexesPath();
        File rootDir = new File(root);
        if (!rootDir.exists()) {
            // mkdirs also creates missing parent folders; mkdir would fail silently.
            rootDir.mkdirs();
        }

        // Folder that receives this run's index. The pattern must use "mm" (minutes)
        // and "ss" (seconds): "MM" is the month and "SS" is milliseconds.
        String indexPath = root + new SimpleDateFormat("yyyyMMddHHmmss").format(now) + now.getTime();
        System.out.println(indexPath);
        File indexDir = new File(indexPath);
        if (!indexDir.exists()) {
            indexDir.mkdirs();
        }

        try {
            SimpleDateFormat ft = new SimpleDateFormat("yyyy-MM-dd");
            Directory directory = FSDirectory.getDirectory(indexPath);
            try {
                IndexWriter indexWriter =
                        new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
                try {
                    long begin = new Date().getTime();
                    for (ResultGetInfo rg : lt) {
                        // Field(name, value, ...): the name is the key later used when
                        // searching, the value is the text being indexed.
                        // Field.Store.YES keeps the original value so it can be read back
                        // from a hit; Field.Index.ANALYZED runs the value through the analyzer.
                        Document doc = new Document();
                        // Null-check the same getter that is dereferenced.
                        String titleResult = rg.getTitleResult() == null ? "" : rg.getTitleResult().trim();
                        String content = rg.getContent() == null ? "" : rg.getContent();
                        String link = rg.getLink() == null ? "" : rg.getLink().trim();
                        String releaseDate = rg.getReleaseDate() == null ? "" : ft.format(rg.getReleaseDate());
                        doc.add(new Field("titleResult", titleResult, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
                        doc.add(new Field("content", content, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
                        doc.add(new Field("link", link, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
                        doc.add(new Field("releaseDate", releaseDate, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
                        indexWriter.addDocument(doc);
                    }
                    long end = new Date().getTime();
                    System.out.println(">>> 1.存入索引完毕.. 共花费:" + (end - begin) + "毫秒...");
                    indexWriter.optimize();
                } finally {
                    // Always release the writer, even when indexing failed part-way.
                    indexWriter.close();
                }
            } finally {
                directory.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
检索:
import java.io.File;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
/**
 * Searches the Lucene indexes produced by the indexing step and returns the
 * stored {@code link} value of every matching document.
 */
public class GetResultInfo {

    /**
     * Runs one keyword query against a single index folder.
     *
     * @param keyName        the query text, parsed with the standard analyzer
     * @param INDEXPATH      path of the index folder to open
     * @param titleOrContent name of the indexed field to search
     * @return the trimmed {@code link} values of up to 10000 hits, or {@code null}
     *         when the index could not be opened or the query failed (the
     *         exception is printed to standard error)
     */
    public static List<String> getResultInfos(String keyName, String INDEXPATH, String titleOrContent) {
        List<String> list = new ArrayList<String>();
        Analyzer analyzer = new StandardAnalyzer();
        // Must be the same field name that was used when the index was built.
        String link = "link";
        try {
            IndexSearcher indexSearcher = new IndexSearcher(INDEXPATH);
            try {
                BooleanClause.Occur[] clauses = { BooleanClause.Occur.SHOULD };
                Query queryOBJ = MultiFieldQueryParser.parse(
                        keyName, new String[] { titleOrContent }, clauses, analyzer);
                TopDocs topDocs = indexSearcher.search(queryOBJ, 10000);
                for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                    Document targetDoc = indexSearcher.doc(scoreDoc.doc);
                    String url = targetDoc.get(link);
                    // A document without a stored link must not discard the other hits.
                    if (url != null) {
                        list.add(url.trim());
                    }
                    // Highlighted title/content snippets (HTML) could be produced here with
                    // the Lucene highlighter classes (SimpleHTMLFormatter, Highlighter,
                    // QueryScorer, SimpleFragmenter) if full result objects are needed.
                }
            } finally {
                // Release the searcher even when parsing or searching throws.
                indexSearcher.close();
            }
            return list;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Lists the absolute paths of the sub-directories of {@code path}; each one is
     * expected to hold one index.
     *
     * @param path the folder that contains the index folders
     * @return the sub-directory paths; empty (never {@code null}) when {@code path}
     *         is {@code null}, missing, not a directory, or has no sub-directories
     */
    public static List<String> getDirPath(String path) {
        List<String> dirPaths = new ArrayList<String>();
        // listFiles() yields null when the path is missing or is not a directory.
        File[] files = path == null ? null : new File(path).listFiles();
        if (files != null) {
            for (File entry : files) {
                // Plain files cannot be opened as an index, so only keep folders.
                if (entry.isDirectory()) {
                    dirPaths.add(entry.getAbsolutePath());
                }
            }
        }
        if (dirPaths.isEmpty()) {
            System.out.println("没有存放索引的文件夹");
        }
        return dirPaths;
    }

    /**
     * Searches every index folder under {@code path} for every keyword and collects
     * the distinct links that matched.
     *
     * @param path           the folder that contains the index folders
     * @param str            the keywords to search for
     * @param titleOrContent name of the indexed field to search
     * @return a map whose keys (and identical values) are the distinct matching
     *         links; index folders that fail to open are skipped
     */
    public static Map<String, String> getInfos(String path, String str[], String titleOrContent) {
        long begin = System.currentTimeMillis();
        Map<String, String> map = new HashMap<String, String>();
        List<String> dirPaths = GetResultInfo.getDirPath(path);
        String[] keywords = str == null ? new String[0] : str;
        for (String keyword : keywords) {
            for (String dirPath : dirPaths) {
                List<String> infoList = GetResultInfo.getResultInfos(keyword, dirPath, titleOrContent);
                if (infoList == null) {
                    // getResultInfos signals failure with null; keep searching the rest.
                    continue;
                }
                for (String info : infoList) {
                    map.put(info, info);
                }
            }
        }
        long end = System.currentTimeMillis();
        System.out.println(">>> 搜索完毕... ... 共花费:" + (end - begin) + "毫秒...");
        System.out.println("一共检索到" + map.size() + "条");
        return map;
    }
}
这个例子是我对解析到的网页的路径、标题、内容、发布日期建立索引(来源字段在上面的示例代码中没有建立索引,可按同样方式添加),
然后通过对内容的检索,获取该网页的路径。