Lucene全文檢索應用

一、後台介面方法：

1.建立索引

/**
     * Adds one knowledge document to the Lucene index stored in
     * {@code indexPath + "/knowledgeIndex"}, creating that directory if needed,
     * and then optimizes the index.
     *
     * <p>The document carries the fields {@code path}, {@code title},
     * {@code createtime}, {@code infoType}, {@code knowlegeid}, {@code contents}
     * (text extracted from the file by {@code postFix}) and {@code all}
     * (title followed by contents). When {@code path} is null or empty no
     * document is added, but the index is still opened and optimized.
     *
     * @param flag       passed to {@code IndexWriter} as its "create" argument;
     *                   forced to {@code true} when the index directory contains
     *                   no {@code segments.gen} file
     * @param path       path of the source file to index; may be null or empty
     * @param indexPath  parent directory of the {@code knowledgeIndex} folder
     * @param title      value stored in the {@code title} field
     * @param createtime formatted with {@code StringUtil.getDateStringYMD} into
     *                   the {@code createtime} field
     * @param infoType   value stored in the {@code infoType} field
     * @param knowlegeid value stored in the {@code knowlegeid} field
     * @throws RuntimeException if the index directory cannot be created, or if
     *                          writing or closing the index fails (the
     *                          underlying {@code IOException} is the cause)
     */
    public void createKnowledgeIndex(boolean flag, String path, String indexPath, String title, Date createtime, String infoType, String knowlegeid) {
        // Locate (and lazily create) the index directory.
        File indexDir = new File(indexPath + "/knowledgeIndex");
        if (!indexDir.exists() && !indexDir.mkdirs()) {
            throw new RuntimeException("索引檔案夾建立出錯");
        }
        Analyzer luceneAnalyzer = new IK_CAnalyzer();
        // A missing segments.gen is taken to mean "no index yet", so the writer
        // must be opened in create mode whatever the caller asked for.
        File[] segmentFiles = indexDir.listFiles(new FilenameFilter() {
            public boolean accept(File dir, String name) {
                return name.equals("segments.gen");
            }
        });
        boolean create = flag || segmentFiles == null || segmentFiles.length == 0;

        IndexWriter indexWriter = null;
        long startTime = System.currentTimeMillis();
        try {
            indexWriter = new IndexWriter(indexDir, luceneAnalyzer, create);
            // Test for null BEFORE building the File: new File(null) throws NPE.
            if (path != null && !"".equals(path)) {
                File dataFile = new File(path);
                String txtReader = postFix(dataFile, fileParseDomain);
                Document document = new Document();
                document.add(new Field("path", dataFile.getCanonicalPath(),
                        Field.Store.YES, Field.Index.NOT_ANALYZED));
                document.add(new Field("title", title, Field.Store.YES,
                        Field.Index.ANALYZED,
                        Field.TermVector.WITH_OFFSETS));
                document.add(new Field("createtime", StringUtil.getDateStringYMD(createtime),
                        Field.Store.YES, Field.Index.ANALYZED));
                document.add(new Field("infoType", infoType,
                        Field.Store.YES, Field.Index.ANALYZED));
                document.add(new Field("knowlegeid", knowlegeid,
                        Field.Store.YES, Field.Index.ANALYZED));
                document.add(new Field("contents", txtReader,
                        Field.Store.COMPRESS, Field.Index.ANALYZED,
                        Field.TermVector.WITH_OFFSETS));
                document.add(new Field("all", title + txtReader,
                        Field.Store.COMPRESS, Field.Index.ANALYZED,
                        Field.TermVector.WITH_OFFSETS));
                indexWriter.addDocument(document);
            }
            indexWriter.optimize();
        } catch (IOException e) {
            log.error("索引建立出錯" + e.getMessage(), e);
            throw new RuntimeException("索引建立出錯" + e.getMessage(), e);
        } finally {
            // The writer is null when its constructor failed; closing it then
            // would raise an NPE that hides the original error.
            if (indexWriter != null) {
                try {
                    indexWriter.close();
                } catch (IOException e) {
                    // CorruptIndexException is an IOException, so one handler suffices.
                    log.error(e.getMessage(), e);
                    throw new RuntimeException("關閉寫索引流出錯" + e.getMessage(), e);
                }
            }
        }
        long endTime = System.currentTimeMillis();
        log.info("建立索引總時間：" + (endTime - startTime));
    }

2.根據id和路徑删除索引：

/**
     * Deletes documents from the index in {@code indexPath + "/knowledgeIndex"}
     * using the terms {@code path:<path>} and {@code knowlegeid:<knowlegeid>},
     * then optimizes the index.
     *
     * <p>NOTE(review): per the Lucene documentation,
     * {@code IndexWriter.deleteDocuments(Term[])} removes documents containing
     * ANY of the given terms, not all of them - confirm that this is the
     * intended matching rule.
     *
     * @param path       exact value of the {@code path} field to match
     * @param knowlegeid value of the {@code knowlegeid} field to match
     * @param indexPath  parent directory of the {@code knowledgeIndex} folder
     * @throws RuntimeException if the index cannot be opened, modified or
     *                          closed (the underlying {@code IOException} is
     *                          the cause)
     */
    public void delKnowledgeIndexByinfoid(String path, String knowlegeid, String indexPath) {
        IndexWriter indexWriter = null;
        try {
            Directory directory = FSDirectory.getDirectory(indexPath + "/knowledgeIndex");
            Term[] termArr = new Term[2];
            termArr[0] = new Term("path", path);
            termArr[1] = new Term("knowlegeid", knowlegeid);
            Analyzer luceneAnalyzer = new IK_CAnalyzer();
            indexWriter = new IndexWriter(directory,
                    luceneAnalyzer, false);
            indexWriter.deleteDocuments(termArr);
            indexWriter.optimize();
        } catch (IOException e) {
            log.error("索引删除出錯" + e.getMessage(), e);
            throw new RuntimeException("索引删除出錯" + e.getMessage(), e);
        } finally {
            // Always release the writer, otherwise a failed delete leaves it
            // (and its write lock) open.
            if (indexWriter != null) {
                try {
                    indexWriter.close();
                } catch (IOException e) {
                    log.error("索引删除出錯" + e.getMessage(), e);
                    throw new RuntimeException("索引删除出錯" + e.getMessage(), e);
                }
            }
        }
    }

3.查詢索引

/**
     * Searches the knowledge index for documents whose {@code type} field
     * matches {@code *keyWord*} and returns one page of results.
     *
     * <p>The keyword is escaped with {@code specialStrConvert} before being
     * parsed. Results are sorted on the {@code createtime} field (reverse flag
     * false). Each returned element is a {@code Map} holding {@code path},
     * {@code title}, {@code createtime}, {@code infoType} and
     * {@code knowlegeid}; when {@code type} is not {@code "title"} and the
     * document stores a value for it, the entry under {@code type} is replaced
     * by the highlighter's best fragment (which may be {@code null}).
     *
     * @param type      name of the field to query and highlight
     * @param keyWord   raw keyword entered by the user
     * @param indexPath parent directory of the {@code knowledgeIndex} folder
     * @param sp        paging state; its record count is set and {@code init()}
     *                  is called when there is at least one hit
     * @return list of result maps for the requested page (empty when nothing
     *         matches)
     * @throws RuntimeException if the index directory does not exist, the
     *                          index cannot be read, or the query cannot be
     *                          parsed
     */
    public List searchKnowlegeByKey(String type, String keyWord, String indexPath, SplitPage sp) {
        keyWord = specialStrConvert(keyWord);
        File indexDir = new File(indexPath + "/knowledgeIndex");
        // Verify the index location before touching it; checking after the
        // directory has been opened is too late to be useful.
        if (!indexDir.exists()) {
            log.debug("索引檔案不存在");
            throw new RuntimeException("索引檔案不存在");
        }
        IndexReader reader = null;
        IndexSearcher searcher = null;
        Hits hits = null;
        List list = null;
        try {
            FSDirectory directory = FSDirectory.getDirectory(indexDir, false);
            reader = IndexReader.open(directory);
            // Search through the same reader that later serves the term
            // vectors, so hits.id(i) always refers to this reader's documents.
            searcher = new IndexSearcher(reader);
            Analyzer luceneAnalyzer = new IK_CAnalyzer();
            QueryParser parser = new QueryParser(type, luceneAnalyzer);
            parser.setAllowLeadingWildcard(true);
            Query query = parser.parse("+(" + type + ":*" + keyWord + "*)");
            // Order the hits on the createtime field.
            Sort sort = new Sort(new SortField[]{new SortField("createtime", false)});
            hits = searcher.search(query, sort);
            list = new ArrayList();
            SimpleHTMLFormatter sHtmlF = new SimpleHTMLFormatter(
                    "<b><span style='background-color:yellow;'>", "</span></b>");
            Highlighter highlighter = new Highlighter(sHtmlF, new QueryScorer(
                    query));
            if (hits != null && hits.length() > 0) {
                int len = hits.length();
                sp.setRecordCount(len);
                sp.init();
                // Clamp the page end to the number of records.
                int endRecord = sp.getStartRecord() + sp.getPageSize();
                int con = endRecord > sp.getRecordCount() ? sp.getRecordCount()
                        : endRecord;
                for (int i = sp.getStartRecord(); i < con; i++) {
                    Document docTemp = hits.doc(i);
                    String value = docTemp.get(type);
                    Map m = new HashMap();
                    m.put("path", docTemp.get("path"));
                    m.put("title", docTemp.get("title"));
                    m.put("createtime", docTemp.get("createtime"));
                    m.put("infoType", docTemp.get("infoType"));
                    m.put("knowlegeid", docTemp.get("knowlegeid"));
                    if (value != null && !type.equals("title")) {
                        // Prefer the stored term vector for highlighting; fields
                        // indexed without one (createtime, infoType, knowlegeid)
                        // are re-analyzed instead of failing on a null vector.
                        Object termVector = reader.getTermFreqVector(hits.id(i), type);
                        TokenStream tokenStream;
                        if (termVector instanceof TermPositionVector) {
                            tokenStream = TokenSources
                                    .getTokenStream((TermPositionVector) termVector);
                        } else {
                            tokenStream = luceneAnalyzer.tokenStream(type,
                                    new java.io.StringReader(value));
                        }
                        String str = highlighter.getBestFragment(tokenStream,
                                value);
                        m.put(type, str);
                    }
                    list.add(m);
                }
            }
        } catch (IOException e) {
            log.debug(e.getMessage(), e);
            throw new RuntimeException(e.getMessage(), e);
        } catch (ParseException e) {
            log.debug("lucene分詞轉換出錯" + e.getMessage(), e);
            throw new RuntimeException("lucene分詞轉換出錯" + e.getMessage(), e);
        } finally {
            // Release index resources on every path. Close failures are logged
            // rather than rethrown so they cannot mask an earlier exception.
            if (searcher != null) {
                try {
                    searcher.close();
                } catch (IOException e) {
                    log.error(e.getMessage(), e);
                }
            }
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    log.error(e.getMessage(), e);
                }
            }
        }
        return list;
    }

4.判定是否存在索引

/**
     * Reports whether the {@code knowledgeIndex} folder under {@code path}
     * contains a {@code segments.gen} file. The folder is created first if it
     * does not exist yet.
     *
     * @param path parent directory of the {@code knowledgeIndex} folder
     * @return {@code "ok"} when {@code segments.gen} is present, otherwise the
     *         empty string
     * @throws RuntimeException if the folder is missing and cannot be created
     */
    public String isExistsKnowlegeIndex(String path) {
        File knowledgeDir = new File(path + "/knowledgeIndex");
        boolean dirAvailable = knowledgeDir.exists() || knowledgeDir.mkdirs();
        if (!dirAvailable) {
            throw new RuntimeException("索引檔案夾建立出錯");
        }
        // Only the marker file matters; every other entry is filtered out.
        File[] markers = knowledgeDir.listFiles(new FilenameFilter() {
            public boolean accept(File parent, String fileName) {
                return "segments.gen".equals(fileName);
            }
        });
        boolean indexed = markers != null && markers.length > 0;
        return indexed ? "ok" : "";
    }

5.替換特殊字元

/**
     * Backslash-escapes the characters and operators that the Lucene query
     * syntax treats specially:
     * {@code \ + - ! ( ) { } [ ] ^ " ~ * ? :} as single characters, and
     * {@code &&} / {@code ||} as two-character operators (a lone {@code &} or
     * {@code |} is left untouched).
     *
     * @param str raw text; may be null
     * @return the escaped text, or the empty string when {@code str} is null
     *         or empty
     */
    private static String specialStrConvert(String str) {
        if (str == null || str.length() == 0) {
            return "";
        }
        final String singles = "\\+-!(){}[]^\"~*?:";
        final int length = str.length();
        StringBuilder escaped = new StringBuilder(length * 2);
        int pos = 0;
        while (pos < length) {
            char current = str.charAt(pos);
            boolean doubledOperator = (current == '&' || current == '|')
                    && pos + 1 < length
                    && str.charAt(pos + 1) == current;
            if (doubledOperator) {
                // "&&" / "||" receive one backslash in front of the pair.
                escaped.append('\\').append(current).append(current);
                pos += 2;
                continue;
            }
            if (singles.indexOf(current) >= 0) {
                escaped.append('\\');
            }
            escaped.append(current);
            pos++;
        }
        return escaped.toString();
    }

6.讀取不同類型的檔案（各讀取方法在「Java基礎」的工具類中實作）

/**
     * Extracts the text of {@code file} by dispatching on its extension to the
     * matching {@code FileParseDomain} reader: {@code .doc}, {@code .pdf},
     * {@code .xls}, {@code .txt}, {@code .html} / {@code .htm}. The extension
     * is compared case-insensitively, so {@code REPORT.PDF} is handled like
     * {@code report.pdf}.
     *
     * @param file            file whose text should be extracted
     * @param fileParseDomain parser providing the per-format read methods
     * @return the extracted text, or the empty string for any other extension
     * @throws RuntimeException if reading fails with an {@code IOException}
     *                          (kept as the cause)
     */
    private static String postFix(File file, FileParseDomain fileParseDomain) {
        String txtReader = "";
        // Lower-case once with a fixed locale so the comparison does not depend
        // on the platform default (e.g. Turkish dotless i).
        String lowerPath = file.getPath().toLowerCase(java.util.Locale.ROOT);
        try {
            if (lowerPath.endsWith(".doc")) {
                txtReader = fileParseDomain.readWord(file.getCanonicalPath());
            } else if (lowerPath.endsWith(".pdf")) {
                txtReader = fileParseDomain.readPDF(file.getCanonicalPath());
            } else if (lowerPath.endsWith(".xls")) {
                txtReader = fileParseDomain.readExcel(file.getCanonicalPath());
            } else if (lowerPath.endsWith(".txt")) {
                txtReader = fileParseDomain.readTxt(file.getCanonicalPath());
            } else if (lowerPath.endsWith(".html")
                    || lowerPath.endsWith(".htm")) {
                txtReader = fileParseDomain.readHtmlText(file.getCanonicalPath());
            }
        } catch (IOException e) {
            log.error("檔案讀取出錯" + e.getMessage(), e);
            throw new RuntimeException("檔案讀取出錯" + e.getMessage(), e);
        }
        return txtReader;
    }

Lucene全文檢索應用

繼續閱讀

轉：基于lucene實作自己的推薦引擎

基于LUCENE實作自己的推薦引擎

Lucene.net和盤古分詞使用小結

Apache Lucene 5.x 內建中文分詞庫 IKAnalyzer

JFLex使用者手冊中文版安裝與配置運作JFLEX 配置檔案編寫

使用 RediSearch 在 Redis 中進行全文檢索

svn配置權限

Python實作Gauss-Seider疊代法（超全）

MySQL和Lucene索引對比分析1. MySQL索引實作2. Lucene索引實作3. MySQL與Lucence對比參考：

Lucence的基本原理

ElasticSearch：部署ElasticSearch & Kibana

如何增值你的文檔資産?_

lucene 關鍵字高亮

世界因大資料而改變

Mysql 網站大全

專家訪談：搜尋開源力量：Lucene技術前景