Note: build a simple file search system based on Springboot+Lucene

Posted by txmedic03 on Sun, 19 Apr 2020 16:23:54 +0200

Files of different types are stored in the file storage system. The back end programmatically extracts each file's name and content, and Lucene indexes both. The front end provides a query interface for the user; after the user submits search keywords, the index is searched and the matching documents are returned to the front-end page.

The architecture is kept simple, following the design in the figure above, and two test documents are prepared. The open-source tool Tika extracts the file information, Lucene builds the index, and an HTML page provides the user query interface. The core server-side code for building and querying the index is as follows:

/**
     * Searches the {@code title} and {@code content} fields of the Lucene index stored in the
     * {@code indexdir} classpath directory and returns the best matches with the matched terms
     * highlighted.
     *
     * <p>At most 10 hits are returned. For each hit the title and content are replaced by their
     * best highlighted fragment (matches wrapped in a red {@code <span>}); when no fragment is
     * produced the stored field value is used unchanged.
     *
     * <p>Failures while opening, parsing, searching or highlighting are printed to standard error
     * and the hits collected so far (possibly none) are returned.
     *
     * @param search the query text entered by the user; {@code null} or empty yields an empty list
     * @return the matching documents, never {@code null}
     * @throws IOException if the {@code indexdir} classpath resource cannot be resolved to a file
     */
    public List<FileModel> findByTerm(String search) throws IOException {
        if (Strings.isNullOrEmpty(search)) {
            return Lists.newArrayList();
        }
        List<FileModel> hitList = new ArrayList<>();
        String[] fields = {"title", "content"};
        ClassPathResource cpr = new ClassPathResource("indexdir");
        Path path = Paths.get(cpr.getFile().toURI());
        // try-with-resources: the analyzer, directory and reader each hold native/file resources
        // and were previously leaked on every call. They are closed in reverse order
        // (reader, directory, analyzer) once all hit data has been copied into plain strings.
        try (Analyzer analyzer = new IKAnalyzer6x(true);
             Directory dir = FSDirectory.open(path);
             IndexReader reader = DirectoryReader.open(dir)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            MultiFieldQueryParser parser = new MultiFieldQueryParser(fields, analyzer);
            Query query = parser.parse(search);
            // Custom highlight markup wrapped around every matched term
            SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<span style=\"color:red;\">", "</span>");
            // NOTE(review): the highlighters use the default encoder, so stored text is emitted
            // unescaped and the page renders it with v-html — confirm indexed documents are trusted.
            // Title highlighting
            QueryScorer scorerTitle = new QueryScorer(query, fields[0]);
            Highlighter highlightTitle = new Highlighter(formatter, scorerTitle);
            // Content highlighting
            QueryScorer scorerContent = new QueryScorer(query, fields[1]);
            Highlighter highlightContent = new Highlighter(formatter, scorerContent);
            TopDocs topDocs = searcher.search(query, 10);
            for (ScoreDoc sd : topDocs.scoreDocs) {
                Document doc = searcher.doc(sd.doc);
                String title = doc.get("title");
                String content = doc.get("content");
                // Best highlighted fragment of the title
                TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), sd.doc, fields[0], analyzer);
                Fragmenter fragmenter = new SimpleSpanFragmenter(scorerTitle);
                highlightTitle.setTextFragmenter(fragmenter);
                String hlTitle = highlightTitle.getBestFragment(tokenStream, title);
                // Best highlighted fragment of the content
                tokenStream = TokenSources.getTokenStream(searcher.getIndexReader(), sd.doc, fields[1], analyzer);
                fragmenter = new SimpleSpanFragmenter(scorerContent);
                highlightContent.setTextFragmenter(fragmenter);
                String hlContent = highlightContent.getBestFragment(tokenStream, content);
                // Fall back to the stored value when the field produced no highlighted fragment
                FileModel fm = new FileModel(hlTitle != null ? hlTitle : title, hlContent != null ? hlContent : content);
                hitList.add(fm);
            }
        } catch (Exception e) {
            // Best effort: report the failure and return whatever was collected so far.
            e.printStackTrace();
        }
        return hitList;
    }


/**
     * Builds one {@link FileModel} per entry found directly under the {@code doc} classpath
     * directory, using the entry's file name as title and its extracted text as content.
     *
     * @return the extracted models; empty when the {@code doc} resource does not exist or its
     *         entries cannot be listed
     * @throws Exception if the {@code doc} resource exists but cannot be resolved to a file
     */
    public List<FileModel> extractFile() throws Exception {
        List<FileModel> list = new ArrayList<>();
        ClassPathResource cpr = new ClassPathResource("doc");
        // Ask the resource itself: getFile() throws for a missing resource, so calling
        // getFile().exists() could never act as the "nothing to index" guard.
        if (!cpr.exists()) {
            return list;
        }
        // listFiles() returns null when the path is not a directory or cannot be read.
        File[] files = cpr.getFile().listFiles();
        if (files == null) {
            return list;
        }
        for (File file : files) {
            FileModel fm = new FileModel(file.getName(), parseExtraction(file));
            list.add(fm);
        }
        return list;
    }

    /**
     * Extracts the textual body of a file with Tika's auto-detecting parser.
     *
     * <p>Extraction is best effort: if the file cannot be opened or parsed, the error is printed
     * to standard error and an empty string is returned.
     *
     * @param file the file to extract text from
     * @return the extracted document content, or {@code ""} when extraction fails
     */
    public String parseExtraction(File file) {
        String content = "";
        // 10 * 1024 * 1024 is passed as the handler's write limit for the extracted text.
        BodyContentHandler handler = new BodyContentHandler(10 * 1024 * 1024);
        Parser parser = new AutoDetectParser();
        Metadata metadata = new Metadata();
        // try-with-resources closes the stream on every path; it was previously never closed.
        try (FileInputStream inputStream = new FileInputStream(file)) {
            ParseContext context = new ParseContext();
            parser.parse(inputStream, handler, metadata, context);
            content = handler.toString();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return content;
    }


    /**
     * Rebuilds the Lucene index in the {@code indexdir} classpath directory from the files
     * returned by {@link #extractFile()}.
     *
     * <p>The index is opened in {@code CREATE} mode, so any existing index is replaced. Each file
     * becomes one document with a stored, tokenized {@code title} and {@code content} field that
     * also records term vectors with positions and offsets. The elapsed time is printed to
     * standard output when indexing completes.
     *
     * @throws Exception if extraction fails, the index directory cannot be resolved or opened,
     *                   or a document cannot be written; in the last case pending changes are
     *                   rolled back before the exception is rethrown
     */
    public void createIndex() throws Exception {
        long startMillis = System.currentTimeMillis();
        // Extract first, so a failed extraction never leaves an open writer (and its write lock)
        // behind on an index that was opened in CREATE mode.
        List<FileModel> fileModelList = extractFile();

        // Stored + tokenized, with full postings and term vectors for both fields
        FieldType fieldType = new FieldType();
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        fieldType.setStored(true);
        fieldType.setTokenized(true);
        fieldType.setStoreTermVectors(true);
        fieldType.setStoreTermVectorPositions(true);
        fieldType.setStoreTermVectorOffsets(true);

        Resource resource = new ClassPathResource("indexdir");
        // IK analyzer used to tokenize the indexed text
        try (Analyzer analyzer = new IKAnalyzer6x();
             Directory dir = FSDirectory.open(resource.getFile().toPath())) {
            IndexWriterConfig icw = new IndexWriterConfig(analyzer);
            icw.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
            try (IndexWriter indexWriter = new IndexWriter(dir, icw)) {
                try {
                    for (FileModel fm : fileModelList) {
                        Document doc = new Document();
                        doc.add(new Field("title", fm.getTitle(), fieldType));
                        doc.add(new Field("content", fm.getContent(), fieldType));
                        indexWriter.addDocument(doc);
                    }
                } catch (Exception e) {
                    // Discard the partial index instead of letting close() commit it.
                    indexWriter.rollback();
                    throw e;
                }
            }
        }
        long elapsedMillis = System.currentTimeMillis() - startMillis;
        System.out.println("Index document completed, total time:" + elapsedMillis + "Millisecond");
    }

<!DOCTYPE html>
<html lang="en" xmlns:th="http://www.thymeleaf.org" xmlns:v-on="http://www.w3.org/1999/xhtml">
<head>
    <meta charset="UTF-8">
    <title>home page</title>
    <link th:href="@{/css/index.css}" rel="stylesheet" type="text/css">
    <script th:src="@{/js/vue.min.js}"></script>
    <script th:src="@{/js/axios.min.js}"></script>
</head>
<body>
<div id="app">
    <!-- Search box: the input is exposed to the Vue instance through its ref -->
    <div id="search" class="search-input">
        <div class="si"><input ref='input_content' class="text-input" type="text"/></div>
        <div class="sb">
            <button class="btn-search" type="button" v-on:click="search">
                <svg t="1587265580322" class="icon" viewBox="0 0 1024 1024" version="1.1"
                     xmlns="http://www.w3.org/2000/svg" p-id="1914" width="28" height="28">
                    <path d="M712.5 645.3l161.2 161.2c26.3 26.3 32.7 63.1 14.2 81.6-18.5 18.5-55.3 12.2-81.6-14.2L645.1 712.7"
                          fill="#FFEABB" p-id="1915"></path>
                    <path
                            d="M859.7 928.6c-4.9 0-9.8-0.4-14.9-1.3-21.5-3.7-42.7-15.2-59.8-32.3L623.9 733.9l42.4-42.4 161.2 161.2c16.3 16.3 34.9 18.5 39.2 14.2 4.3-4.3 2.1-22.9-14.2-39.2L691.3 666.5l42.4-42.4 161.2 161.2c17.1 17.1 28.5 38.3 32.3 59.8 4.3 25-2.2 48.4-18.1 64.2-12.7 12.6-30.1 19.3-49.4 19.3z"
                            fill="#F9C73D" p-id="1916"></path>
                    <path
                            d="M443.3 806.5c-49 0-96.6-9.6-141.4-28.6-43.3-18.3-82.1-44.5-115.4-77.8-33.3-33.3-59.5-72.2-77.8-115.4C89.6 539.8 80 492.3 80 443.3s9.6-96.6 28.6-141.4c18.3-43.3 44.5-82.1 77.8-115.4s72.2-59.5 115.4-77.8C346.7 89.6 394.2 80 443.3 80s96.6 9.6 141.4 28.6c43.3 18.3 82.1 44.5 115.4 77.8 33.3 33.3 59.5 72.2 77.8 115.4 18.9 44.8 28.6 92.4 28.6 141.4s-9.6 96.6-28.6 141.4c-18.3 43.3-44.5 82.1-77.8 115.4-33.3 33.3-72.2 59.5-115.4 77.8-44.9 19.1-92.4 28.7-141.4 28.7z m0-666.5C276 140 140 276 140 443.3c0 167.2 136 303.2 303.2 303.2s303.2-136 303.2-303.2C746.5 276 610.5 140 443.3 140z"
                            fill="#F9C73D" p-id="1917"></path>
                </svg>
            </button>
        </div>
    </div>
    <!-- Result list: v-html renders the highlight markup contained in each field and replaces
         any child content, so no text interpolation is placed inside these elements. -->
    <div class="result-list">
        <ul>
            <li v-for='item in resultList'>
                <p class="p_title" v-html="item.title"></p>
                <p class="p_content" v-html="item.content"></p>
            </li>
        </ul>
    </div>

</div>
<script type="text/javascript" th:src="@{/js/index.js}"></script>
</body>
</html>

An example of query results is shown as follows:

This exercise mainly ties together what I had learned earlier: Lucene index construction, index querying, search-term analysis, multi-field queries, and search-term highlighting, along with Vue.js and HTML + CSS ("h+c"), which I haven't written for a long time.

Topics: Programming Vue Thymeleaf axios Javascript

Programmer Think

Note: build a simple file search system based on Springboot+Lucene

Hot Topics