Hands-on tested: building and reading a Lucene index in Java

Posted by johnali3n on Fri, 22 Nov 2019 17:11:47 +0100

/**
	 * Builds the Lucene index for the given contents in {@code searchDir}.
	 * <p>
	 * The directory is created when it does not exist. Text fields are analyzed with a
	 * {@link SmartChineseAnalyzer} whose stop-word set is the classpath resource
	 * {@code /stopword.dic} plus the analyzer's default stop words. Any failure is printed
	 * and swallowed (no exception reaches the caller); in that case the pending changes are
	 * rolled back instead of being committed.
	 *
	 * @param contents  content records to index, one Lucene document per record
	 * @param searchDir directory that holds the index files
	 * @param hasDelete {@code true} to delete every existing document before adding
	 *                  {@code contents} (full rebuild), {@code false} to append
	 */
	public static void createIndex(List<Content> contents, String searchDir, boolean hasDelete) {
		try {
			// Make sure the index directory exists before Lucene opens it
			Path dirPath = Paths.get(searchDir);
			if (!Files.exists(dirPath)) {
				Files.createDirectories(dirPath);
			}
			CharArraySet stopWords = loadStopWords();
			// try-with-resources closes writer, analyzer and directory (in that order) on every
			// path, so the index write lock is released even when indexing fails
			try (Directory directory = FSDirectory.open(dirPath);
					Analyzer analyzer = new SmartChineseAnalyzer(stopWords);
					IndexWriter iwriter = new IndexWriter(directory, new IndexWriterConfig(analyzer))) {
				try {
					if (hasDelete) {
						// Drop the previous index content so the index is rebuilt from scratch
						iwriter.deleteAll();
					}
					for (Content content : contents) {
						iwriter.addDocument(toDocument(content));
					}
				} catch (Exception e) {
					// Closing the writer would commit a half-built index; discard the changes instead
					try {
						iwriter.rollback();
					} catch (Exception rollbackFailure) {
						e.addSuppressed(rollbackFailure);
					}
					throw e;
				}
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Builds the stop-word set: every line of the classpath resource {@code /stopword.dic}
	 * followed by the default stop words of {@link SmartChineseAnalyzer}.
	 * The resource is read as a stream so it also works when packaged inside a jar or when
	 * the classpath location contains characters that are escaped in a URL.
	 */
	private static CharArraySet loadStopWords() throws java.io.IOException {
		CharArraySet stopWords = new CharArraySet(0, true);
		try (java.io.InputStream in = LuceneUtils.class.getResourceAsStream("/stopword.dic")) {
			if (in == null) {
				throw new java.io.FileNotFoundException("classpath resource /stopword.dic not found");
			}
			java.io.BufferedReader reader =
					new java.io.BufferedReader(new java.io.InputStreamReader(in, Constant.CHARSET));
			String line;
			while ((line = reader.readLine()) != null) {
				stopWords.add(line);
			}
		}
		// Add system default stop words
		stopWords.addAll(SmartChineseAnalyzer.getDefaultStopSet());
		return stopWords;
	}

	/**
	 * Maps one content record to a Lucene document. Fields whose value is {@code null} are
	 * left out, because Lucene field constructors reject {@code null} values.
	 */
	private static Document toDocument(Content content) {
		Document doc = new Document();
		doc.add(new StringField("id", String.valueOf(content.getId()), Store.YES));
		addTextField(doc, "platform", content.getPlatform() == null ? null : content.getPlatform().name());
		addTextField(doc, "sname", content.getSname());
		addTextField(doc, "introduction", content.getIntroduction());
		addTextField(doc, "actor", content.getActor());
		addTextField(doc, "director", content.getDirector());
		addStringField(doc, "picture", content.getPicture());
		addStringField(doc, "program_code", content.getProgramCode());
		addTextField(doc, "category_code", content.getCategoryCode());
		// A record without a classification is indexed without the "category" field
		addTextField(doc, "category", content.getClassify() == null ? null : content.getClassify().getSname());
		addStringField(doc, "dtype", content.getDtype() == null ? null : content.getDtype().name());
		return doc;
	}

	/** Adds a stored, analyzed (tokenized) field unless {@code value} is {@code null}. */
	private static void addTextField(Document doc, String name, String value) {
		if (value != null) {
			doc.add(new TextField(name, value, Store.YES));
		}
	}

	/** Adds a stored field indexed verbatim as a single token unless {@code value} is {@code null}. */
	private static void addStringField(Document doc, String name, String value) {
		if (value != null) {
			doc.add(new StringField(name, value, Store.YES));
		}
	}
}

The code above builds the index. Points to note: 1. Understand the concepts of stop words (words that are not indexed) and the tokenizer (for example, "I am not the God of medicine" is split into "I", "I am not", "the God of medicine" and "I am not the God of medicine").

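2. The difference between StringField and TextField: a StringField is indexed verbatim as a single token, whereas a TextField is run through the analyzer and tokenized.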

Here is the code that reads (searches) the index:

	/**
	 * Searches the index stored under the configured {@code Constant.LUCENE_INDEX_FOLDER}
	 * for contents whose {@code sname} field matches {@code keyword}, restricted to one platform.
	 * <p>
	 * Failures while opening or querying the index are printed and swallowed; the hits
	 * collected up to that point (possibly none) are returned.
	 *
	 * @param keyword  query text, parsed with Lucene query syntax against the {@code sname} field
	 * @param formType platform to restrict the search to; {@code null} means {@code PlatformType.HW}
	 * @param quantity maximum number of hits requested from the index
	 * @return the matching contents rebuilt from the stored fields; never {@code null}
	 */
	public static List<Content> searchIndex(String keyword, PlatformType formType, int quantity) {
		String indexPath = ConfigUtils.getValue(Constant.LUCENE_INDEX_FOLDER);
		if (formType == null) {
			formType = PlatformType.HW;
		}
		List<Content> contents = new ArrayList<>();
		Path path = Paths.get(indexPath);
		// A blank keyword cannot be parsed and IndexSearcher rejects a non-positive hit count,
		// so both cases can only produce an empty result; skip opening the index
		if (StringUtils.isBlank(keyword) || quantity <= 0) {
			return contents;
		}
		// try-with-resources closes analyzer, reader and directory on every path
		try (FSDirectory directory = FSDirectory.open(path);
				DirectoryReader reader = DirectoryReader.open(directory);
				Analyzer analyzer = new SmartChineseAnalyzer()) {
			IndexSearcher searcher = new IndexSearcher(reader);
			// NOTE(review): the keyword is parsed as query syntax; if it is raw user input,
			// consider escaping it first
			String[] fields = {"sname"};
			String[] stringQuery = {keyword};
			Query multiQuery = MultiFieldQueryParser.parse(stringQuery, fields, analyzer);
			BooleanQuery.Builder booleanQuery = new BooleanQuery.Builder();
			booleanQuery.add(multiQuery, Occur.MUST);
			// "platform" is indexed as an analyzed field, so the restriction is built through the
			// analyzer as well. The builder returns null when analysis yields no token; the
			// per-hit platform check below still enforces the restriction in that case.
			Query limitQuery = new QueryBuilder(analyzer).createBooleanQuery("platform", formType.name(), Occur.MUST);
			if (limitQuery != null) {
				booleanQuery.add(limitQuery, Occur.MUST);
			}
			TopDocs hits = searcher.search(booleanQuery.build(), quantity);
			for (ScoreDoc sd : hits.scoreDocs) {
				Document doc = searcher.doc(sd.doc);
				String platformStr = doc.get("platform");
				PlatformType platform = null;
				if (StringUtils.isNotBlank(platformStr)) {
					platform = PlatformType.valueOf(platformStr);
					// Drop hits whose stored platform differs from the requested one
					if (formType != platform) {
						continue;
					}
				}
				Content content = new Content();
				String dtypeStr = doc.get("dtype");
				if (StringUtils.isNotBlank(dtypeStr)) {
					content.setDtype(MediaType.valueOf(dtypeStr));
				}
				content.setId(Integer.parseInt(doc.get("id")));
				content.setActor(doc.get("actor"));
				content.setIntroduction(doc.get("introduction"));
				content.setSname(doc.get("sname"));
				content.setPicture(doc.get("picture"));
				content.setCategoryCode(doc.get("category_code"));
				// NOTE(review): "director" and "program_code" are not copied back here - confirm intended
				String category = doc.get("category");
				if (StringUtils.isNotBlank(category)) {
					content.setClassify(new Classify());
					content.getClassify().setSname(category);
				}
				content.setPlatform(platform);
				contents.add(content);
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
		return contents;
	}

 

Required dependencies (indexing):

 <!-- Import Lucene; ${lucene.version} must be defined as a Maven property -->
		 <dependency>
		    <groupId>org.apache.lucene</groupId>
		    <artifactId>lucene-analyzers-common</artifactId>
		    <version>${lucene.version}</version>
		</dependency>
		 <dependency>
		    <groupId>org.apache.lucene</groupId>
		    <artifactId>lucene-analyzers-smartcn</artifactId>
		    <version>${lucene.version}</version>
		</dependency>
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-core</artifactId>
			<version>${lucene.version}</version>
		</dependency>
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-queryparser</artifactId>
			<version>${lucene.version}</version>
		</dependency>
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-queries</artifactId>
			<version>${lucene.version}</version>
		</dependency>
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-sandbox</artifactId>
			<version>${lucene.version}</version>
		</dependency>

Dependencies required for reading the index:

<!-- Import Lucene; ${lucene.version} must be defined as a Maven property -->
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-analyzers-common</artifactId>
			<version>${lucene.version}</version>
		</dependency>
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-analyzers-smartcn</artifactId>
			<version>${lucene.version}</version>
		</dependency>
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-core</artifactId>
			<version>${lucene.version}</version>
		</dependency>
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-queryparser</artifactId>
			<version>${lucene.version}</version>
		</dependency>
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-queries</artifactId>
			<version>${lucene.version}</version>
		</dependency>
		<dependency>
			<groupId>org.apache.lucene</groupId>
			<artifactId>lucene-sandbox</artifactId>
			<version>${lucene.version}</version>
		</dependency>

Notes: 1. The utility classes used for checking paths (such as the Files helper) depend on your specific situation; there are many such file utilities available (search online). The path where the index files are stored also depends on your setup.

2. The Lucene version used is `<lucene.version>7.5.0</lucene.version>`.

Topics: Apache Database