Pangu word segmentation + unary/binary word segmentation with Lucene.Net

Posted by ahmadmunif on Sun, 08 Mar 2020 17:24:48 +0100

This article is referenced from: https://blog.csdn.net/mss359681091/article/details/52078147

All file downloads needed in this article contain items:

Lucene Profile Download

Download Zip for this project

Create a Windows Forms application with VS2015. After creating the project, change its output type to Console Application in the project properties, so that the Console.Write output used below is visible. You can also keep the default; this is only for convenience. As follows:

In addition, you need to reference 'Lucene.Net.dll'.

1. Unary Word Separation

Unary participle

 

2. Binary Word Separation

On that basis, reference the two .cs files in the Analyzers folder, as shown below

 

 /// <summary>
        /// Binary word separation demo: runs a fixed sample sentence through the
        /// Lucene CJK analyzer and writes every token to the console, each one
        /// followed by the separator "   |  ".
        /// </summary>
        /// <param name="sender">Control that raised the click event (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void button2_Click(object sender, EventArgs e)
        {
            Analyzer analyzer = new CJKAnalyzer();
            TokenStream tokenStream = analyzer.TokenStream("", new StringReader("It's impossible to drink only plain milk - Fireflies in the dark"));
            try
            {
                Token token;
                // Next() hands back one token per call and null once the text is exhausted.
                while ((token = tokenStream.Next()) != null)
                {
                    // TermText() is the text of the current token.
                    Console.Write(token.TermText() + "   |  ");
                }
            }
            finally
            {
                // Release the token stream even if tokenizing throws.
                tokenStream.Close();
            }
        }
Binary Word Separation

3. Pangu Word Separation

Reference the following two profiles again

/// <summary>
        /// Pangu word separation demo: runs a fixed sample sentence through
        /// <c>PanGuAnalyzer</c> and writes every token to the console, each one
        /// followed by the separator "   |  ".
        /// </summary>
        /// <param name="sender">Control that raised the click event (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void button3_Click(object sender, EventArgs e)
        {
            Analyzer analyzer = new PanGuAnalyzer();
            TokenStream tokenStream = analyzer.TokenStream("", new StringReader("It's impossible to drink only plain milk - Fireflies in the dark"));
            try
            {
                Token token;
                // Next() hands back one token per call and null once the text is exhausted.
                while ((token = tokenStream.Next()) != null)
                {
                    // TermText() is the text of the current token.
                    Console.Write(token.TermText() + "   |  ");
                }
            }
            finally
            {
                // Release the token stream even if tokenizing throws.
                tokenStream.Close();
            }
        }
Pangu Word Separation

 

4. Simple Search

Create a web form, SearchWords.aspx, as shown below

 

<%@ Page Language="C#" AutoEventWireup="true" CodeBehind="SearchWords.aspx.cs" Inherits="PanGu_Search.Views.SearchWords" %>

<!DOCTYPE html>

<html xmlns="http://www.w3.org/1999/xhtml">
<head runat="server">
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
    <title>The simplest search engine</title>
</head>
<body>
    <%-- defaultbutton makes the Enter key trigger the Search button. It replaces an
         inline jQuery keydown handler: this page never loads jQuery, so "$" was
         undefined and the handler could not run. --%>
    <form id="mainForm" runat="server" defaultbutton="btnGetSearchResult">
        <div align="center">
            <%-- Builds (or updates) the index; the label reports the outcome. --%>
            <asp:Button ID="btnCreateIndex" runat="server" Text="Create Index" OnClick="btnCreateIndex_Click" />
            <asp:Label ID="lblIndexStatus" runat="server" Visible="false" />
            <hr />
            <%-- Keyword input and the button that runs the search. --%>
            <asp:TextBox ID="txtKeyWords" runat="server" Text="" Width="250"></asp:TextBox>
            <asp:Button ID="btnGetSearchResult" runat="server" Text="Search" OnClick="btnGetSearchResult_Click" />
            <hr />
        </div>
        <div>
            <ul>
                <%-- One list item per search hit; bound in btnGetSearchResult_Click. --%>
                <asp:Repeater ID="rptSearchResult" runat="server">
                    <ItemTemplate>
                        <li>Id:<%#Eval("Id") %><br /><%#Eval("Msg") %></li>
                    </ItemTemplate>
                </asp:Repeater>
            </ul>
        </div>
    </form>
</body>
</html>
Front end aspx design
  /// <summary>
        /// Creates the index under "~/Index" (or updates it if one already exists)
        /// from the article files "~/Upload/Articles/1.txt" and "2.txt", using the
        /// Pangu analyzer, then reports success on the page and disables the button.
        /// </summary>
        /// <param name="sender">Control that raised the click event (not used).</param>
        /// <param name="e">Event data (not used).</param>
        protected void btnCreateIndex_Click(object sender, EventArgs e)
        {
            string indexPath = Context.Server.MapPath("~/Index"); // Where the index files are stored
            FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NativeFSLockFactory());
            try
            {
                // An existing index is updated in place; otherwise a new one is created.
                bool isUpdate = IndexReader.IndexExists(directory);

                // A lock can be left behind when the program exits abnormally while indexing.
                // Forcing it open is only meant for that stale-lock case; it is not a safe way
                // to coordinate several concurrent writers.
                if (isUpdate && IndexWriter.IsLocked(directory))
                {
                    IndexWriter.Unlock(directory);
                }

                // Arguments: index directory, analyzer used for word separation,
                // create-new flag, maximum field length.
                IndexWriter writer = new IndexWriter(directory, new PanGuAnalyzer(), !isUpdate,
                    IndexWriter.MaxFieldLength.UNLIMITED);
                try
                {
                    string articleFolder = Context.Server.MapPath("~/Upload/Articles/");

                    for (int i = 1; i < 3; i++)
                    {
                        string id = i.ToString();
                        string txt = File.ReadAllText(Path.Combine(articleFolder, id + ".txt"));

                        // One Document is the equivalent of one record; field names are
                        // free-form and the values are strings.
                        Document document = new Document();

                        // "id" is stored as-is (not analyzed) so it can be matched exactly.
                        document.Add(new Field("id", id, Field.Store.YES, Field.Index.NOT_ANALYZED));

                        // "msg" is analyzed for full-text search and also stored, so the
                        // original text can be read back from the index.
                        document.Add(new Field("msg", txt, Field.Store.YES, Field.Index.ANALYZED,
                            Field.TermVector.WITH_POSITIONS_OFFSETS));

                        // Remove any earlier document with the same id so that re-indexing
                        // does not create duplicates (roughly: delete from t where id = i).
                        writer.DeleteDocuments(new Term("id", id));
                        writer.AddDocument(document);
                        Console.WriteLine("Indexes{0}Finished creating", id);
                    }
                }
                finally
                {
                    // Always close the writer: closing is what releases the index write lock,
                    // so skipping it after a failure would leave the index locked.
                    writer.Close();
                }
            }
            finally
            {
                directory.Close();
            }

            lblIndexStatus.Text = "Index file created successfully!";
            lblIndexStatus.Visible = true;
            btnCreateIndex.Enabled = false;
        }
Create Index Method
/// <summary>
        /// Searches the "msg" field of the index under "~/Index" for the keywords
        /// typed into <c>txtKeyWords</c> and binds the hits to <c>rptSearchResult</c>.
        /// </summary>
        /// <remarks>
        /// The keyword text is split with the same analyzer that built the index, and
        /// every resulting word becomes one term of a phrase query with a slop of 100.
        /// When the input yields no terms, an empty result list is bound and the index
        /// is not opened.
        /// </remarks>
        /// <param name="sender">Control that raised the click event (not used).</param>
        /// <param name="e">Event data (not used).</param>
        protected void btnGetSearchResult_Click(object sender, EventArgs e)
        {
            string keyword = txtKeyWords.Text;
            IList<SearchResult> resultList = new List<SearchResult>();

            // Build the query: roughly "where contains(msg, keyword)".
            PhraseQuery query = new PhraseQuery();
            int termCount = 0;
            if (!string.IsNullOrWhiteSpace(keyword))
            {
                // Index terms are produced by PanGuAnalyzer, so the keyword has to go through
                // the same analyzer; adding the raw input as one term would only match when
                // the whole input happens to be a single indexed word.
                Lucene.Net.Analysis.TokenStream keywordTokens =
                    new PanGuAnalyzer().TokenStream("msg", new StringReader(keyword));
                try
                {
                    Lucene.Net.Analysis.Token token;
                    while ((token = keywordTokens.Next()) != null)
                    {
                        query.Add(new Term("msg", token.TermText()));
                        termCount++;
                    }
                }
                finally
                {
                    keywordTokens.Close();
                }
            }

            if (termCount > 0)
            {
                // Words further apart than 100 positions (an empirical value) are treated as
                // unrelated, so such documents are not returned.
                query.SetSlop(100);

                string indexPath = Context.Server.MapPath("~/Index"); // Where the index files are stored
                FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NoLockFactory());
                IndexReader reader = null;
                IndexSearcher searcher = null;
                try
                {
                    reader = IndexReader.Open(directory, true); // true = read-only
                    searcher = new IndexSearcher(reader);

                    // The collector holds the hits of the search (up to 1000 here).
                    TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
                    searcher.Search(query, null, collector);

                    // TopDocs(start, howMany): take everything from the first hit onwards;
                    // GetTotalHits() is the total number of hits.
                    ScoreDoc[] docs = collector.TopDocs(0, collector.GetTotalHits()).scoreDocs;

                    foreach (ScoreDoc hit in docs)
                    {
                        // A hit only carries the document number; the stored fields are
                        // loaded with a second lookup.
                        Document doc = searcher.Doc(hit.doc);

                        SearchResult result = new SearchResult();
                        result.Id = Convert.ToInt32(doc.Get("id"));
                        result.Msg = HighlightHelper.HighLight(keyword, doc.Get("msg"));
                        resultList.Add(result);
                    }
                }
                finally
                {
                    // Release the index files; without this every request leaks open handles.
                    if (searcher != null)
                    {
                        searcher.Close();
                    }
                    if (reader != null)
                    {
                        reader.Close();
                    }
                    directory.Close();
                }
            }

            // Bind to the Repeater.
            rptSearchResult.DataSource = resultList;
            rptSearchResult.DataBind();
        }
Search Method

Topics: C# Windows Database