Baidu Encyclopedia crawler

Posted by kryles on Sat, 02 May 2020 22:45:32 +0200

WebMagic configuration: http://www.cnblogs.com/justcooooode/p/7913365.html

Crawling target

Recently, we need to analyze some domain concepts, choose to use Baidu Encyclopedia to crawl corresponding terms, and get information and knowledge from them.

I divide the query results into two categories, one is included in the encyclopedia, the other is not included, although not included, some information can be obtained from it.

For the words not included, some related terms will be given, so this part is also used as crawling content.

For the entries that have been included, the information I need is the title, synonym, abstract, Infobox, baidu intimate content.

Code

  1 /** 
  2 * @Title: BaiduBaikeSpider.java
  3 * @Package: edu.heu.kg.spider
  4 * @Description: TODO
  5 * @author:LJH
  6 * @date:2017 9:43:16 am, December 1, 2016
  7 */
  8 package edu.heu.kg.spider;
  9 
 10 import java.io.BufferedReader;
 11 import java.io.BufferedWriter;
 12 import java.io.FileOutputStream;
 13 import java.io.FileReader;
 14 import java.io.IOException;
 15 import java.io.OutputStreamWriter;
 16 import java.io.UnsupportedEncodingException;
 17 import java.util.LinkedHashMap;
 18 import java.util.LinkedList;
 19 import java.util.List;
 20 import java.util.Map;
 21 import java.util.regex.Matcher;
 22 import java.util.regex.Pattern;
 23 
 24 import us.codecraft.webmagic.Page;
 25 import us.codecraft.webmagic.Site;
 26 import us.codecraft.webmagic.Spider;
 27 import us.codecraft.webmagic.pipeline.ConsolePipeline;
 28 import us.codecraft.webmagic.processor.PageProcessor;
 29 import us.codecraft.webmagic.selector.Html;
 30 import us.codecraft.webmagic.selector.JsonPathSelector;
 31 
 32 public class BaiduBaikeSpider implements PageProcessor {
 33 
 34     // Grab configuration
 35     private Site site = Site.me().setRetryTimes(3).setSleepTime(500).setCharset("UTF-8").setUserAgent(
 36             "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36");
 37     // Key words
 38     private static String word;
 39     // Storage path
 40     private static final String PATH = "D:\\Encyclopedic entry\\Baidu Encyclopedia\\";
 41 
 42     @Override
 43     public Site getSite() {
 44         return site;
 45     }
 46 
 47     @Override
 48     public void process(Page page) {
 49 
 50         if (page.getUrl().regex("https://baike.baidu.com/item/.*").match()) {
 51 
 52             Html html = page.getHtml();
 53 
 54             if (html.xpath("/html/body/div[3]/div[2]/div/div[2]/dl[1]/dd/h1/text()").match()) {
 55 
 56                 // Title
 57                 String title = html.xpath("/html/body/div[3]/div[2]/div/div[2]/dl[1]/dd/h1/text()").toString();
 58                 // synonym
 59                 String synonym = html.xpath("/html/body/div[3]/div[2]/div/div[2]/span/span/text()").all().toString();
 60                 // abstract
 61                 String summary = html.xpath("/html/body/div[3]/div[2]/div/div[2]/div[@class='lemma-summary']/allText()")
 62                         .all().toString().replaceAll(",", "");
 63                 // Message box
 64                 String infobox = html
 65                         .xpath("/html/body/div[3]/div[2]/div/div[2]/div[@class='basic-info cmn-clearfix']/dl/allText()")
 66                         .all().toString();
 67 
 68                 StringBuilder sb = new StringBuilder();
 69                 sb.append(word + "\t" + title + "\t" + synonym + "\t" + summary + "\t" + infobox + "\n");
 70                 try {
 71                     outPut(PATH + "Encyclopedic entry.txt", sb.toString());
 72                 } catch (IOException e) {
 73                     e.printStackTrace();
 74                 }
 75 
 76                 // Baidu Zhixin,Due to dynamic loading, get id
 77                 String zhixinID = html.xpath("//*[@id=\"zhixinWrap\"]/@data-newlemmaid").toString();
 78                 String zhixinURL = "https://baike.baidu.com/wikiui/api/zhixinmap?lemmaId=" + zhixinID;
 79                 // Add task
 80                 page.addTargetRequest(zhixinURL);
 81 
 82             }
 83             // If there is no entry, get the most relevant entry and add url To task
 84             else {
 85                 page.addTargetRequest("https://baike.baidu.com/search/none?word=" + word);
 86             }
 87         }
 88         // Analysis of Baidu Zhixin Json
 89         else if (page.getUrl().regex("https://baike.baidu.com/wikiui/.*").match()) {
 90 
 91             Map<String, List<String>> resultMap = new LinkedHashMap<>();
 92             // Get the number of related types, because some words are two, some are three
 93             List<String> tempList = new JsonPathSelector("$.[*]").selectList(page.getRawText());
 94             int typeNums = tempList.size();
 95             int count = typeNums;
 96             while (count > 0) {
 97                 resultMap.put(new JsonPathSelector("$.[" + (typeNums - count) + "].tipTitle").select(page.getRawText()),
 98                         new JsonPathSelector("$.[" + (typeNums - count) + "].data[*].title")
 99                                 .selectList(page.getRawText()));
100                 count--;
101             }
102 
103             StringBuilder sb = new StringBuilder();
104             sb.append(word + "\t");
105             resultMap.forEach((key, value) -> {
106                 sb.append(key + ": " + value + "\t");
107             });
108             sb.append("\n");
109             try {
110                 outPut(PATH + "related term _Intimate.txt", sb.toString());
111             } catch (IOException e) {
112                 e.printStackTrace();
113             }
114         }
115         // Baidu Encyclopedia has not included entries yet. Get related entries
116         else if (page.getUrl().regex("https://baike.baidu.com/search/none\\?word=.*").match()) {
117 
118             List<String> list = page.getHtml().xpath("//*[@id=\"body_wrapper\"]/div[1]/dl/dd/a/@href").all();
119             try {
120                 list = UrlDecode(list);
121             } catch (UnsupportedEncodingException e) {
122                 e.printStackTrace();
123             }
124 
125             StringBuilder sb = new StringBuilder();
126             sb.append(word + "\t" + list + "\n");
127             try {
128                 outPut(PATH + "related term _Not included.txt", sb.toString());
129             } catch (IOException e) {
130                 // TODO Auto-generated catch block
131                 e.printStackTrace();
132             }
133 
134         } else
135             System.out.println("Nothing!");
136 
137     }
138 
139     // URL Decode
140     @SuppressWarnings("unused")
141     private static List<String> UrlDecode(List<String> rawList) throws UnsupportedEncodingException {
142 
143         List<String> resultList = new LinkedList<>();
144         String reg = "https://baike.baidu.com/item/(.*)/\\d+";
145         Pattern p = Pattern.compile(reg);
146         Matcher m;
147         for (String str : rawList) {
148             m = p.matcher(str);
149             if (m.find()) {
150                 resultList.add(java.net.URLDecoder.decode(m.group(1), "UTF-8"));
151             }
152         }
153         return resultList;
154 
155     }
156 
157     // storage
158     private static void outPut(String path, String content) throws IOException {
159 
160         FileOutputStream fos = new FileOutputStream(path, true);
161         OutputStreamWriter osw = new OutputStreamWriter(fos);
162         BufferedWriter bw = new BufferedWriter(osw);
163         bw.write(content);
164         bw.close();
165 
166     }
167 
168     public static void main(String[] args) throws IOException {
169 
170         try {
171             BufferedReader br = new BufferedReader(new FileReader("D:\\nba.txt"));
172             while ((word = br.readLine()) != null) {
173                 // Establish Spider´╝îaddUrl The parameters of can be variable, but there will be problems
174                 Spider.create(new BaiduBaikeSpider()).addPipeline(new ConsolePipeline())
175                         .addUrl("https://baike.baidu.com/item/" + word).run();
176             }
177             br.close();
178         } catch (IOException e1) {
179             e1.printStackTrace();
180         }
181 
182     }
183 
184 }

Result

Basic information of the entry:

Entry related information:

For reprint, please indicate the original link: http://www.cnblogs.com/justcooooode/p/8068045.html

Topics: Java Windows JSON