Node.js news crawler and web search site
Project requirements
1. Crawler part
1. Complete the web page analysis and crawler design of the target website.
2. Crawl no less than 100 pieces of data (each data includes 7 fields, news keywords, news titles, news dates, news authors, news sources, news abstracts and news contents), and store them in the database.
2. Search site part
1. Complete the search function of crawling news content and title in the database, and the search results are displayed in the front page in tabular form.
2. Complete the time-based popularity analysis of the search term, and use a table to display, for each day, how many crawled news items contain the search term.
This article is the third part of the project: rewriting the crawlers so that they run on a schedule.
2, Timed crawler rewrite
After parts (1) and (2) of this series we have two Node.js crawlers. Because news websites constantly publish new articles, we need a timer that calls each crawler at specified times so that the crawled news stays up to date.
1. Review the existing code
crawler_163.js
/**
 * crawler_163.js - one-shot crawler for the NetEase news homepage.
 *
 * Fetches the homepage, keeps the links that match the article URL pattern,
 * skips the ones already stored in the `news` table, then downloads, parses
 * and stores each remaining article.
 */
const crawler_request = require('request');
const crawler_iconv = require('iconv-lite');
const crawler_cheerio = require('cheerio');
require('date-utils'); // provides the Date#toFormat call used below

// Database helper; used here as query(sql, params, callback(err, rows, fields)).
// The original listing also carried, commented out, an inline `mysql` pool
// (host 127.0.0.1, database 'crawl') exporting query/query_noparam helpers of
// the same shape - presumably what crawler_sql.js contains.
const crawler_sql = require('./crawler_sql.js');

const crawler_url = 'https://news.163.com/';
const page_encoding = 'UTF-8';

// Article pages look like https://www.163.com/news/article/G8HQOAKE0001899O.html
// (the dot before "html" is escaped so it only matches a literal dot).
const news_reg = /\/news\/article\/([a-zA-Z0-9]{16})\.html/;
// Article ids ending in 0001982T are deliberately skipped.
const news_reg_special = /\/news\/article\/([a-zA-Z0-9]{8})0001982T\.html/;

/**
 * Issue an HTTP GET with a browser-like User-Agent.
 * @param {string} url - page to download
 * @param {function} callback - called as callback(err, res, body)
 */
function request(url, callback) {
  const options = {
    url: url,
    encoding: null, // keep the body undecoded; iconv-lite decodes it later
    headers: {
      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
    },
    timeout: 10000
  };
  crawler_request(options, callback);
}

/** Remove every whitespace character; null/undefined become ''. */
function strip_whitespace(text) {
  return (text || '').replace(/\s/g, '');
}

/**
 * Decode a response body and parse it with cheerio.
 * @returns the cheerio root function, or null when the request or decoding failed
 */
function load_page(url, err, body) {
  if (err || !body) {
    console.log('Page request error:' + url + ' ' + err);
    return null;
  }
  try {
    const html = crawler_iconv.decode(body, page_encoding);
    return crawler_cheerio.load(html, { decodeEntities: true });
  } catch (e) {
    console.log('Page decoding error:' + e);
    return null;
  }
}

/** Resolve an href against the homepage URL; returns null when it cannot be resolved. */
function resolve_url(href) {
  if (typeof href !== 'string' || href === '') {
    return null;
  }
  try {
    // Handles absolute, protocol-relative ("//host/..."), and relative links.
    return new URL(href, crawler_url).href;
  } catch (e) {
    console.log('Error getting news page' + e);
    return null;
  }
}

/** Crawl the homepage and hand every new article link to crawler_news_url(). */
function crawler() {
  request(crawler_url, function(err, res, body) {
    const $ = load_page(crawler_url, err, body);
    if (!$) {
      return;
    }
    // The same article is often linked several times on the homepage;
    // only look each URL up once per run.
    const seen = new Set();
    $('a').each(function(i, e) {
      const news_url = resolve_url($(e).attr('href'));
      if (!news_url) {
        return;
      }
      if (!news_reg.test(news_url) || news_reg_special.test(news_url)) {
        console.log('The news link does not conform to the format!');
        return;
      }
      if (seen.has(news_url)) {
        return;
      }
      seen.add(news_url);

      crawler_sql.query('select url from news where url=?', [news_url], function(qerr, vals) {
        if (qerr) {
          // vals is not usable when the query failed
          console.log(qerr);
          return;
        }
        if (vals.length > 0) {
          console.log('The news page has been crawled!');
        } else {
          crawler_news_url(news_url);
        }
      });
    });
  });
}

/**
 * Extract the news fields from a parsed article page.
 * NOTE(review): the literals 'netease', 'report', 'Source:', 'source:' and
 * 'Responsible editor:' look machine-translated; the live pages probably use
 * Chinese labels - confirm against the site before relying on these branches.
 */
function parse_news($, news_url) {
  const news = {
    // date-utils mask: HH24 = 24-hour clock, MI = minutes (MM would be the month)
    crawler_time: (new Date()).toFormat('YYYY-MM-DD HH24:MI:SS'),
    url: news_url,
    url_encoding: page_encoding,
    keywords: $('meta[name="keywords"]').eq(0).attr('content') || '',
    title: strip_whitespace($('title').text()),
    // fall back to "now" when the page carries no publish time
    date: $('#ne_wrap').eq(0).attr('data-publishtime') || new Date(),
    author: $('.icon').eq(0).attr('alt') || '',
    source: strip_whitespace($('.post_info').children(':first').text()),
    summary: strip_whitespace($('meta[name="description"]').eq(0).attr('content')),
    content: strip_whitespace($('.post_body').text())
  };

  if (news.author === 'netease') {
    const author_text = strip_whitespace($('.post_author').text()).replace('Source:', '');
    const author_match = /Responsible editor:.+_/.exec(author_text);
    news.author = author_match
      ? author_match[0].replace('Responsible editor:', '').replace('_', '')
      : author_text;
  }

  if (news.source === 'report') {
    const first_node = $('.post_info').prop('firstChild');
    const source_text = strip_whitespace(first_node && first_node.nodeValue);
    if (source_text !== '') {
      const source_match = /.+source:/.exec(source_text);
      news.source = source_match ? source_text.replace(source_match[0], '') : source_text;
    }
  }
  return news;
}

/** Download one article page, parse it and insert it into the `news` table. */
function crawler_news_url(news_url) {
  request(news_url, function(err, res, body) {
    const $ = load_page(news_url, err, body);
    if (!$) {
      return;
    }
    let news;
    try {
      news = parse_news($, news_url);
    } catch (e) {
      // best effort: a page that cannot be parsed is skipped, not fatal
      console.log('News parsing error:' + e);
      return;
    }
    console.log(JSON.stringify(news));

    // Pages without body text are not stored.
    if (news.content === '') {
      return;
    }
    const news_add_sql = 'INSERT INTO news(url, source, url_encoding, title, keywords, author, date, crawler_time, summary, content) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)';
    const news_add = [news.url, news.source, news.url_encoding, news.title, news.keywords,
      news.author, news.date, news.crawler_time, news.summary, news.content
    ];
    crawler_sql.query(news_add_sql, news_add, function(qerr) {
      if (qerr) {
        console.log(qerr);
      }
    });
  });
}

crawler();
crawler_sina.js
/**
 * crawler_sina.js - one-shot crawler for the Sina news homepage.
 *
 * Fetches the homepage, keeps the links that match the article URL pattern,
 * skips the ones already stored in the `news` table, then downloads, parses
 * and stores each remaining article.
 */
const crawler_request = require('request');
const crawler_iconv = require('iconv-lite');
const crawler_cheerio = require('cheerio');
require('date-utils'); // provides the Date#toFormat call used below

// Database helper; used here as query(sql, params, callback(err, rows, fields)).
const crawler_sql = require('./crawler_sql.js');

const crawler_url = 'https://news.sina.com.cn/';
const page_encoding = 'UTF-8';

// Article pages look like
// https://news.sina.com.cn/c/xl/2021-04-29/doc-ikmyaawc2421496.shtml
// (the dot before "shtml" is escaped so it only matches a literal dot).
const news_reg = /\/(\d{4})-(\d{2})-(\d{2})\/doc-([a-zA-Z0-9]{15})\.shtml/;

/**
 * Issue an HTTP GET with a browser-like User-Agent.
 * @param {string} url - page to download
 * @param {function} callback - called as callback(err, res, body)
 */
function request(url, callback) {
  const options = {
    url: url,
    encoding: null, // keep the body undecoded; iconv-lite decodes it later
    headers: {
      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
    },
    timeout: 10000
  };
  crawler_request(options, callback);
}

/** Remove every whitespace character; null/undefined become ''. */
function strip_whitespace(text) {
  return (text || '').replace(/\s/g, '');
}

/**
 * Decode a response body and parse it with cheerio.
 * @returns the cheerio root function, or null when the request or decoding failed
 */
function load_page(url, err, body) {
  if (err || !body) {
    console.log('Page request error:' + url + ' ' + err);
    return null;
  }
  try {
    const html = crawler_iconv.decode(body, page_encoding);
    return crawler_cheerio.load(html, { decodeEntities: true });
  } catch (e) {
    console.log('Page decoding error:' + e);
    return null;
  }
}

/** Resolve an href against the homepage URL; returns null when it cannot be resolved. */
function resolve_url(href) {
  if (typeof href !== 'string' || href === '') {
    return null;
  }
  try {
    // Handles absolute, protocol-relative ("//host/..."), and relative links.
    return new URL(href, crawler_url).href;
  } catch (e) {
    console.log('Error getting news page' + e);
    return null;
  }
}

/** Crawl the homepage and hand every new article link to crawler_news_url(). */
function crawler() {
  request(crawler_url, function(err, res, body) {
    const $ = load_page(crawler_url, err, body);
    if (!$) {
      return;
    }
    // The same article is often linked several times on the homepage;
    // only look each URL up once per run.
    const seen = new Set();
    $('a').each(function(i, e) {
      const news_url = resolve_url($(e).attr('href'));
      if (!news_url) {
        return;
      }
      if (!news_reg.test(news_url)) {
        console.log('The news link does not conform to the format!');
        return;
      }
      if (seen.has(news_url)) {
        return;
      }
      seen.add(news_url);

      crawler_sql.query('select url from news where url=?', [news_url], function(qerr, vals) {
        if (qerr) {
          // vals is not usable when the query failed
          console.log(qerr);
          return;
        }
        if (vals.length > 0) {
          console.log('The news page has been crawled!');
        } else {
          crawler_news_url(news_url);
        }
      });
    });
  });
}

/**
 * Extract the news fields from a parsed article page.
 * NOTE(review): the literals 'year', 'month', 'day' and 'Responsible editor:'
 * look machine-translated; the live pages probably use Chinese labels -
 * confirm against the site before relying on these replacements.
 */
function parse_news($, news_url) {
  const date_text = $('.date').text()
    .replace('year', '-')
    .replace('month', '-')
    .replace('day', '');

  const news = {
    // date-utils mask: HH24 = 24-hour clock, MI = minutes (MM would be the month)
    crawler_time: (new Date()).toFormat('YYYY-MM-DD HH24:MI:SS'),
    url: news_url,
    url_encoding: page_encoding,
    keywords: $('meta[name="keywords"]').eq(0).attr('content') || '',
    title: strip_whitespace($('title').text()),
    // fall back to "now" when the page carries no date
    date: date_text !== '' ? date_text : new Date(),
    author: $('.show_author').text().replace('Responsible editor:', ''),
    source: $('meta[name="mediaid"]').eq(0).attr('content') || '',
    summary: strip_whitespace($('meta[name="description"]').eq(0).attr('content')),
    content: strip_whitespace($('.article').text())
  };

  // Alternative locations used by some page layouts.
  if (news.summary === '') {
    news.summary = strip_whitespace($('meta[property="og:description"]').eq(0).attr('content'));
  }
  if (news.content === '') {
    news.content = strip_whitespace($('#article_content').text());
  }
  // Articles without a named editor are attributed to their source.
  if (news.author === '') {
    news.author = news.source;
  }
  return news;
}

/** Download one article page, parse it and insert it into the `news` table. */
function crawler_news_url(news_url) {
  request(news_url, function(err, res, body) {
    const $ = load_page(news_url, err, body);
    if (!$) {
      return;
    }
    let news;
    try {
      news = parse_news($, news_url);
    } catch (e) {
      // best effort: a page that cannot be parsed is skipped, not fatal
      console.log('News parsing error:' + e);
      return;
    }
    console.log(JSON.stringify(news));

    // Only pages with both body text and a source are stored.
    if (news.content === '' || news.source === '') {
      return;
    }
    const news_add_sql = 'INSERT INTO news(url, source, url_encoding, title, keywords, author, date, crawler_time, summary, content) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)';
    const news_add = [news.url, news.source, news.url_encoding, news.title, news.keywords,
      news.author, news.date, news.crawler_time, news.summary, news.content
    ];
    crawler_sql.query(news_add_sql, news_add, function(qerr) {
      if (qerr) {
        console.log(qerr);
      }
    });
  });
}

crawler();
2. Introduce related packages
Create two new files: crawler_schedule_163.js and crawler_schedule_sina.js.
These two files are rewrites of crawler_163.js and crawler_sina.js respectively; both are modified in the same way.
Import the required package, node-schedule:
// Timed execution var crawler_schedule = require('node-schedule');
Establish timing rules:
var crawler_rule = new crawler_schedule.RecurrenceRule(); // crawler_rule.hour = [0, 12]; // crawler_rule.minute = 5; crawler_rule.second = 0;
In this rule, the `hour` property sets the hours at which the crawler is called; [0, 12] means hours 0 and 12. The `minute` property sets the minute within the hour; 5 means the fifth minute of each hour. The `second` property sets the second within the minute; 0 means the crawler starts at second 0 of a minute. Because the `hour` and `minute` lines are commented out above, only `second = 0` is in effect, so the crawler runs once every minute.
Comment out the main function that originally called the crawler:
// crawler();
Replace with:
// Call crawler() each time the recurrence rule fires.
crawler_schedule.scheduleJob(crawler_rule, function() {
    crawler();
});
3. Timing crawler code
crawler_schedule_163.js
/**
 * crawler_schedule_163.js - scheduled crawler for the NetEase news homepage.
 *
 * Same crawl as crawler_163.js, but instead of running once at start-up the
 * crawl is triggered by a node-schedule recurrence rule.
 */
const crawler_request = require('request');
const crawler_iconv = require('iconv-lite');
const crawler_cheerio = require('cheerio');
require('date-utils'); // provides the Date#toFormat call used below

// Database helper; used here as query(sql, params, callback(err, rows, fields)).
const crawler_sql = require('./crawler_sql.js');

// Timed execution
const crawler_schedule = require('node-schedule');

const crawler_rule = new crawler_schedule.RecurrenceRule();
// Optional restrictions, left disabled here:
// crawler_rule.hour = [0, 12];
// crawler_rule.minute = 5;
crawler_rule.second = 0;

const crawler_url = 'https://news.163.com/';
const page_encoding = 'UTF-8';

// Article pages look like https://www.163.com/news/article/G8HQOAKE0001899O.html
// (the dot before "html" is escaped so it only matches a literal dot).
const news_reg = /\/news\/article\/([a-zA-Z0-9]{16})\.html/;
// Article ids ending in 0001982T are deliberately skipped.
const news_reg_special = /\/news\/article\/([a-zA-Z0-9]{8})0001982T\.html/;

/**
 * Issue an HTTP GET with a browser-like User-Agent.
 * @param {string} url - page to download
 * @param {function} callback - called as callback(err, res, body)
 */
function request(url, callback) {
  const options = {
    url: url,
    encoding: null, // keep the body undecoded; iconv-lite decodes it later
    headers: {
      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
    },
    timeout: 10000
  };
  crawler_request(options, callback);
}

/** Remove every whitespace character; null/undefined become ''. */
function strip_whitespace(text) {
  return (text || '').replace(/\s/g, '');
}

/**
 * Decode a response body and parse it with cheerio.
 * @returns the cheerio root function, or null when the request or decoding failed
 */
function load_page(url, err, body) {
  if (err || !body) {
    console.log('Page request error:' + url + ' ' + err);
    return null;
  }
  try {
    const html = crawler_iconv.decode(body, page_encoding);
    return crawler_cheerio.load(html, { decodeEntities: true });
  } catch (e) {
    console.log('Page decoding error:' + e);
    return null;
  }
}

/** Resolve an href against the homepage URL; returns null when it cannot be resolved. */
function resolve_url(href) {
  if (typeof href !== 'string' || href === '') {
    return null;
  }
  try {
    // Handles absolute, protocol-relative ("//host/..."), and relative links.
    return new URL(href, crawler_url).href;
  } catch (e) {
    console.log('Error getting news page' + e);
    return null;
  }
}

/** Crawl the homepage and hand every new article link to crawler_news_url(). */
function crawler() {
  request(crawler_url, function(err, res, body) {
    const $ = load_page(crawler_url, err, body);
    if (!$) {
      // A failed run is simply skipped; the next scheduled run retries.
      return;
    }
    // The same article is often linked several times on the homepage;
    // only look each URL up once per run.
    const seen = new Set();
    $('a').each(function(i, e) {
      const news_url = resolve_url($(e).attr('href'));
      if (!news_url) {
        return;
      }
      if (!news_reg.test(news_url) || news_reg_special.test(news_url)) {
        console.log('The news link does not conform to the format!');
        return;
      }
      if (seen.has(news_url)) {
        return;
      }
      seen.add(news_url);

      crawler_sql.query('select url from news where url=?', [news_url], function(qerr, vals) {
        if (qerr) {
          // vals is not usable when the query failed
          console.log(qerr);
          return;
        }
        if (vals.length > 0) {
          console.log('The news page has been crawled!');
        } else {
          crawler_news_url(news_url);
        }
      });
    });
  });
}

/**
 * Extract the news fields from a parsed article page.
 * NOTE(review): the literals 'netease', 'report', 'Source:', 'source:' and
 * 'Responsible editor:' look machine-translated; the live pages probably use
 * Chinese labels - confirm against the site before relying on these branches.
 */
function parse_news($, news_url) {
  const news = {
    // date-utils mask: HH24 = 24-hour clock, MI = minutes (MM would be the month)
    crawler_time: (new Date()).toFormat('YYYY-MM-DD HH24:MI:SS'),
    url: news_url,
    url_encoding: page_encoding,
    keywords: $('meta[name="keywords"]').eq(0).attr('content') || '',
    title: strip_whitespace($('title').text()),
    // fall back to "now" when the page carries no publish time
    date: $('#ne_wrap').eq(0).attr('data-publishtime') || new Date(),
    author: $('.icon').eq(0).attr('alt') || '',
    source: strip_whitespace($('.post_info').children(':first').text()),
    summary: strip_whitespace($('meta[name="description"]').eq(0).attr('content')),
    content: strip_whitespace($('.post_body').text())
  };

  if (news.author === 'netease') {
    const author_text = strip_whitespace($('.post_author').text()).replace('Source:', '');
    const author_match = /Responsible editor:.+_/.exec(author_text);
    news.author = author_match
      ? author_match[0].replace('Responsible editor:', '').replace('_', '')
      : author_text;
  }

  if (news.source === 'report') {
    const first_node = $('.post_info').prop('firstChild');
    const source_text = strip_whitespace(first_node && first_node.nodeValue);
    if (source_text !== '') {
      const source_match = /.+source:/.exec(source_text);
      news.source = source_match ? source_text.replace(source_match[0], '') : source_text;
    }
  }
  return news;
}

/** Download one article page, parse it and insert it into the `news` table. */
function crawler_news_url(news_url) {
  request(news_url, function(err, res, body) {
    const $ = load_page(news_url, err, body);
    if (!$) {
      return;
    }
    let news;
    try {
      news = parse_news($, news_url);
    } catch (e) {
      // best effort: a page that cannot be parsed is skipped, not fatal
      console.log('News parsing error:' + e);
      return;
    }
    console.log(JSON.stringify(news));

    // Pages without body text are not stored.
    if (news.content === '') {
      return;
    }
    const news_add_sql = 'INSERT INTO news(url, source, url_encoding, title, keywords, author, date, crawler_time, summary, content) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)';
    const news_add = [news.url, news.source, news.url_encoding, news.title, news.keywords,
      news.author, news.date, news.crawler_time, news.summary, news.content
    ];
    crawler_sql.query(news_add_sql, news_add, function(qerr) {
      if (qerr) {
        console.log(qerr);
      }
    });
  });
}

// Crawl the homepage each time the recurrence rule fires.
crawler_schedule.scheduleJob(crawler_rule, function() {
  crawler();
});
crawler_schedule_sina.js
/**
 * crawler_schedule_sina.js - scheduled crawler for the Sina news homepage.
 *
 * Same crawl as crawler_sina.js, but instead of running once at start-up the
 * crawl is triggered by a node-schedule recurrence rule.
 */
const crawler_request = require('request');
const crawler_iconv = require('iconv-lite');
const crawler_cheerio = require('cheerio');
require('date-utils'); // provides the Date#toFormat call used below

// Database helper; used here as query(sql, params, callback(err, rows, fields)).
const crawler_sql = require('./crawler_sql.js');

// Timed execution
const crawler_schedule = require('node-schedule');

const crawler_rule = new crawler_schedule.RecurrenceRule();
// Optional restrictions, left disabled here:
// crawler_rule.hour = [0, 12];
// crawler_rule.minute = 5;
crawler_rule.second = 0;

const crawler_url = 'https://news.sina.com.cn/';
const page_encoding = 'UTF-8';

// Article pages look like
// https://news.sina.com.cn/c/xl/2021-04-29/doc-ikmyaawc2421496.shtml
// (the dot before "shtml" is escaped so it only matches a literal dot).
const news_reg = /\/(\d{4})-(\d{2})-(\d{2})\/doc-([a-zA-Z0-9]{15})\.shtml/;

/**
 * Issue an HTTP GET with a browser-like User-Agent.
 * @param {string} url - page to download
 * @param {function} callback - called as callback(err, res, body)
 */
function request(url, callback) {
  const options = {
    url: url,
    encoding: null, // keep the body undecoded; iconv-lite decodes it later
    headers: {
      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
    },
    timeout: 10000
  };
  crawler_request(options, callback);
}

/** Remove every whitespace character; null/undefined become ''. */
function strip_whitespace(text) {
  return (text || '').replace(/\s/g, '');
}

/**
 * Decode a response body and parse it with cheerio.
 * @returns the cheerio root function, or null when the request or decoding failed
 */
function load_page(url, err, body) {
  if (err || !body) {
    console.log('Page request error:' + url + ' ' + err);
    return null;
  }
  try {
    const html = crawler_iconv.decode(body, page_encoding);
    return crawler_cheerio.load(html, { decodeEntities: true });
  } catch (e) {
    console.log('Page decoding error:' + e);
    return null;
  }
}

/** Resolve an href against the homepage URL; returns null when it cannot be resolved. */
function resolve_url(href) {
  if (typeof href !== 'string' || href === '') {
    return null;
  }
  try {
    // Handles absolute, protocol-relative ("//host/..."), and relative links.
    return new URL(href, crawler_url).href;
  } catch (e) {
    console.log('Error getting news page' + e);
    return null;
  }
}

/** Crawl the homepage and hand every new article link to crawler_news_url(). */
function crawler() {
  request(crawler_url, function(err, res, body) {
    const $ = load_page(crawler_url, err, body);
    if (!$) {
      // A failed run is simply skipped; the next scheduled run retries.
      return;
    }
    // The same article is often linked several times on the homepage;
    // only look each URL up once per run.
    const seen = new Set();
    $('a').each(function(i, e) {
      const news_url = resolve_url($(e).attr('href'));
      if (!news_url) {
        return;
      }
      if (!news_reg.test(news_url)) {
        console.log('The news link does not conform to the format!');
        return;
      }
      if (seen.has(news_url)) {
        return;
      }
      seen.add(news_url);

      crawler_sql.query('select url from news where url=?', [news_url], function(qerr, vals) {
        if (qerr) {
          // vals is not usable when the query failed
          console.log(qerr);
          return;
        }
        if (vals.length > 0) {
          console.log('The news page has been crawled!');
        } else {
          crawler_news_url(news_url);
        }
      });
    });
  });
}

/**
 * Extract the news fields from a parsed article page.
 * NOTE(review): the literals 'year', 'month', 'day' and 'Responsible editor:'
 * look machine-translated; the live pages probably use Chinese labels -
 * confirm against the site before relying on these replacements.
 */
function parse_news($, news_url) {
  const date_text = $('.date').text()
    .replace('year', '-')
    .replace('month', '-')
    .replace('day', '');

  const news = {
    // date-utils mask: HH24 = 24-hour clock, MI = minutes (MM would be the month)
    crawler_time: (new Date()).toFormat('YYYY-MM-DD HH24:MI:SS'),
    url: news_url,
    url_encoding: page_encoding,
    keywords: $('meta[name="keywords"]').eq(0).attr('content') || '',
    title: strip_whitespace($('title').text()),
    // fall back to "now" when the page carries no date
    date: date_text !== '' ? date_text : new Date(),
    author: $('.show_author').text().replace('Responsible editor:', ''),
    source: $('meta[name="mediaid"]').eq(0).attr('content') || '',
    summary: strip_whitespace($('meta[name="description"]').eq(0).attr('content')),
    content: strip_whitespace($('.article').text())
  };

  // Alternative locations used by some page layouts.
  if (news.summary === '') {
    news.summary = strip_whitespace($('meta[property="og:description"]').eq(0).attr('content'));
  }
  if (news.content === '') {
    news.content = strip_whitespace($('#article_content').text());
  }
  // Articles without a named editor are attributed to their source.
  if (news.author === '') {
    news.author = news.source;
  }
  return news;
}

/** Download one article page, parse it and insert it into the `news` table. */
function crawler_news_url(news_url) {
  request(news_url, function(err, res, body) {
    const $ = load_page(news_url, err, body);
    if (!$) {
      return;
    }
    let news;
    try {
      news = parse_news($, news_url);
    } catch (e) {
      // best effort: a page that cannot be parsed is skipped, not fatal
      console.log('News parsing error:' + e);
      return;
    }
    console.log(JSON.stringify(news));

    // Only pages with both body text and a source are stored.
    if (news.content === '' || news.source === '') {
      return;
    }
    const news_add_sql = 'INSERT INTO news(url, source, url_encoding, title, keywords, author, date, crawler_time, summary, content) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)';
    const news_add = [news.url, news.source, news.url_encoding, news.title, news.keywords,
      news.author, news.date, news.crawler_time, news.summary, news.content
    ];
    crawler_sql.query(news_add_sql, news_add, function(qerr) {
      if (qerr) {
        console.log(qerr);
      }
    });
  });
}

// Crawl the homepage each time the recurrence rule fires.
crawler_schedule.scheduleJob(crawler_rule, function() {
  crawler();
});