Implementing a web news crawler and search site with Node.js

Posted by canobi on Fri, 18 Feb 2022 11:33:58 +0100

Node.js news crawler and web search (JS)

Project requirements

1, Crawler part
1. Complete the web page analysis and crawler design of the target website.
2. Crawl no less than 100 pieces of data (each data includes 7 fields, news keywords, news titles, news dates, news authors, news sources, news abstracts and news contents), and store them in the database.

2, Search site section
1. Implement a search over the content and titles of the crawled news stored in the database, and display the search results on the front-end page in tabular form.
2. Implement a time-based popularity ("heat") analysis of the search term: use a table to show, for each day, how many crawled news items contain the search term.

This article is the third part of the project: rewriting the crawlers so that they run on a timer.

2, Timed crawler rewrite

After parts (1) and (2) of this series we have two Node.js crawlers. Because news websites constantly publish new articles, we need a timing function that calls each crawler at specified times so that the crawled news stays up to date.

1. Review existing codes

crawler_163.js

var crawler_request = require('request');
var crawler_iconv = require('iconv-lite');
var crawler_cheerio = require('cheerio');
require('date-utils');

// Connect database
var crawler_sql = require("./crawler_sql.js");
// var crawler_sql = require("mysql");
// var pool = crawler_sql.createPool({
//     host: '127.0.0.1',
//     user: 'root',
//     password: 'root',
//     database: 'crawl'
// });
// var query = function(sql, sqlparam, callback) {
//     pool.getConnection(function(err, conn) {
//         if (err) {
//             callback(err, null, null);
//         } else {
//             conn.query(sql, sqlparam, function(qerr, vals, fields) {
//                 conn.release(); // Release connection 
//                 callback(qerr, vals, fields); // Event driven callback 
//             });
//         }
//     });
// };
// var query_noparam = function(sql, callback) {
//     pool.getConnection(function(err, conn) {
//         if (err) {
//             callback(err, null, null);
//         } else {
//             conn.query(sql, function(qerr, vals, fields) {
//                 conn.release(); // Release connection 
//                 callback(qerr, vals, fields); // Event driven callback 
//             });
//         }
//     });
// };
// exports.query = query;
// exports.query_noparam = query_noparam;

// Crawl the homepage
/**
 * Issue an HTTP GET through the `request` library with the crawler's
 * standard settings and pass the outcome to `callback`.
 *
 * @param {string} url Address of the page to download.
 * @param {function} callback Invoked by the `request` library as
 *     `(err, res, body)`.
 */
function request(url, callback) {
    // Desktop Chrome identity sent with every request.
    var user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36';

    crawler_request({
        url: url,
        // No automatic decoding: callers run the body through iconv-lite.
        encoding: null,
        headers: { 'User-Agent': user_agent },
        // Milliseconds before the request is abandoned.
        timeout: 10000
    }, callback);
}

// Homepage that seeds the crawl; crawler() fetches this page and also uses it
// as the base when resolving relative links.
var crawler_url = 'https://news.163.com/';
// Run one crawl as soon as the script is loaded (crawler is a hoisted
// function declaration, so calling it before its definition is fine).
crawler();

/**
 * Crawl the NetEase news homepage (`crawler_url`) and pass every article link
 * that is not yet stored in the `news` table to `crawler_news_url`.
 *
 * Flow: download the homepage, parse it with cheerio, turn each <a href> into
 * an absolute URL, keep only URLs shaped like article pages, then look each
 * one up in the database and crawl it when no row exists.
 *
 * A failed request, an undecodable page or a failed database lookup is logged
 * and ends that step; nothing is thrown to the caller.
 */
function crawler() {
    // Article pages look like https://www.163.com/news/article/G8HQOAKE0001899O.html
    var news_reg = /\/news\/article\/([a-zA-Z0-9]{16})\.html/;
    // Article ids ending in 0001982T are deliberately excluded.
    var news_reg_special = /\/news\/article\/([a-zA-Z0-9]{8})0001982T\.html/;
    // Only a scheme at the very start of the href makes it absolute.
    var absolute_reg = /^https?:\/\//i;

    request(crawler_url, function(err, res, body) {
        // Without a body there is nothing to parse; bail out instead of
        // crashing later on an undefined cheerio handle.
        if (err || !body) {
            console.log('Homepage request error:' + err);
            return;
        }
        var $;
        try {
            var url_html = crawler_iconv.decode(body, 'UTF-8');
            $ = crawler_cheerio.load(url_html, { decodeEntities: true });
        } catch (e) {
            console.log('Page decoding error:' + e);
            return;
        }

        // The homepage links the same article several times; remember what was
        // already handled in this run so it is looked up and crawled once.
        var seen_urls = new Set();

        $('a').each(function(i, e) {
            var url_href = $(e).attr("href");
            if (typeof url_href !== "string") {
                return;
            }
            url_href = url_href.trim();

            // Normalise to an absolute URL.
            var news_url;
            if (absolute_reg.test(url_href)) {
                news_url = url_href;
            } else if (url_href.startsWith('//')) {
                news_url = 'https:' + url_href;
            } else {
                news_url = crawler_url.substr(0, crawler_url.lastIndexOf('/') + 1) + url_href;
            }

            if (!news_reg.test(news_url) || news_reg_special.test(news_url)) {
                console.log('The news link does not conform to the format!');
                return;
            }
            if (seen_urls.has(news_url)) {
                return;
            }
            seen_urls.add(news_url);

            // Crawl the article only when its URL is not in the database yet.
            var news_search_sql = 'select url from news where url=?';
            crawler_sql.query(news_search_sql, [news_url], function(qerr, vals, fields) {
                if (qerr) {
                    // vals is not usable after a failed query.
                    console.log(qerr);
                    return;
                }
                if (vals && vals.length > 0) {
                    console.log('The news page has been crawled!');
                } else {
                    crawler_news_url(news_url);
                }
            });
        });
    });
}

// Crawl news links
/**
 * Download one NetEase article page, extract its fields and insert a row into
 * the `news` table when the article has content.
 *
 * Extracted fields: keywords, title, date, author, source, summary, content.
 * Each field is extracted best-effort: a failure is logged and the field keeps
 * its default ('' for text, the current time for the date).
 *
 * @param {string} news_url Absolute URL of the article page.
 */
function crawler_news_url(news_url) {
    // Remove every whitespace character from a piece of extracted text.
    function strip_space(text) {
        return String(text).replace(/\s/g, "");
    }

    // Run one extractor. A thrown error is logged with `label`; a thrown
    // error or a missing (undefined/null) result yields `fallback`.
    function extract(label, fallback, fn) {
        try {
            var value = fn();
            return (value === undefined || value === null) ? fallback : value;
        } catch (e) {
            console.log(label + e);
            return fallback;
        }
    }

    request(news_url, function(err, res, body) {
        // Nothing to parse when the download failed.
        if (err || !body) {
            console.log('News page request error:' + err);
            return;
        }
        var $;
        try {
            var url_html = crawler_iconv.decode(body, 'UTF-8');
            $ = crawler_cheerio.load(url_html, { decodeEntities: true });
        } catch (e) {
            console.log('Page decoding error:' + e);
            return;
        }

        // Define news information json
        var news = {};
        // date-utils tokens: HH24 = 24-hour clock, MI = minutes (MM is the month).
        news.crawler_time = (new Date()).toFormat("YYYY-MM-DD HH24:MI:SS");
        news.url = news_url;
        news.url_encoding = 'UTF-8';
        news.date = new Date();

        // NOTE(review): the literals 'netease', "Source:", "Responsible editor:",
        // 'report' and 'source:' below look machine-translated from the Chinese
        // text on the live page - confirm them against the real markup.

        news.keywords = extract('News keyword acquisition error:', '', function() {
            return $('meta[name="keywords"]').eq(0).attr("content");
        });

        news.title = extract('News Title acquisition error:', '', function() {
            return strip_space($('title').text());
        });

        // Keep the default (now) when the page carries no publish time.
        news.date = extract('News date acquisition error:', news.date, function() {
            return $('#ne_wrap').eq(0).attr("data-publishtime") || undefined;
        });

        news.author = extract('News author get error:', '', function() {
            var author = $('.icon').eq(0).attr("alt");
            if (author === 'netease') {
                author = strip_space($('.post_author').text()).replace("Source:", "");
                var editor = /Responsible editor:.+_/.exec(author);
                if (editor) {
                    author = editor[0].replace("Responsible editor:", "").replace("_", "");
                }
            }
            return author;
        });

        news.source = extract('News source acquisition error:', '', function() {
            var source = strip_space($('.post_info').children(':first').text());
            if (source === 'report') {
                // Fall back to the leading text node of .post_info.
                var first_node = $('.post_info').prop('firstChild');
                source = strip_space((first_node && first_node.nodeValue) || '');
                var prefix = /.+source:/.exec(source);
                if (prefix) {
                    source = source.replace(prefix[0], "");
                }
            }
            return source;
        });

        news.summary = extract('News summary get error:', '', function() {
            return strip_space($('meta[name="description"]').eq(0).attr("content") || '');
        });

        news.content = extract('News content acquisition error:', '', function() {
            return strip_space($('.post_body').text());
        });

        console.log(JSON.stringify(news));

        // Write to database; pages without article text are skipped.
        if (news.content !== '') {
            var news_add_sql = 'INSERT INTO news(url, source, url_encoding, title, keywords, author, date, crawler_time, summary, content) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)';
            var news_add = [news.url, news.source, news.url_encoding,
                news.title, news.keywords, news.author, news.date,
                news.crawler_time, news.summary, news.content
            ];
            crawler_sql.query(news_add_sql, news_add, function(qerr, vals, fields) {
                if (qerr) {
                    console.log(qerr);
                }
            });
        }
    });
}

crawler_sina.js

var crawler_request = require('request');
var crawler_iconv = require('iconv-lite');
var crawler_cheerio = require('cheerio');
require('date-utils');

// Connect database
var crawler_sql = require("./crawler_sql.js");

// Crawl the homepage
/**
 * Issue an HTTP GET through the `request` library with the crawler's
 * standard settings and pass the outcome to `callback`.
 *
 * @param {string} url Address of the page to download.
 * @param {function} callback Invoked by the `request` library as
 *     `(err, res, body)`.
 */
function request(url, callback) {
    // Desktop Chrome identity sent with every request.
    var user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36';

    crawler_request({
        url: url,
        // No automatic decoding: callers run the body through iconv-lite.
        encoding: null,
        headers: { 'User-Agent': user_agent },
        // Milliseconds before the request is abandoned.
        timeout: 10000
    }, callback);
}

// Homepage that seeds the crawl; crawler() fetches this page and also uses it
// as the base when resolving relative links.
var crawler_url = 'https://news.sina.com.cn/';
// Run one crawl as soon as the script is loaded (crawler is a hoisted
// function declaration, so calling it before its definition is fine).
crawler();

/**
 * Crawl the Sina news homepage (`crawler_url`) and pass every article link
 * that is not yet stored in the `news` table to `crawler_news_url`.
 *
 * Flow: download the homepage, parse it with cheerio, turn each <a href> into
 * an absolute URL, keep only URLs shaped like article pages, then look each
 * one up in the database and crawl it when no row exists.
 *
 * A failed request, an undecodable page or a failed database lookup is logged
 * and ends that step; nothing is thrown to the caller.
 */
function crawler() {
    // Article pages look like
    // https://news.sina.com.cn/c/xl/2021-04-29/doc-ikmyaawc2421496.shtml
    var news_reg = /\/(\d{4})-(\d{2})-(\d{2})\/doc-([a-zA-Z0-9]{15})\.shtml/;
    // Only a scheme at the very start of the href makes it absolute.
    var absolute_reg = /^https?:\/\//i;

    request(crawler_url, function(err, res, body) {
        // Without a body there is nothing to parse; bail out instead of
        // crashing later on an undefined cheerio handle.
        if (err || !body) {
            console.log('Homepage request error:' + err);
            return;
        }
        var $;
        try {
            var url_html = crawler_iconv.decode(body, 'UTF-8');
            $ = crawler_cheerio.load(url_html, { decodeEntities: true });
        } catch (e) {
            console.log('Page decoding error:' + e);
            return;
        }

        // The homepage links the same article several times; remember what was
        // already handled in this run so it is looked up and crawled once.
        var seen_urls = new Set();

        $('a').each(function(i, e) {
            var url_href = $(e).attr("href");
            if (typeof url_href !== "string") {
                return;
            }
            url_href = url_href.trim();

            // Normalise to an absolute URL.
            var news_url;
            if (absolute_reg.test(url_href)) {
                news_url = url_href;
            } else if (url_href.startsWith('//')) {
                news_url = 'https:' + url_href;
            } else {
                news_url = crawler_url.substr(0, crawler_url.lastIndexOf('/') + 1) + url_href;
            }

            if (!news_reg.test(news_url)) {
                console.log('The news link does not conform to the format!');
                return;
            }
            if (seen_urls.has(news_url)) {
                return;
            }
            seen_urls.add(news_url);

            // Crawl the article only when its URL is not in the database yet.
            var news_search_sql = 'select url from news where url=?';
            crawler_sql.query(news_search_sql, [news_url], function(qerr, vals, fields) {
                if (qerr) {
                    // vals is not usable after a failed query.
                    console.log(qerr);
                    return;
                }
                if (vals && vals.length > 0) {
                    console.log('The news page has been crawled!');
                } else {
                    crawler_news_url(news_url);
                }
            });
        });
    });
}

// Crawl news links
/**
 * Download one Sina article page, extract its fields and insert a row into
 * the `news` table when the article has both content and a source.
 *
 * Extracted fields: keywords, title, date, author, source, summary, content.
 * Each field is extracted best-effort: a failure is logged and the field keeps
 * its default ('' for text, the current time for the date).
 *
 * @param {string} news_url Absolute URL of the article page.
 */
function crawler_news_url(news_url) {
    // Remove every whitespace character from a piece of extracted text.
    function strip_space(text) {
        return String(text).replace(/\s/g, "");
    }

    // Run one extractor. A thrown error is logged with `label`; a thrown
    // error or a missing (undefined/null) result yields `fallback`.
    function extract(label, fallback, fn) {
        try {
            var value = fn();
            return (value === undefined || value === null) ? fallback : value;
        } catch (e) {
            console.log(label + e);
            return fallback;
        }
    }

    request(news_url, function(err, res, body) {
        // Nothing to parse when the download failed.
        if (err || !body) {
            console.log('News page request error:' + err);
            return;
        }
        var $;
        try {
            var url_html = crawler_iconv.decode(body, 'UTF-8');
            $ = crawler_cheerio.load(url_html, { decodeEntities: true });
        } catch (e) {
            console.log('Page decoding error:' + e);
            return;
        }

        // Read the `content` attribute of the first element matching `selector`.
        function meta_content(selector) {
            return $(selector).eq(0).attr("content");
        }

        // Define news information json
        var news = {};
        // date-utils tokens: HH24 = 24-hour clock, MI = minutes (MM is the month).
        news.crawler_time = (new Date()).toFormat("YYYY-MM-DD HH24:MI:SS");
        news.url = news_url;
        news.url_encoding = 'UTF-8';
        news.date = new Date();

        // NOTE(review): the literals 'year', 'month', 'day' and
        // "Responsible editor:" below look machine-translated from the Chinese
        // text on the live page - confirm them against the real markup.

        news.keywords = extract('News keyword acquisition error:', '', function() {
            return meta_content('meta[name="keywords"]');
        });

        news.title = extract('Get news title error:', '', function() {
            return strip_space($('title').text());
        });

        // Keep the default (now) when the page shows no date text.
        news.date = extract('News date acquisition error:', news.date, function() {
            var date_text = $('.date').text();
            if (date_text.trim() === '') {
                return undefined;
            }
            return date_text.replace('year', '-').replace('month', '-').replace('day', '');
        });

        news.author = extract('News author get error:', '', function() {
            return $('.show_author').text().replace("Responsible editor:", "");
        });

        news.source = extract('News source acquisition error:', '', function() {
            return meta_content('meta[name="mediaid"]');
        });

        // Prefer the description meta tag; fall back to og:description when it
        // is absent or empty.
        news.summary = extract('News summary get error:', '', function() {
            var summary = strip_space(meta_content('meta[name="description"]') || '');
            if (summary === "") {
                summary = strip_space(meta_content('meta[property="og:description"]') || '');
            }
            return summary;
        });

        // The article body lives in .article, or in #article_content otherwise.
        news.content = extract('News content acquisition error:', '', function() {
            var content = strip_space($('.article').text());
            if (content === "") {
                content = strip_space($('#article_content').text());
            }
            return content;
        });

        console.log(JSON.stringify(news));

        // Write to database
        if (news.author === '' || news.author == null) {
            // No editor listed: credit the publishing source instead.
            news.author = news.source;
        }
        if (news.content !== '' && news.source !== '') {
            var news_add_sql = 'INSERT INTO news(url, source, url_encoding, title, keywords, author, date, crawler_time, summary, content) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)';
            var news_add = [news.url, news.source, news.url_encoding,
                news.title, news.keywords, news.author, news.date,
                news.crawler_time, news.summary, news.content
            ];
            crawler_sql.query(news_add_sql, news_add, function(qerr, vals, fields) {
                if (qerr) {
                    console.log(qerr);
                }
            });
        }
    });
}

2. Introduce related packages

Create two new files: crawler_schedule_163.js and crawler_schedule_sina.js.

These two files are rewrites of crawler_163.js and crawler_sina.js respectively; both are rewritten in the same way.

Import the required package, node-schedule:

// Timed execution
var crawler_schedule = require('node-schedule');

Establish timing rules:

// Recurrence rule that describes when the crawler should fire.
var crawler_rule = new crawler_schedule.RecurrenceRule();
// Alternative schedules, left disabled:
// crawler_rule.hour = [0, 12];
// crawler_rule.minute = 5;
// Only `second` is set, so the rule matches second 0 of every minute.
crawler_rule.second = 0;

Here, the hour property sets the hours at which the crawler is called; in this case, [0, 12] means that the crawler is executed at 0:00 and 12:00. The minute property sets the minute at which the crawler is called; in this case, 5 refers to the fifth minute of each hour. The second property sets the second at which the crawler is called; in this case, 0 means that the crawler starts at the 0th second of each minute.
Comment out the main function that originally called the crawler:

// crawler();

Replace with:

// Register the job: every time crawler_rule matches, start a homepage crawl.
crawler_schedule.scheduleJob(crawler_rule, function() {
    crawler();
});

3. Timing crawler code

crawler_schedule_163.js

var crawler_request = require('request');
var crawler_iconv = require('iconv-lite');
var crawler_cheerio = require('cheerio');
require('date-utils');

// Connect database
var crawler_sql = require("./crawler_sql.js");

// Timed execution
var crawler_schedule = require('node-schedule');
// Recurrence rule that describes when the crawler should fire.
var crawler_rule = new crawler_schedule.RecurrenceRule();
// Alternative schedules, left disabled:
// crawler_rule.hour = [0, 12];
// crawler_rule.minute = 5;
// Only `second` is set, so the rule matches second 0 of every minute.
crawler_rule.second = 0;
// Register the job: every time the rule matches, start a homepage crawl
// (crawler is a hoisted function declaration defined further down).
crawler_schedule.scheduleJob(crawler_rule, function() {
    crawler();
});

// Crawl the homepage regularly
/**
 * Issue an HTTP GET through the `request` library with the crawler's
 * standard settings and pass the outcome to `callback`.
 *
 * @param {string} url Address of the page to download.
 * @param {function} callback Invoked by the `request` library as
 *     `(err, res, body)`.
 */
function request(url, callback) {
    // Desktop Chrome identity sent with every request.
    var user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36';

    crawler_request({
        url: url,
        // No automatic decoding: callers run the body through iconv-lite.
        encoding: null,
        headers: { 'User-Agent': user_agent },
        // Milliseconds before the request is abandoned.
        timeout: 10000
    }, callback);
}

// Homepage that seeds every scheduled crawl; crawler() fetches this page and
// also uses it as the base when resolving relative links.
var crawler_url = 'https://news.163.com/';

/**
 * Crawl the NetEase news homepage (`crawler_url`) and pass every article link
 * that is not yet stored in the `news` table to `crawler_news_url`.
 *
 * Flow: download the homepage, parse it with cheerio, turn each <a href> into
 * an absolute URL, keep only URLs shaped like article pages, then look each
 * one up in the database and crawl it when no row exists.
 *
 * A failed request, an undecodable page or a failed database lookup is logged
 * and ends that step; nothing is thrown, so a bad run cannot take down the
 * process that hosts the scheduled job.
 */
function crawler() {
    // Article pages look like https://www.163.com/news/article/G8HQOAKE0001899O.html
    var news_reg = /\/news\/article\/([a-zA-Z0-9]{16})\.html/;
    // Article ids ending in 0001982T are deliberately excluded.
    var news_reg_special = /\/news\/article\/([a-zA-Z0-9]{8})0001982T\.html/;
    // Only a scheme at the very start of the href makes it absolute.
    var absolute_reg = /^https?:\/\//i;

    request(crawler_url, function(err, res, body) {
        // Without a body there is nothing to parse; bail out instead of
        // crashing later on an undefined cheerio handle.
        if (err || !body) {
            console.log('Homepage request error:' + err);
            return;
        }
        var $;
        try {
            var url_html = crawler_iconv.decode(body, 'UTF-8');
            $ = crawler_cheerio.load(url_html, { decodeEntities: true });
        } catch (e) {
            console.log('Page decoding error:' + e);
            return;
        }

        // The homepage links the same article several times; remember what was
        // already handled in this run so it is looked up and crawled once.
        var seen_urls = new Set();

        $('a').each(function(i, e) {
            var url_href = $(e).attr("href");
            if (typeof url_href !== "string") {
                return;
            }
            url_href = url_href.trim();

            // Normalise to an absolute URL.
            var news_url;
            if (absolute_reg.test(url_href)) {
                news_url = url_href;
            } else if (url_href.startsWith('//')) {
                news_url = 'https:' + url_href;
            } else {
                news_url = crawler_url.substr(0, crawler_url.lastIndexOf('/') + 1) + url_href;
            }

            if (!news_reg.test(news_url) || news_reg_special.test(news_url)) {
                console.log('The news link does not conform to the format!');
                return;
            }
            if (seen_urls.has(news_url)) {
                return;
            }
            seen_urls.add(news_url);

            // Crawl the article only when its URL is not in the database yet.
            var news_search_sql = 'select url from news where url=?';
            crawler_sql.query(news_search_sql, [news_url], function(qerr, vals, fields) {
                if (qerr) {
                    // vals is not usable after a failed query.
                    console.log(qerr);
                    return;
                }
                if (vals && vals.length > 0) {
                    console.log('The news page has been crawled!');
                } else {
                    crawler_news_url(news_url);
                }
            });
        });
    });
}

// Crawl news links
/**
 * Download one NetEase article page, extract its fields and insert a row into
 * the `news` table when the article has content.
 *
 * Extracted fields: keywords, title, date, author, source, summary, content.
 * Each field is extracted best-effort: a failure is logged and the field keeps
 * its default ('' for text, the current time for the date).
 *
 * @param {string} news_url Absolute URL of the article page.
 */
function crawler_news_url(news_url) {
    // Remove every whitespace character from a piece of extracted text.
    function strip_space(text) {
        return String(text).replace(/\s/g, "");
    }

    // Run one extractor. A thrown error is logged with `label`; a thrown
    // error or a missing (undefined/null) result yields `fallback`.
    function extract(label, fallback, fn) {
        try {
            var value = fn();
            return (value === undefined || value === null) ? fallback : value;
        } catch (e) {
            console.log(label + e);
            return fallback;
        }
    }

    request(news_url, function(err, res, body) {
        // Nothing to parse when the download failed.
        if (err || !body) {
            console.log('News page request error:' + err);
            return;
        }
        var $;
        try {
            var url_html = crawler_iconv.decode(body, 'UTF-8');
            $ = crawler_cheerio.load(url_html, { decodeEntities: true });
        } catch (e) {
            console.log('Page decoding error:' + e);
            return;
        }

        // Define news information json
        var news = {};
        // date-utils tokens: HH24 = 24-hour clock, MI = minutes (MM is the month).
        news.crawler_time = (new Date()).toFormat("YYYY-MM-DD HH24:MI:SS");
        news.url = news_url;
        news.url_encoding = 'UTF-8';
        news.date = new Date();

        // NOTE(review): the literals 'netease', "Source:", "Responsible editor:",
        // 'report' and 'source:' below look machine-translated from the Chinese
        // text on the live page - confirm them against the real markup.

        news.keywords = extract('News keyword acquisition error:', '', function() {
            return $('meta[name="keywords"]').eq(0).attr("content");
        });

        news.title = extract('News Title acquisition error:', '', function() {
            return strip_space($('title').text());
        });

        // Keep the default (now) when the page carries no publish time.
        news.date = extract('News date acquisition error:', news.date, function() {
            return $('#ne_wrap').eq(0).attr("data-publishtime") || undefined;
        });

        news.author = extract('News author get error:', '', function() {
            var author = $('.icon').eq(0).attr("alt");
            if (author === 'netease') {
                author = strip_space($('.post_author').text()).replace("Source:", "");
                var editor = /Responsible editor:.+_/.exec(author);
                if (editor) {
                    author = editor[0].replace("Responsible editor:", "").replace("_", "");
                }
            }
            return author;
        });

        news.source = extract('News source acquisition error:', '', function() {
            var source = strip_space($('.post_info').children(':first').text());
            if (source === 'report') {
                // Fall back to the leading text node of .post_info.
                var first_node = $('.post_info').prop('firstChild');
                source = strip_space((first_node && first_node.nodeValue) || '');
                var prefix = /.+source:/.exec(source);
                if (prefix) {
                    source = source.replace(prefix[0], "");
                }
            }
            return source;
        });

        news.summary = extract('News summary get error:', '', function() {
            return strip_space($('meta[name="description"]').eq(0).attr("content") || '');
        });

        news.content = extract('News content acquisition error:', '', function() {
            return strip_space($('.post_body').text());
        });

        console.log(JSON.stringify(news));

        // Write to database; pages without article text are skipped.
        if (news.content !== '') {
            var news_add_sql = 'INSERT INTO news(url, source, url_encoding, title, keywords, author, date, crawler_time, summary, content) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)';
            var news_add = [news.url, news.source, news.url_encoding,
                news.title, news.keywords, news.author, news.date,
                news.crawler_time, news.summary, news.content
            ];
            crawler_sql.query(news_add_sql, news_add, function(qerr, vals, fields) {
                if (qerr) {
                    console.log(qerr);
                }
            });
        }
    });
}

crawler_schedule_sina.js

var crawler_request = require('request');
var crawler_iconv = require('iconv-lite');
var crawler_cheerio = require('cheerio');
require('date-utils');

// Connect database
var crawler_sql = require("./crawler_sql.js");

// Timed execution
var crawler_schedule = require('node-schedule');
// Recurrence rule that describes when the crawler should fire.
var crawler_rule = new crawler_schedule.RecurrenceRule();
// Alternative schedules, left disabled:
// crawler_rule.hour = [0, 12];
// crawler_rule.minute = 5;
// Only `second` is set, so the rule matches second 0 of every minute.
crawler_rule.second = 0;
// Register the job: every time the rule matches, start a homepage crawl
// (crawler is a hoisted function declaration defined further down).
crawler_schedule.scheduleJob(crawler_rule, function() {
    crawler();
});

// Crawl the homepage regularly
/**
 * Issue an HTTP GET through the `request` library with the crawler's
 * standard settings and pass the outcome to `callback`.
 *
 * @param {string} url Address of the page to download.
 * @param {function} callback Invoked by the `request` library as
 *     `(err, res, body)`.
 */
function request(url, callback) {
    // Desktop Chrome identity sent with every request.
    var user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36';

    crawler_request({
        url: url,
        // No automatic decoding: callers run the body through iconv-lite.
        encoding: null,
        headers: { 'User-Agent': user_agent },
        // Milliseconds before the request is abandoned.
        timeout: 10000
    }, callback);
}

// Homepage that seeds every scheduled crawl; crawler() fetches this page and
// also uses it as the base when resolving relative links.
var crawler_url = 'https://news.sina.com.cn/';

/**
 * Crawl the Sina news homepage (`crawler_url`) and pass every article link
 * that is not yet stored in the `news` table to `crawler_news_url`.
 *
 * Flow: download the homepage, parse it with cheerio, turn each <a href> into
 * an absolute URL, keep only URLs shaped like article pages, then look each
 * one up in the database and crawl it when no row exists.
 *
 * A failed request, an undecodable page or a failed database lookup is logged
 * and ends that step; nothing is thrown, so a bad run cannot take down the
 * process that hosts the scheduled job.
 */
function crawler() {
    // Article pages look like
    // https://news.sina.com.cn/c/xl/2021-04-29/doc-ikmyaawc2421496.shtml
    var news_reg = /\/(\d{4})-(\d{2})-(\d{2})\/doc-([a-zA-Z0-9]{15})\.shtml/;
    // Only a scheme at the very start of the href makes it absolute.
    var absolute_reg = /^https?:\/\//i;

    request(crawler_url, function(err, res, body) {
        // Without a body there is nothing to parse; bail out instead of
        // crashing later on an undefined cheerio handle.
        if (err || !body) {
            console.log('Homepage request error:' + err);
            return;
        }
        var $;
        try {
            var url_html = crawler_iconv.decode(body, 'UTF-8');
            $ = crawler_cheerio.load(url_html, { decodeEntities: true });
        } catch (e) {
            console.log('Page decoding error:' + e);
            return;
        }

        // The homepage links the same article several times; remember what was
        // already handled in this run so it is looked up and crawled once.
        var seen_urls = new Set();

        $('a').each(function(i, e) {
            var url_href = $(e).attr("href");
            if (typeof url_href !== "string") {
                return;
            }
            url_href = url_href.trim();

            // Normalise to an absolute URL.
            var news_url;
            if (absolute_reg.test(url_href)) {
                news_url = url_href;
            } else if (url_href.startsWith('//')) {
                news_url = 'https:' + url_href;
            } else {
                news_url = crawler_url.substr(0, crawler_url.lastIndexOf('/') + 1) + url_href;
            }

            if (!news_reg.test(news_url)) {
                console.log('The news link does not conform to the format!');
                return;
            }
            if (seen_urls.has(news_url)) {
                return;
            }
            seen_urls.add(news_url);

            // Crawl the article only when its URL is not in the database yet.
            var news_search_sql = 'select url from news where url=?';
            crawler_sql.query(news_search_sql, [news_url], function(qerr, vals, fields) {
                if (qerr) {
                    // vals is not usable after a failed query.
                    console.log(qerr);
                    return;
                }
                if (vals && vals.length > 0) {
                    console.log('The news page has been crawled!');
                } else {
                    crawler_news_url(news_url);
                }
            });
        });
    });
}

// Crawl news links
/**
 * Download one Sina article page, extract its fields and insert a row into
 * the `news` table when the article has both content and a source.
 *
 * Extracted fields: keywords, title, date, author, source, summary, content.
 * Each field is extracted best-effort: a failure is logged and the field keeps
 * its default ('' for text, the current time for the date).
 *
 * @param {string} news_url Absolute URL of the article page.
 */
function crawler_news_url(news_url) {
    // Remove every whitespace character from a piece of extracted text.
    function strip_space(text) {
        return String(text).replace(/\s/g, "");
    }

    // Run one extractor. A thrown error is logged with `label`; a thrown
    // error or a missing (undefined/null) result yields `fallback`.
    function extract(label, fallback, fn) {
        try {
            var value = fn();
            return (value === undefined || value === null) ? fallback : value;
        } catch (e) {
            console.log(label + e);
            return fallback;
        }
    }

    request(news_url, function(err, res, body) {
        // Nothing to parse when the download failed.
        if (err || !body) {
            console.log('News page request error:' + err);
            return;
        }
        var $;
        try {
            var url_html = crawler_iconv.decode(body, 'UTF-8');
            $ = crawler_cheerio.load(url_html, { decodeEntities: true });
        } catch (e) {
            console.log('Page decoding error:' + e);
            return;
        }

        // Read the `content` attribute of the first element matching `selector`.
        function meta_content(selector) {
            return $(selector).eq(0).attr("content");
        }

        // Define news information json
        var news = {};
        // date-utils tokens: HH24 = 24-hour clock, MI = minutes (MM is the month).
        news.crawler_time = (new Date()).toFormat("YYYY-MM-DD HH24:MI:SS");
        news.url = news_url;
        news.url_encoding = 'UTF-8';
        news.date = new Date();

        // NOTE(review): the literals 'year', 'month', 'day' and
        // "Responsible editor:" below look machine-translated from the Chinese
        // text on the live page - confirm them against the real markup.

        news.keywords = extract('News keyword acquisition error:', '', function() {
            return meta_content('meta[name="keywords"]');
        });

        news.title = extract('News Title acquisition error:', '', function() {
            return strip_space($('title').text());
        });

        // Keep the default (now) when the page shows no date text.
        news.date = extract('News date acquisition error:', news.date, function() {
            var date_text = $('.date').text();
            if (date_text.trim() === '') {
                return undefined;
            }
            return date_text.replace('year', '-').replace('month', '-').replace('day', '');
        });

        news.author = extract('News author get error:', '', function() {
            return $('.show_author').text().replace("Responsible editor:", "");
        });

        news.source = extract('News source acquisition error:', '', function() {
            return meta_content('meta[name="mediaid"]');
        });

        // Prefer the description meta tag; fall back to og:description when it
        // is absent or empty.
        news.summary = extract('News summary get error:', '', function() {
            var summary = strip_space(meta_content('meta[name="description"]') || '');
            if (summary === "") {
                summary = strip_space(meta_content('meta[property="og:description"]') || '');
            }
            return summary;
        });

        // The article body lives in .article, or in #article_content otherwise.
        news.content = extract('News content acquisition error:', '', function() {
            var content = strip_space($('.article').text());
            if (content === "") {
                content = strip_space($('#article_content').text());
            }
            return content;
        });

        console.log(JSON.stringify(news));

        // Write to database
        if (news.author === '' || news.author == null) {
            // No editor listed: credit the publishing source instead.
            news.author = news.source;
        }
        if (news.content !== '' && news.source !== '') {
            var news_add_sql = 'INSERT INTO news(url, source, url_encoding, title, keywords, author, date, crawler_time, summary, content) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)';
            var news_add = [news.url, news.source, news.url_encoding,
                news.title, news.keywords, news.author, news.date,
                news.crawler_time, news.summary, news.content
            ];
            crawler_sql.query(news_add_sql, news_add, function(qerr, vals, fields) {
                if (qerr) {
                    console.log(qerr);
                }
            });
        }
    });
}

Topics: Javascript node.js Front-end MySQL crawler