Many websites now load page content only as the user scrolls, so you cannot get everything simply by fetching the static HTML. With Selenium you can drive a real browser and scroll the page so that all of the content gets loaded.
Introduction to Selenium
Selenium is a web automation testing tool. Selenium tests run directly in the browser, just as a real user would operate it. Supported browsers include IE (7, 8, 9, 10, 11), Mozilla Firefox, Safari, Google Chrome, Opera, and others. Its main uses are browser compatibility testing (checking that your application works correctly on different browsers and operating systems) and functional regression testing (building regression tests for software features and user requirements). It supports recording and replaying user actions and generating test scripts in languages such as .NET, Java, and Perl. Selenium is open source and released under the Apache License 2.0.
Installing Selenium in C#
This article only uses Selenium to drag the scrollbar, so it does not cover Selenium in any depth.
Search "Selenium" through Nuget package manager and install:
- Selenium.WebDriver
- Selenium.Chrome.WebDriver
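To check that the packages are wired up before going further, a minimal sketch like the one below should open Chrome, print the page title, and close the browser again. The URL is only a placeholder; `ChromeDriver` and the `OpenQA.Selenium.Chrome` namespace come from the packages above.

```csharp
using System;
using OpenQA.Selenium.Chrome;

class SeleniumSmokeTest
{
    static void Main()
    {
        // Assumes Chrome is installed; the URL is just a placeholder
        using (ChromeDriver driver = new ChromeDriver())
        {
            driver.Navigate().GoToUrl("https://www.example.com");
            Console.WriteLine(driver.Title); // Prints the page title if everything works
        }
    }
}
```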
Example: getting all the images on a website's home page
Getting the page HTML the normal way
```csharp
ChromeDriver driver = new ChromeDriver();
driver.Navigate().GoToUrl(url);
string title = driver.Title;     // Page title
string html = driver.PageSource; // Page HTML
```
Getting the page without opening a Chrome window or the ChromeDriver console
When the program runs, it normally opens a Chrome window and the ChromeDriver console prints some output; we need neither of them here.
```csharp
// Do not show the Chrome window (headless mode)
ChromeOptions options = new ChromeOptions();
options.AddArgument("headless");

// Hide the ChromeDriver console window
ChromeDriverService driverService = ChromeDriverService.CreateDefaultService();
driverService.HideCommandPromptWindow = true;

ChromeDriver driver = new ChromeDriver(driverService, options);
driver.Navigate().GoToUrl(url);
```
Scroll the page to the bottom
If you call scrollTo(0, document.body.scrollHeight) and jump straight to the bottom, the content in the middle of the page may never be loaded, so the page has to be scrolled in several steps, with enough time between steps for each part to load.
```csharp
for (int i = 1; i <= 10; i++)
{
    // Scroll to i/10 of the total page height
    string jsCode = "window.scrollTo({top: document.body.scrollHeight / 10 * " + i + ", behavior: \"smooth\"});";
    // Run the JavaScript through the IJavaScriptExecutor interface
    IJavaScriptExecutor js = (IJavaScriptExecutor)driver;
    js.ExecuteScript(jsCode);
    // Pause between scrolls so the newly revealed content has time to load
    Thread.Sleep(1000);
}
```
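The loop above scrolls in ten equal steps with a one-second pause each time. If the page loads more slowly than that, one alternative (a sketch, not taken from the original code; it assumes the same `driver` and using directives as above) is to keep scrolling until document.body.scrollHeight stops growing:

```csharp
// Sketch: scroll until the page height stops growing.
// The 15-attempt cap and 1-second pause are arbitrary choices.
IJavaScriptExecutor js = (IJavaScriptExecutor)driver;
long lastHeight = 0;
for (int attempt = 0; attempt < 15; attempt++)
{
    long height = (long)js.ExecuteScript("return document.body.scrollHeight;");
    if (height == lastHeight)
    {
        break; // No new content appeared since the last scroll
    }
    lastHeight = height;
    js.ExecuteScript("window.scrollTo(0, document.body.scrollHeight);");
    Thread.Sleep(1000); // Give lazy-loaded content time to appear
}
```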
Parsing the HTML with HtmlAgilityPack
The rest is basically the same as the previous article.
```csharp
string title = driver.Title;     // Page title
string html = driver.PageSource; // Page HTML

HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(html); // Parse the HTML string

string imgPath = "//img"; // XPath that selects all img nodes
// Get the images referenced by img tags
foreach (HtmlNode node in doc.DocumentNode.SelectNodes(imgPath))
{
    // ...
}
```
Complete code
```csharp
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Net;
using System.IO;
using HtmlAgilityPack;
using System.Text.RegularExpressions;
using OpenQA.Selenium;
using OpenQA.Selenium.Chrome;
using System.Threading;

namespace WebCrawlerDemo
{
    class Program
    {
        static void Main(string[] args)
        {
            WebClient wc = new WebClient();
            int imgNum = 0; // Image counter
            string url = "https://www.bilibili.com";
            string html = FinalHtml.GetFinalHtml(url, 10);
            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(html);

            string imgPath = "//img"; // Select all img nodes
            // Download the images referenced by img tags
            foreach (HtmlNode node in doc.DocumentNode.SelectNodes(imgPath))
            {
                if (node.Attributes["src"] != null)
                {
                    string imgUrl = node.Attributes["src"].Value;
                    if (imgUrl != "" && imgUrl != " ")
                    {
                        imgNum++;
                        // Generate a file name; the extension is taken from the URL
                        string fileName = ImgDownloader.GetImgName(imgUrl, imgNum);
                        //Console.WriteLine(fileName);
                        //Console.WriteLine(imgUrl);
                        ImgDownloader.DownloadImg(wc, imgUrl, "images/", fileName);
                    }
                }
            }

            // Download background images
            string bgImgPath = "//*[@style]"; // Select nodes with a style attribute
            foreach (HtmlNode node in doc.DocumentNode.SelectNodes(bgImgPath))
            {
                if (node.Attributes["style"].Value.Contains("background-image:url"))
                {
                    imgNum++;
                    string bgImgUrl = node.Attributes["style"].Value;
                    bgImgUrl = Regex.Match(bgImgUrl, @"(?<=\().+?(?=\))").Value; // Extract the content of url()
                    //Console.WriteLine(bgImgUrl);
                    // Generate a file name; the extension is taken from the URL
                    string fileName = ImgDownloader.GetImgName(bgImgUrl, imgNum);
                    ImgDownloader.DownloadImg(wc, bgImgUrl, "images/bgcImg/", fileName);
                }
            }

            Console.WriteLine("----------END----------");
            Console.WriteLine($"Downloaded {imgNum} images in total");
            Console.ReadKey();
        }
    }

    /// <summary>
    /// Image downloader
    /// </summary>
    public class ImgDownloader
    {
        /// <summary>
        /// Download an image
        /// </summary>
        /// <param name="webClient">WebClient used for the download</param>
        /// <param name="url">image url</param>
        /// <param name="folderPath">folder path</param>
        /// <param name="fileName">image file name</param>
        public static void DownloadImg(WebClient webClient, string url, string folderPath, string fileName)
        {
            // Create the folder if it does not exist
            if (!Directory.Exists(folderPath))
            {
                Directory.CreateDirectory(folderPath);
            }
            // If the URL is protocol-relative, complete it
            if (url.IndexOf("https:") == -1 && url.IndexOf("http:") == -1)
            {
                url = "https:" + url;
            }
            // Download the image
            try
            {
                webClient.DownloadFile(url, folderPath + fileName);
                Console.WriteLine(fileName + " downloaded successfully");
            }
            catch (Exception ex)
            {
                Console.Write(ex.Message);
                Console.WriteLine(url);
            }
        }

        /// <summary>
        /// Generate an image file name
        /// </summary>
        /// <param name="imageUrl">image url</param>
        /// <param name="imageNum">image number</param>
        /// <returns>file name with the extension taken from the URL</returns>
        public static string GetImgName(string imageUrl, int imageNum)
        {
            string imgExtension;
            if (imageUrl.LastIndexOf(".") != -1)
            {
                imgExtension = imageUrl.Substring(imageUrl.LastIndexOf("."));
            }
            else
            {
                imgExtension = ".jpg";
            }
            return imageNum + imgExtension;
        }
    }

    /// <summary>
    /// Gets the page after its JavaScript has run
    /// </summary>
    public class FinalHtml
    {
        /// <summary>
        /// Get the page after scrolling to the bottom
        /// </summary>
        /// <param name="url">web address</param>
        /// <param name="sectionNum">number of scroll steps</param>
        /// <returns>html string</returns>
        public static string GetFinalHtml(string url, int sectionNum)
        {
            // Do not show the Chrome window (headless mode)
            ChromeOptions options = new ChromeOptions();
            options.AddArgument("headless");
            // Hide the ChromeDriver console window
            ChromeDriverService driverService = ChromeDriverService.CreateDefaultService();
            driverService.HideCommandPromptWindow = true;
            ChromeDriver driver = new ChromeDriver(driverService, options);
            driver.Navigate().GoToUrl(url);
            string title = driver.Title;
            Console.WriteLine($"Title: {title}");

            // Scroll the page to the bottom in sectionNum steps
            Console.Write("Scrolling the page, please wait");
            for (int i = 1; i <= sectionNum; i++)
            {
                string jsCode = "window.scrollTo({top: document.body.scrollHeight / " + sectionNum + " * " + i + ", behavior: \"smooth\"});";
                IJavaScriptExecutor js = (IJavaScriptExecutor)driver;
                js.ExecuteScript(jsCode);
                Console.Write(".");
                Thread.Sleep(1000);
            }
            Console.WriteLine();

            string html = driver.PageSource;
            driver.Quit();
            return html;
        }
    }
}
```