This assumes you have already installed bs4 and requests.
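If they aren't installed yet, both are available from PyPI (bs4 is distributed as the beautifulsoup4 package):

```
pip install requests beautifulsoup4
```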
The novel itself was picked at random, so don't read too much into the choice.
This is a Python 3 implementation. There are plenty of Python 2 crawlers on the Internet, but still relatively few written in Python 3.
- The link is https://www.qu.la/book/12763/10664294.html
- Get the title of the chapter (locate the exact position in the HTML code); see the short sketch after this list
- If you haven't seen step 1, you can click the link below to see step 1 first
Click to view step 1
- If you haven't seen step 2, you can click the link below to see step 2 first
[Click to view step 2](https://www.jianshu.com/p/5d48b28adaf3)
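For reference, here is a minimal sketch of the title extraction from step 1. It uses the same selectors as the full script below (the "bookname" class and its h1 tag), which are specific to qu.la's page layout:

```python
import requests
from bs4 import BeautifulSoup

# Fetch one chapter page and pull out its title.
url = "https://www.qu.la/book/12763/10664294.html"
req = requests.get(url)
req.encoding = 'utf-8'
soup = BeautifulSoup(req.text, 'html.parser')
title = soup.find(attrs={"class": "bookname"}).find('h1').text
print(title)
```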
Step 3: put it all together and write the chapters to a file
- Basic implementation of the novel crawler
- Sleep for a random interval between requests to avoid being banned
- Drawback: you must supply a starting chapter link (which chapter you pick doesn't really matter; just use the link of the chapter you want to start reading from)
```python
import random
import time

import requests
from bs4 import BeautifulSoup

begin_url = "https://www.qu.la/book/12763/10664294.html"
base = begin_url[:begin_url.rindex('/') + 1]
urls = [begin_url]  # URL pool, seeded with the starting chapter
first = True

for url in urls:  # new chapter URLs are appended to the pool while iterating
    req = requests.get(url)
    req.encoding = 'utf-8'
    soup = BeautifulSoup(req.text, 'html.parser')
    try:
        content = soup.find(id='content')
        title = soup.find(attrs={"class": "bookname"})
        title = title.find('h1').text
    except:
        break  # no chapter content on this page, stop crawling

    # Remove irrelevant characters
    string = (content.text.replace('\u3000', '').replace('\t', '')
              .replace('\n', '').replace('\r', '')
              .replace('『', '"').replace('』', '"').replace('\ufffd', ''))
    string = string.split('\xa0')  # split on non-breaking spaces (encoding leftovers)
    string = list(filter(lambda x: x, string))  # drop empty fragments
    for i in range(len(string)):
        string[i] = ' ' + string[i]
        if "Important notice of this website" in string[i]:  # strip the site's trailing notice
            t = string[i].index('Important notice of this website')
            string[i] = string[i][:t]
    string = '\n'.join(string)
    string = '\n' + title + '\n' + string

    # Overwrite the output file for the first chapter, then append
    if first:
        first = False
        with open('E:/Code/Python/Project/txtGet/1.txt', 'w', encoding='utf-8') as f:
            f.write(string)
    else:
        with open('E:/Code/Python/Project/txtGet/1.txt', 'a', encoding='utf-8') as f:
            f.write(string)
    print(title + ' write complete')

    # Follow the "next chapter" link and add it to the URL pool
    next_ = soup.find(attrs={"class": "next"})
    next_url = base + next_['href']
    urls.append(next_url)
    time.sleep(random.randint(1, 5))  # random pause: don't visit too fast or too regularly, to avoid being banned
```
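One fragile spot is that next_url is built by plain string concatenation (base + next_['href']). If the href were ever a relative path with "../" or an absolute path, that would produce a wrong address. A safer alternative, sketched here under the assumption that the rest of the script stays the same, is to resolve the href against the current chapter URL with urllib.parse.urljoin:

```python
from urllib.parse import urljoin

# The hrefs below are hypothetical examples of what a "next" link might contain.
current_url = "https://www.qu.la/book/12763/10664294.html"
print(urljoin(current_url, "10664295.html"))  # https://www.qu.la/book/12763/10664295.html
print(urljoin(current_url, "/book/12763/"))   # https://www.qu.la/book/12763/

# In the crawler, the equivalent replacement would be:
# next_url = urljoin(url, next_['href'])
```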