User Agent Pool
A user agent pool is a collection of different User-Agent strings, from which one is chosen at random for each request.
Purpose: each request appears to come from a different browser.
import urllib.request
import re
import random

uapools = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
]

def ua(uapools):
    # pick a random User-Agent and install an opener that sends it
    thisua = random.choice(uapools)
    print(thisua)
    headers = ("User-Agent", thisua)
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)

for i in range(10):
    ua(uapools)
    thisurl = "https://www.qiushibaike.com/text/page/" + str(i + 1) + "/"
    data = urllib.request.urlopen(thisurl).read().decode("utf-8", "ignore")
    pat = '<div class="content">.*?<span>(.*?)</span>.*?</div>'
    res = re.compile(pat, re.S).findall(data)
    for j in range(len(res)):
        print(res[j])
        print('---------------------')
IP Proxies and Two Schemes for Building an IP Proxy Pool
Search for proxy IPs from providers such as Xici Proxy and Daxiang Proxy.
Prefer foreign IPs where possible.
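Before putting an IP into the pool, it is worth checking that the proxy actually responds. A minimal sketch (the check_proxy name, test URL, and timeout are illustrative assumptions, not from the original):

import urllib.request

def check_proxy(ip, test_url="https://www.baidu.com/", timeout=5):
    # build an opener that routes traffic through the candidate proxy
    proxy = urllib.request.ProxyHandler({"http": ip, "https": ip})
    opener = urllib.request.build_opener(proxy)
    try:
        data = opener.open(test_url, timeout=timeout).read()
        return len(data) > 0  # got a non-empty response through the proxy
    except Exception as err:
        print(ip, "failed:", err)
        return False

# keep only the proxies that currently work
candidates = ["219.131.240.35", "163.125.70.22"]
ippools = [ip for ip in candidates if check_proxy(ip)]
print(ippools)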
import urllib.request

ip = "219.131.240.35"
proxy = urllib.request.ProxyHandler({"http": ip})
opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
urllib.request.install_opener(opener)
url = "https://www.baidu.com/"
data = urllib.request.urlopen(url).read()
fp = open("ip_baidu.html", "wb")
fp.write(data)
fp.close()
The first way to build an IP proxy pool (suitable when the proxy IPs are stable)
import random
import urllib.request

ippools = [
    "163.125.70.22",
    "111.231.90.122",
    "121.69.37.6",
]

def ip(ippools):
    # pick a random proxy IP and install an opener that routes through it
    thisip = random.choice(ippools)
    print(thisip)
    proxy = urllib.request.ProxyHandler({"http": thisip})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)

for i in range(5):
    try:
        ip(ippools)
        url = "https://www.baidu.com/"
        data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
        print(len(data))
        fp = open("ip_res/ip_baidu_" + str(i + 1) + ".html", "w")
        fp.write(data)
        fp.close()
    except Exception as err:
        print(err)
The second way to build an IP proxy pool (interface-invocation method, better suited when proxy IPs are unstable)
This method is set aside for now for cost reasons (such proxy APIs are typically paid services); a rough sketch of the idea follows.
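For reference, the interface-invocation idea is roughly: request a fresh proxy IP from the provider's API before each crawl, instead of keeping a fixed list. A minimal sketch, assuming a hypothetical API endpoint that returns one "ip:port" string per request (the URL and response format are assumptions):

import urllib.request

def api_ip(api_url):
    # ask the (hypothetical) proxy provider's API for a fresh "ip:port" string
    thisip = urllib.request.urlopen(api_url).read().decode("utf-8").strip()
    print(thisip)
    proxy = urllib.request.ProxyHandler({"http": thisip})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)

# usage sketch: fetch a new proxy before each request
# api_ip("http://api.example-proxy.com/get_ip")   # hypothetical endpoint
# data = urllib.request.urlopen("https://www.baidu.com/").read()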
Taobao Product Image Crawler
Taobao now has anti-crawling measures, so the code below no longer works as-is, but it is still useful as an exercise.
import urllib.request
import re
import random

keyname = "python"
key = urllib.request.quote(keyname)  # the URL does not allow raw Chinese, so the keyword is URL-encoded here
uapools = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
]

def ua(uapools):
    thisua = random.choice(uapools)
    print(thisua)
    headers = ("User-Agent", thisua)
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)

for i in range(1, 11):  # pages 1 to 10
    ua(uapools)
    url = "https://s.taobao.com/search?q=" + key + "&s=" + str((i - 1) * 44)
    data = urllib.request.urlopen(url).read().decode("UTF-8", "ignore")
    pat = 'pic_url":"//(.*?)"'
    imglist = re.compile(pat).findall(data)
    print(len(imglist))
    for j in range(len(imglist)):
        thisimg = imglist[j]
        thisimgurl = "https://" + thisimg
        localfile = "Taobao Pictures/" + str(i) + str(j) + ".jpg"
        urllib.request.urlretrieve(thisimgurl, localfile)
Using the User Agent Pool and IP Proxy Pool Together
Encapsulated as a function:
import urllib.request
import re
import random

uapools = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
]
ippools = [
    "163.125.70.22",
    "111.231.90.122",
    "121.69.37.6",
]

def ua_ip(myurl):
    def ip(ippools, uapools):
        # install an opener with a random proxy IP and a random User-Agent
        thisip = random.choice(ippools)
        print(thisip)
        thisua = random.choice(uapools)
        print(thisua)
        headers = ("User-Agent", thisua)
        proxy = urllib.request.ProxyHandler({"http": thisip})
        opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        opener.addheaders = [headers]
        urllib.request.install_opener(opener)

    for i in range(5):  # retry up to 5 times with a fresh IP/UA combination
        try:
            ip(ippools, uapools)
            url = myurl
            data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
            print(len(data))
            break
        except Exception as err:
            print(err)
    return data

data = ua_ip("https://www.baidu.com/")
fp = open("uaip.html", "w", encoding="utf-8")
fp.write(data)
fp.close()
Encapsulated as a module:
Save the code above as a module (uaip.py) and copy it into the Python directory (or keep it alongside the crawler script).
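One way to package it (the layout is an assumption, not spelled out in the original): keep uapools, ippools, and ua_ip in uaip.py, and guard the three demo lines at the bottom so they do not run on import:

# at the bottom of uaip.py, instead of running the demo unconditionally:
if __name__ == "__main__":
    data = ua_ip("https://www.baidu.com/")
    fp = open("uaip.html", "w", encoding="utf-8")
    fp.write(data)
    fp.close()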
Usage:
from uaip import *

data = ua_ip("https://www.baidu.com/")
fp = open("baidu.html", "w", encoding="utf-8")
fp.write(data)
fp.close()
Packet Capture Analysis
The Fiddler tool acts as a proxy server: requests and responses pass through Fiddler.
Using Firefox: configure the browser's network settings to use Fiddler as the proxy.
To capture HTTPS traffic: open Fiddler's Tools > Options > HTTPS tab and tick the option to decrypt HTTPS traffic.
Then click Actions and export the root certificate to the desktop.
Back in Firefox:
Import the certificate saved on the desktop.
Commonly used command — clear: clears the captured sessions.
Asynchronously Loaded Data (Ajax Requests)
For example, on Weibo more content is loaded only as you scroll down, rather than with the initial page; "click to load more" works the same way. Such data is loaded asynchronously and requires packet-capture analysis.
See the example below.
Tencent Video Comments (In-Depth Reviews): Crawler Practice
Open a Tencent Video page in Firefox, e.g. https://v.qq.com/x/cover/j6cgzhtkuonf6te.html
Click to view more reviews, and a js request shows up in Fiddler:
Its content is the review data.
Find a comment and decode (transcode) it:
Use Ctrl+F in Firefox to locate this comment on the page.
Copy the URL of the js file.
Click to view more comments again, which triggers a json request; copy that URL as well.
Comparing the two URLs:
Simplified, the request URL is: https://video.coral.qq.com/filmreviewr/c/upcomment/j6cgzhtkuonf6te?reqnum=3&commentid=6227734628246412645
From the analysis: j6cgzhtkuonf6te is the video id, reqnum is the number of comments returned per request, and commentid is the id of the comment to start from.
https://video.coral.qq.com/filmreviewr/c/upcomment/[vid]?reqnum=[num]&commentid=[cid]
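The comment text in these responses comes back with JSON/Unicode escapes, which is why a "transcode" step is needed. A small way to decode one extracted fragment in Python, assuming the fragment is a valid JSON string body (the sample string is illustrative):

import json

raw = "\\u8fd9\\u662f\\u4e00\\u6761\\u8bc4\\u8bba"  # example escaped fragment (illustrative, not captured data)
text = json.loads('"' + raw + '"')                  # let the json parser resolve the \uXXXX escapes
print(text)

The crawlers below decode the same escapes with eval("u'" + s + "'"); json.loads does the same job without executing the fragment as code.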
- Single-page comment crawler
Some special characters (such as embedded images) are not handled yet; we will come back to that later.
import urllib.request
import re
from uaip import *

vid = "j6cgzhtkuonf6te"
cid = "6227734628246412645"
num = "3"  # three comments per request
url = "https://video.coral.qq.com/filmreviewr/c/upcomment/" + vid + "?reqnum=" + num + "&commentid=" + cid
data = ua_ip(url)
titlepat = '"title":"(.*?)","abstract":"'
commentpat = '"content":"(.*?)",'
titleall = re.compile(titlepat, re.S).findall(data)
commentall = re.compile(commentpat, re.S).findall(data)
# print(len(commentall))
for i in range(len(titleall)):
    try:
        print("Comment title: " + eval("u'" + titleall[i] + "'"))
        print("Comment content: " + eval("u'" + commentall[i] + "'"))
        print('---------------')
    except Exception as err:
        print(err)
- Page-turning comment crawler
Looking at the response source, you can see a "last" field: its value is the comment id of the next page.

import urllib.request
import re
from uaip import *

vid = "j6cgzhtkuonf6te"
cid = "6227734628246412645"
num = "3"
for j in range(10):  # crawl pages 1 to 10
    print("Page " + str(j + 1))
    url = "https://video.coral.qq.com/filmreviewr/c/upcomment/" + vid + "?reqnum=" + num + "&commentid=" + cid
    data = ua_ip(url)
    titlepat = '"title":"(.*?)","abstract":"'
    commentpat = '"content":"(.*?)",'
    titleall = re.compile(titlepat, re.S).findall(data)
    commentall = re.compile(commentpat, re.S).findall(data)
    lastpat = '"last":"(.*?)"'
    cid = re.compile(lastpat, re.S).findall(data)[0]  # use the "last" id as the start of the next page
    for i in range(len(titleall)):
        try:
            print("Comment title: " + eval("u'" + titleall[i] + "'"))
            print("Comment content: " + eval("u'" + commentall[i] + "'"))
            print('---------------')
        except Exception as err:
            print(err)
Short reviews (ordinary comments) work in much the same way, so the analysis is not repeated here. The short-review crawler code is below:
The simplified URL is: https://video.coral.qq.com/varticle/1743283224/comment/v2?orinum=10&oriorder=o&pageflag=1&cursor=6442954225602101929
import urllib.request
import re
from uaip import *

vid = "1743283224"
cid = "6442954225602101929"
num = "5"
for j in range(10):  # crawl pages 1 to 10
    print("Page " + str(j + 1))
    url = "https://video.coral.qq.com/varticle/" + vid + "/comment/v2?orinum=" + num + "&oriorder=o&pageflag=1&cursor=" + cid
    data = ua_ip(url)
    commentpat = '"content":"(.*?)"'
    commentall = re.compile(commentpat, re.S).findall(data)
    lastpat = '"last":"(.*?)"'
    cid = re.compile(lastpat, re.S).findall(data)[0]  # cursor for the next page
    # print(len(commentall))
    for i in range(len(commentall)):
        try:
            print("Comment content: " + eval("u'" + commentall[i] + "'"))
            print('---------------')
        except Exception as err:
            print(err)