Python crawler: user-agent and IP proxy pools, packet capture analysis, asynchronously loaded Ajax data, and a Tencent Video comment crawler

Posted by Kevmaster on Wed, 21 Aug 2019 07:41:07 +0200

User Agent Pool

A user-agent pool is a collection of different user-agent strings; each request picks one from the pool at random.

Purpose: each visit appears to come from a different browser.

import urllib.request
import re
import random
uapools=[
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
]
def ua(uapools):
    thisua=random.choice(uapools) # pick a random user agent from the pool
    print(thisua)
    headers=("User-Agent",thisua)
    opener=urllib.request.build_opener()
    opener.addheaders=[headers]
    urllib.request.install_opener(opener) # subsequent urlopen calls use this opener

for i in range(10):
    ua(uapools)
    thisurl="https://www.qiushibaike.com/text/page/"+str(i+1)+"/"
    data=urllib.request.urlopen(thisurl).read().decode("utf-8","ignore")
    pat='<div class="content">.*?<span>(.*?)</span>.*?</div>'
    res=re.compile(pat,re.S).findall(data)
    for j in range(len(res)):
        print(res[j])
        print('---------------------')

IP proxies and two schemes for building an IP proxy pool

Search for free proxy IPs from providers such as the Xici or Daxiang proxy sites.

Prefer foreign IPs where possible.

import urllib.request
ip="219.131.240.35" # proxy address; proxies normally take the form "host:port"
proxy=urllib.request.ProxyHandler({"http":ip})
opener=urllib.request.build_opener(proxy,urllib.request.HTTPHandler)
urllib.request.install_opener(opener)
url="https://www.baidu.com/"
data=urllib.request.urlopen(url).read()
fp=open("ip_baidu.html","wb")
fp.write(data)
fp.close()
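
Note that ProxyHandler({"http":ip}) only routes http:// URLs through the proxy; the https URL requested above would bypass it. A minimal sketch that proxies both schemes (the port here is an assumption; substitute your proxy's real port):

import urllib.request

proxy_addr="219.131.240.35:8080" # assumed "host:port"; use your proxy's actual port
proxy=urllib.request.ProxyHandler({
    "http":proxy_addr,  # applied to http:// URLs
    "https":proxy_addr, # applied to https:// URLs
})
urllib.request.install_opener(urllib.request.build_opener(proxy))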

The first way to build an IP proxy pool (suitable when the proxy IPs are stable)

import random
import urllib.request
ippools=[ # proxy addresses; normally given as "host:port"
    "163.125.70.22",
    "111.231.90.122",
    "121.69.37.6",
]

def ip(ippools):
    thisip=random.choice(ippools) # pick a random proxy from the pool
    print(thisip)
    proxy=urllib.request.ProxyHandler({"http":thisip})
    opener=urllib.request.build_opener(proxy,urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)

for i in range(5):
    try:
        ip(ippools)
        url="https://www.baidu.com/"
        data=urllib.request.urlopen(url).read().decode("utf-8","ignore")
        print(len(data))
        fp=open("ip_res/ip_baidu_"+str(i+1)+".html","w",encoding="utf-8") # the ip_res/ directory must already exist
        fp.write(data)
        fp.close()
    except Exception as err:
        print(err)

The second way to build an IP proxy pool (calling a provider's API; better suited when proxy IPs are unstable)

This method is set aside here for cost reasons (such APIs are typically paid services).
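
The idea is straightforward, though: instead of a hard-coded list, fetch fresh proxies from the provider's API and pick one per request. A minimal sketch, assuming a hypothetical endpoint that returns one "host:port" per line (the URL and response format are assumptions; real providers differ):

import random
import urllib.request

API_URL="http://example-proxy-provider.com/api/get?num=5" # hypothetical endpoint

def fetch_proxies():
    # assumes the API returns one "host:port" per line
    raw=urllib.request.urlopen(API_URL).read().decode("utf-8")
    return [line.strip() for line in raw.splitlines() if line.strip()]

def install_random_proxy(proxies):
    thisip=random.choice(proxies)
    print(thisip)
    proxy=urllib.request.ProxyHandler({"http":thisip,"https":thisip})
    urllib.request.install_opener(urllib.request.build_opener(proxy))

proxies=fetch_proxies() # re-fetch this list whenever requests start failing
install_random_proxy(proxies)
data=urllib.request.urlopen("https://www.baidu.com/").read()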

Taobao product image crawler

Taobao now has anti-crawling measures, so the code below no longer works, but it remains a useful exercise.

import urllib.request
import re
import random
keyname="python"
key=urllib.request.quote(keyname) # URL-encode the keyword; non-ASCII text such as Chinese is not allowed in a URL
uapools=[
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
]
def ua(uapools):
    thisua=random.choice(uapools)
    print(thisua)
    headers=("User-Agent",thisua)
    opener=urllib.request.build_opener()
    opener.addheaders=[headers]
    urllib.request.install_opener(opener)

for i in range(1,11): # pages 1 to 10
    ua(uapools)
    url="https://s.taobao.com/search?q="+key+"&s="+str((i-1)*44) # s is the result offset; Taobao shows 44 items per page
    data=urllib.request.urlopen(url).read().decode("UTF-8","ignore")
    pat='pic_url":"//(.*?)"'
    imglist=re.compile(pat).findall(data)
    print(len(imglist))
    for j in range(len(imglist)):
        thisimg=imglist[j]
        thisimgurl="https://"+thisimg
        localfile="Taobao Pictures/"+str(i)+str(j)+".jpg" # the "Taobao Pictures" directory must already exist
        urllib.request.urlretrieve(thisimgurl,localfile)

Using the user-agent pool and the IP proxy pool together

Encapsulated as a function:

import urllib.request
import re
import random
uapools=[
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
]
ippools=[
    "163.125.70.22",
    "111.231.90.122",
    "121.69.37.6",
]
def ua_ip(myurl):
    def ip(ippools,uapools):
        thisip=random.choice(ippools) # random proxy
        print(thisip)
        thisua=random.choice(uapools) # random user agent
        print(thisua)
        headers=("User-Agent",thisua)
        proxy=urllib.request.ProxyHandler({"http":thisip})
        opener=urllib.request.build_opener(proxy,urllib.request.HTTPHandler)
        opener.addheaders=[headers]
        urllib.request.install_opener(opener)

    data=None # stays None if all five attempts fail
    for i in range(5):
        try:
            ip(ippools,uapools)
            data=urllib.request.urlopen(myurl).read().decode("utf-8","ignore")
            print(len(data))
            break
        except Exception as err:
            print(err)
    return data
data=ua_ip("https://www.baidu.com/")
fp=open("uaip.html","w",encoding="utf-8")
fp.write(data)
fp.close()

Encapsulated as a module:

Save the code above as uaip.py and copy it into a directory on Python's module search path.
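
Concretely, uaip.py is just the two pools and ua_ip() from the previous section collected into one file. A condensed sketch (pools shortened here):

# uaip.py - user-agent pool + IP proxy pool in one reusable module
import random
import urllib.request

uapools=[
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
]
ippools=[
    "163.125.70.22",
    "111.231.90.122",
]

def ua_ip(myurl):
    # install a random proxy + user agent, fetch the page; retry up to
    # five times, returning None if every attempt fails
    data=None
    for i in range(5):
        try:
            thisip=random.choice(ippools)
            thisua=random.choice(uapools)
            proxy=urllib.request.ProxyHandler({"http":thisip})
            opener=urllib.request.build_opener(proxy,urllib.request.HTTPHandler)
            opener.addheaders=[("User-Agent",thisua)]
            urllib.request.install_opener(opener)
            data=urllib.request.urlopen(myurl).read().decode("utf-8","ignore")
            break
        except Exception as err:
            print(err)
    return data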

Usage:

from uaip import *
data=ua_ip("https://www.baidu.com/")
fp=open("baidu.html","w",encoding="utf-8")
fp.write(data)
fp.close()

Packet capture analysis

The Fiddler tool acts as a proxy server: both requests and responses pass through Fiddler.

In Firefox, open the network settings and point the proxy at Fiddler (by default 127.0.0.1, port 8888).

To capture HTTPS traffic: open Fiddler's Tools > Options, switch to the HTTPS tab, and tick the options to capture and decrypt HTTPS traffic.

Then click Actions and choose to export the root certificate to the desktop.

Back in Firefox, import the certificate saved on the desktop.

A commonly used Fiddler command: clear (clears the captured sessions).
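
If you also want your Python crawler's own traffic to appear in Fiddler, point urllib at Fiddler's listening port. A minimal sketch; 8888 is Fiddler's default port, and for https Python must additionally trust Fiddler's root certificate:

import urllib.request

fiddler="127.0.0.1:8888" # Fiddler's default listening address
proxy=urllib.request.ProxyHandler({"http":fiddler,"https":fiddler})
urllib.request.install_opener(urllib.request.build_opener(proxy))
data=urllib.request.urlopen("http://www.example.com/").read()
print(len(data))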

Crawling asynchronously loaded Ajax data

For example, on Weibo the data loads as you drag the page down rather than all at once. Another example is "click to load more". Such asynchronous loading requires packet-capture analysis to find the underlying request.

See the example below.

Hands-on: a crawler for Tencent Video comments ("deep interpretation" reviews)

Open a Tencent video in Firefox, e.g. https://v.qq.com/x/cover/j6cgzhtkuonf6te.html

Click "view more interpretations", and a js file shows up in Fiddler:

Its content is the comment data.

Pick one comment and decode its escaped text:

Press Ctrl+F in Firefox and search for that comment to confirm it matches.

Copy the URL of the js file.

Click "view more comments" again to trigger another json request, and copy that URL as well.

Comparing the two URLs:

Simplified, the request is https://video.coral.qq.com/filmreviewr/c/upcomment/j6cgzhtkuonf6te?reqnum=3&commentid=6227734628246412645

From this we can tell that j6cg... is the video id, reqnum is the number of comments returned per request, and commentid is the comment id to start from:

https://video.coral.qq.com/filmreviewr/c/upcomment/[vid]?reqnum=[num]&commentid=[cid]
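
To make the template concrete, the request URL can be assembled from the three parameters (build_comment_url is a small helper introduced here just for illustration):

def build_comment_url(vid,num,cid):
    # fill the [vid], [num] and [cid] slots of the template above
    return ("https://video.coral.qq.com/filmreviewr/c/upcomment/"
            +vid+"?reqnum="+num+"&commentid="+cid)

print(build_comment_url("j6cgzhtkuonf6te","3","6227734628246412645"))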

  1. Single-page comment crawler
    How to handle some special characters, such as images, is still unclear; we will come back to that later.
import urllib.request
import re
from uaip import *
vid="j6cgzhtkuonf6te"
cid="6227734628246412645"
num="3" #Three extracts per page
url="https://video.coral.qq.com/filmreviewr/c/upcomment/"+vid+"?reqnum="+num+"&commentid="+cid
data=ua_ip(url)
titlepat='"title":"(.*?)","abstract":"'
commentpat='"content":"(.*?)",'
titleall=re.compile(titlepat,re.S).findall(data)
commentall=re.compile(commentpat,re.S).findall(data)
# print(len(commentall))
for i in range(len(titleall)):
    try:
        # eval("u'...'") decodes the \uXXXX escapes in the raw JSON text
        print("The title of the commentary is:"+eval("u'"+titleall[i]+"'"))
        print("The comments are as follows:"+eval("u'"+commentall[i]+"'"))
        print('---------------')
    except Exception as err:
        print(err)
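
A note on the eval trick used above: it decodes the \uXXXX escapes, but it also executes whatever is inside the string, so a comment containing a stray quote can break it (or run arbitrary code). A safer equivalent, assuming the captured fragment is still valid JSON string content:

import json

def decode_fragment(s):
    # wrap the raw escaped fragment in quotes and let the JSON parser
    # decode the \uXXXX escapes without executing anything
    return json.loads('"'+s+'"')

print(decode_fragment("\\u6df1\\u5ea6\\u89e3\\u8bfb")) # -> 深度解读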
  2. Page-turning comment crawler
    Looking at the response body, you can see a "last" field: its value is the comment id at which the next page starts.

import urllib.request
import re
from uaip import *
vid="j6cgzhtkuonf6te"
cid="6227734628246412645"
num="3"
for j in range(10): # crawl pages 1 to 10
    print("Page "+str(j+1))
    url="https://video.coral.qq.com/filmreviewr/c/upcomment/"+vid+"?reqnum="+num+"&commentid="+cid
    data=ua_ip(url)
    titlepat='"title":"(.*?)","abstract":"'
    commentpat='"content":"(.*?)",'
    titleall=re.compile(titlepat,re.S).findall(data)
    commentall=re.compile(commentpat,re.S).findall(data)
    lastpat='"last":"(.*?)"'
    cid=re.compile(lastpat,re.S).findall(data)[0] # the "last" field is the starting comment id for the next page
    for i in range(len(titleall)):
        try:
            print("The title of the commentary is:"+eval("u'"+titleall[i]+"'"))
            print("The comments are as follows:"+eval("u'"+commentall[i]+"'"))
            print('---------------')
        except Exception as err:
            print(err)

Short reviews (ordinary comments) work much the same way, so we won't dwell on them; here is the short-review crawler code:

Take https://video.coral.qq.com/varticle/1743283224/comment/v2?callback=_varticle1743283224commentv2&orinum=10&oriorder=o&pageflag=1&cursor=6442954225602101929&scorecursor=0&orirepnum=2&reporder=o&reppageflag=1&source=132&_=1566363507957

which simplifies to: https://video.coral.qq.com/varticle/1743283224/comment/v2?orinum=10&oriorder=o&pageflag=1&cursor=6442954225602101929

import urllib.request
import re
from uaip import *
vid="1743283224"
cid="6442954225602101929"
num="5"
for j in range(10): # crawl pages 1 to 10
    print("Page "+str(j+1))
    url="https://video.coral.qq.com/varticle/"+vid+"/comment/v2?orinum="+num+"&oriorder=o&pageflag=1&cursor="+cid
    data=ua_ip(url)
    commentpat='"content":"(.*?)"'
    commentall=re.compile(commentpat,re.S).findall(data)
    lastpat='"last":"(.*?)"'
    cid=re.compile(lastpat,re.S).findall(data)[0] # cursor for the next page
    # print(len(commentall))
    for i in range(len(commentall)):
        try:
            print("The comments are as follows:"+eval("u'"+commentall[i]+"'"))
            print('---------------')
        except Exception as err:
            print(err)
