Building a proxy IP pool in Python

Posted by Grimloch on Fri, 21 Jan 2022 12:44:01 +0100

summary

Most websites have some anti-crawling measures in place. Some limit the access speed or the number of requests allowed per IP; exceed the limit and your IP gets banned. Dealing with the speed limit is simple: just wait an interval between requests so you don't hit the site too frequently. Getting around the request-count limit requires proxy IPs: rotating through several proxy IPs to access the target site solves the problem effectively.
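
As a minimal sketch of that idea (the proxy addresses below are placeholders, not real servers; substitute working ones), rotating proxies with requests looks like this:

import random
import time

import requests

proxy_pool = ["1.2.3.4:8080", "5.6.7.8:3128"]  # placeholder addresses, not real proxies

for page in range(1, 4):
    proxy = random.choice(proxy_pool)                  # rotate: pick a different proxy per request
    proxies = {"http": proxy, "https": proxy}
    resp = requests.get("http://httpbin.org/ip", proxies=proxies, timeout=5)
    print(resp.json())                                 # httpbin echoes the IP it saw
    time.sleep(1)                                      # and still keep the request rate low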

There are many proxy service websites on the Internet that sell proxies and also offer some free ones, but the free proxies tend to be unreliable. If your requirements are high, buy paid proxies with better availability.

We can therefore build our own proxy pool: collect proxy IPs from the various proxy service websites, test their availability (test against a stable site, ideally the one we intend to crawl), then save the working ones to a database or file and fetch one whenever needed.

Free proxy websites

Name                      Address
66 Proxy                  http://www.66ip.cn/
Xici Proxy                https://www.xicidaili.com
Goubanjia                 http://www.goubanjia.com
Yun Proxy                 http://www.ip3366.net
IP Hai                    http://www.iphai.com
Kuaidaili                 https://www.kuaidaili.com
Free proxy IP library     http://ip.jiangxianli.com
Xiaohuan Proxy            https://ip.ihuan.me/

This example uses the Xiaohuan proxy site (https://ip.ihuan.me/).

code

Imports

import loguru, requests, random, time  # Send requests, log, etc
from lxml import etree  # Parse the HTML
from concurrent.futures import ThreadPoolExecutor  # Thread pool

Page urls

Because the page urls of the Xiaohuan proxy site do not follow a predictable pattern, they have to be collected page by page (each page links to the next ones).

def get_url():  # Get the web page where the ip address is stored
    print("Getting ip pool", ",Don't worry!")
    for i in range(random.randint(10, 20)):  # Crawl random pages
        time.sleep(1)
        if i == 0:
            url = "https://ip.ihuan.me/"
        else:
            url = url_list[-1]
        try:
            resp = requests.get(url=url, headers=headers_test, timeout=10)
        except Exception as e:
            print(e)
            break
        html = etree.HTML(resp.text)
        ul = html.xpath('//ul[@class="pagination"]')
        ul_num = html.xpath('//ul[@class="pagination"]/li')
        for j in range(len(ul_num)):
            if j != 0 and j != len(ul_num) - 1:
                a = ul[0].xpath(f"./li[{j + 1}]/a/@href")[0]
                url_list.append("https://ip.ihuan.me/" + a)  # collect the urls of the proxy list pages
        loguru.logger.info(f"over,{url}")

ip address

def get_ip():
    for i in url_list:
        time.sleep(1)
        resp = requests.get(url=i, headers=headers)
        html = etree.HTML(resp.text)
        td = html.xpath("//tbody/tr")
        for tr in td:
            ip = tr.xpath("./td[1]//text()")[0]  # address
            pt = tr.xpath("./td[2]//text()")[0]  # port
            tp = "http" if tr.xpath("./td[5]//text()")[0] == "不支持" else "https"  # column 5 says whether HTTPS is supported
            ip_list.append({"type": tp, "proxy": f"{ip}:{pt}"})
    loguru.logger.info("IP address acquisition complete")

testing

In the full code this function is nested inside set_ip, because it reads the url and url_test variables from the enclosing scope.

def test_ip(ip):
    proxy_test = {
        "http": f"{ip}",
        "https": f"{ip}"
        # Note: if the target url is https but the proxy only supports http, requests falls back to your local IP. For an http target, the proxy must be registered under the "http" key; putting it under "https" means the local IP is used instead.
    }
    resp = requests.get(url=url_test, headers=headers, proxies=proxy_test, timeout=6)
    if resp.json()["origin"] == ip.split(":")[0]:
        ip = {"type": url.strip(":")[0], "proxy": ip}  # Format ip to facilitate post-processing. Yes, it has http/https identification
        temp_ip.append(ip)  # Add qualified items and discard unqualified items
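
A quick illustration of the note above (a minimal sketch; 1.2.3.4:8080 is a placeholder, not a real proxy): requests picks the proxy whose key matches the scheme of the target url, and falls back to a direct connection, i.e. your local IP, when no matching key exists.

import requests

proxies = {"http": "1.2.3.4:8080"}  # only an http proxy is configured (placeholder address)
requests.get("http://httpbin.org/ip", proxies=proxies)   # sent through the proxy
requests.get("https://httpbin.org/ip", proxies=proxies)  # no "https" key, so the local IP is used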

Putting it together

def set_ip(url) -> "dynamically built proxy dict":  # pass in the url of the page you plan to crawl
    try:
        f = open('./app/ip.txt', "r")
        for j in eval(f.read()):
            temp_ip.append(j)
        f.close()
    except Exception as e:
        print("No, ip,Constructing ip Pool, just a moment, please")

    if not temp_ip:  # Determine whether there is an ip address
        print("No, ip Address, getting")
        get_url()
    else:
        for i in temp_ip:
            ip_list.append(i)  # Add the existing ip to the test ip
        temp_ip.clear()

    get_ip()  # Get a lot of ip addresses
    with open('./app/ip.txt', "w") as file:
        file.write(str(ip_list))  # write() needs a string, so serialize the list
    ip_able = list(set(j["proxy"] for j in ip_list if j["type"] == url.split(":")[0]))  # keep only proxies whose type matches the target url; a set removes duplicates
    url_test = "http://httpbin. Org / ip "if URL. Split (": ") [0] =" HTTP "else" "# test whether the ip address is useful

    def test_ip(ip):
        proxy_test = {
            "http": f"{ip}",
            "https": f"{ip}"
            # Note: if the target url is https but the proxy only supports http, requests falls back to your local IP. For an http target, the proxy must be registered under the "http" key; putting it under "https" means the local IP is used instead.
        }
        resp = requests.get(url=url_test, headers=headers, proxies=proxy_test, timeout=6)
        if resp.json()["origin"] == ip.split(":")[0]:
            ip = {"type": url.strip(":")[0], "proxy": ip}  # Format ip to facilitate post-processing. Yes, it has http/https identification
            temp_ip.append(ip)  # Add qualified items and discard unqualified items

    with ThreadPoolExecutor(50) as pool:  # test the proxies concurrently
        pool.map(test_ip, ip_able)  # the with-block waits for every task to finish

    print("Test complete")

    if temp_ip:
        i = random.choice(temp_ip)
        proxy = {
            "http": f"{i['proxy']}",
            "https": f"{i['proxy']}"
        }
        return proxy
    else:
        return set_ip(url=url)  # no working proxy was found, rebuild the pool and try again
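
The returned dict can be passed straight to requests. A quick usage sketch (the target url is just an example; headers is defined below):

proxy = set_ip(url="https://www.baidu.com")  # build/refresh the pool and get one working proxy
resp = requests.get("https://www.baidu.com", headers=headers, proxies=proxy, timeout=10)
print(resp.status_code)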

Necessary parameters

# parameter

headers = {
    'User-Agent': "Mozilla / 5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 96.0.4664 .93 Safari / 537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
}
headers_test = {
    'User-Agent': "Mozilla / 5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 96.0.4664 .93 Safari / 537.36",
    "accept-encoding": "gzip, deflate, br",
    "cookie": "Hm_lvt_8ccd0ef22095c2eebfe4cd6187dea829=1642389014,1642412091",
    "Referer": "https://ip.ihuan.me/"
}
url_list, ip_list, temp_ip = ["https://ip.ihuan.me/"], [], []  # page urls, harvested proxies, verified proxies

Full code

import loguru, requests, random, time
from lxml import etree
from concurrent.futures import ThreadPoolExecutor


def get_url():  # Get the web page where the ip address is stored
    print("Getting ip pool", ",Don't worry!")
    for i in range(random.randint(10, 20)):  # Crawl random pages
        time.sleep(1)
        if i == 0:
            url = "https://ip.ihuan.me/"
        else:
            url = url_list[-1]
        try:
            resp = requests.get(url=url, headers=headers_test, timeout=10)
        except Exception as e:
            print(e)
            break
        html = etree.HTML(resp.text)
        ul = html.xpath('//ul[@class="pagination"]')
        ul_num = html.xpath('//ul[@class="pagination"]/li')
        for j in range(len(ul_num)):
            if j != 0 and j != len(ul_num) - 1:
                a = ul[0].xpath(f"./li[{j + 1}]/a/@href")[0]
                url_list.append("https://ip.ihuan.me/" + a)  # collect the urls of the proxy list pages
        loguru.logger.info(f"over,{url}")


def get_ip():
    for i in url_list:
        time.sleep(1)
        resp = requests.get(url=i, headers=headers)
        html = etree.HTML(resp.text)
        td = html.xpath("//tbody/tr")
        for tr in td:
            ip = tr.xpath("./td[1]//text()")[0]  # address
            pt = tr.xpath("./td[2]//text()")[0]  # port
            tp = "http" if tr.xpath("./td[5]//text()")[0] == "不支持" else "https"  # column 5 says whether HTTPS is supported
            ip_list.append({"type": tp, "proxy": f"{ip}:{pt}"})
    loguru.logger.info("IP address acquisition complete")


def set_ip(url) -> "dynamically built proxy dict":  # pass in the url of the page you plan to crawl
    try:
        f = open('./app/ip.txt', "r")
        for j in eval(f.read()):
            temp_ip.append(j)
        f.close()
    except Exception as e:
        print("No, ip,Constructing ip Pool, just a moment, please")

    if not temp_ip:  # Determine whether there is an ip address
        print("No, ip Address, getting")
        get_url()
    else:
        for i in temp_ip:
            ip_list.append(i)  # Add the existing ip to the test ip
        temp_ip.clear()

    get_ip()  # Get a lot of ip addresses
    with open('./app/ip.txt', "w") as file:
        file.write(str(ip_list))  # write() needs a string, so serialize the list
    ip_able = list(set(j["proxy"] for j in ip_list if j["type"] == url.split(":")[0]))  # keep only proxies whose type matches the target url; a set removes duplicates
    url_test = "http://httpbin. Org / ip "if URL. Split (": ") [0] =" HTTP "else" "# test whether the ip address is useful

    def test_ip(ip):
        proxy_test = {
            "http": f"{ip}",
            "https": f"{ip}"
            # Note: if the target url is https but the proxy only supports http, requests falls back to your local IP. For an http target, the proxy must be registered under the "http" key; putting it under "https" means the local IP is used instead.
        }
        resp = requests.get(url=url_test, headers=headers, proxies=proxy_test, timeout=6)
        if resp.json()["origin"] == ip.split(":")[0]:
            ip = {"type": url.strip(":")[0], "proxy": ip}  # Format ip to facilitate post-processing. Yes, it has http/https identification
            temp_ip.append(ip)  # Add qualified items and discard unqualified items

    with ThreadPoolExecutor(50) as pool:  # test the proxies concurrently
        pool.map(test_ip, ip_able)  # the with-block waits for every task to finish

    print("Test complete")

    if temp_ip:
        i = random.choice(temp_ip)
        proxy = {
            "http": f"{i['proxy']}",
            "https": f"{i['proxy']}"
        }
        return proxy
    else:
        return set_ip(url=url)  # no working proxy was found, rebuild the pool and try again


# parameter

headers = {
    'User-Agent': "Mozilla / 5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 96.0.4664 .93 Safari / 537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
}
headers_test = {
    'User-Agent': "Mozilla / 5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 96.0.4664 .93 Safari / 537.36",
    "accept-encoding": "gzip, deflate, br",
    "cookie": "Hm_lvt_8ccd0ef22095c2eebfe4cd6187dea829=1642389014,1642412091",
    "Referer": "https://ip.ihuan.me/"
}
url_list, ip_list, temp_ip = ["https://ip.ihuan.me/"], [], []  # page urls, harvested proxies, verified proxies

if __name__ == '__main__':
    proxy = set_ip(url="https://www.baidu.com")  # get a proxy ip for this target
    print(proxy)

summary

If you have a database installed, you can store the proxies there instead; this code keeps them in a local file. In any case, keep the crawl rate reasonable so that your local IP does not get banned.
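
As a minimal sketch (not part of the code above; the database file name, table name and columns are assumptions), the same pool could be persisted with the standard sqlite3 module instead of a text file:

import sqlite3

def save_proxies(proxies):  # proxies: list of {"type": ..., "proxy": ...} dicts
    con = sqlite3.connect("ip_pool.db")
    con.execute("CREATE TABLE IF NOT EXISTS pool (type TEXT, proxy TEXT UNIQUE)")
    con.executemany("INSERT OR IGNORE INTO pool VALUES (?, ?)",
                    [(p["type"], p["proxy"]) for p in proxies])
    con.commit()
    con.close()

def load_proxies():
    con = sqlite3.connect("ip_pool.db")
    rows = con.execute("SELECT type, proxy FROM pool").fetchall()
    con.close()
    return [{"type": t, "proxy": p} for t, p in rows]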

Topics: Python https TCP/IP