Why crawlers still get blocked when using a proxy to collect websites, and how to fix it

Posted by RootKit on Thu, 20 Jan 2022 14:49:29 +0100

It is well known that a web crawler should use rotating (dynamic) proxies to avoid being rate-limited or blocked by the target website. However, even when rotating proxies are used, anti-crawling responses such as 403, 503 or 429 can still occur during collection. Why? In our experience, it is usually caused by one of the following:

1. Setting and rotating the User-Agent

When a crawler collects websites, its HTTP requests should carry a realistic User-Agent (UA) header, because the UA identifies the browser making the request. If an HTTP request has no UA, or the crawler openly identifies itself as a collection tool, the target website is very likely to refuse the request.

2. Controlling the request frequency of a single proxy IP

Even though the crawler uses rotating proxies, if its multithreading control is not well implemented, a single proxy IP can send a large number of requests in a short time, so that IP is flagged for accessing the website too frequently.

3. Managing the validity period of proxy IPs

While rotating proxy IPs are in use, each one must be health-checked. As soon as a proxy IP is found to have high latency or low bandwidth, it should be proactively discarded to avoid timeouts during use.

If the work described above seems too troublesome, consider using the enhanced version of the crawler proxy with automatic forwarding. This product automatically assigns a different proxy IP to each HTTP request and manages the IP pool across threads for you, ensuring a connection success rate above 99% and latency below 300 ms, so you can start collecting websites quickly. The following product demo can be copied and used directly; to run it, configure the proxy parameters (proxyHost, proxyPort, proxyUser, proxyPass) and the target website (targetUrl):

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.http.Header;
import org.apache.http.HeaderElement;
import org.apache.http.HttpHost;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.AuthCache;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.client.HttpRequestRetryHandler;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.config.AuthSchemes;
import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.LayeredConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.auth.BasicScheme;
import org.apache.http.impl.client.BasicAuthCache;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.client.ProxyAuthenticationStrategy;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.message.BasicHeader;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.NameValuePair;
import org.apache.http.util.EntityUtils;

/**
 * Demo that sends an HTTP GET request through an authenticating forward proxy using
 * Apache HttpClient and prints the response status code and body to standard output.
 *
 * <p>To run it, configure {@code proxyHost}, {@code proxyPort}, {@code proxyUser} and
 * {@code proxyPass} below, and the target URL in {@link #doGetRequest()}.
 */
public class Demo
{
    // Proxy server (product official website www.16yun.cn)
    final static String proxyHost = "t.16yun.cn";
    final static Integer proxyPort = 31000;

    // Proxy authentication information
    final static String proxyUser = "username";
    final static String proxyPass = "password";

    // Connection pool shared by every request issued through this class.
    private static final PoolingHttpClientConnectionManager cm;
    // The proxy endpoint; used both for routing and as the auth-cache key.
    private static final HttpHost proxy;
    private static final CredentialsProvider credsProvider;
    private static final RequestConfig reqConfig;
    // Built once and reused for all requests; it lives for the lifetime of the JVM.
    private static final CloseableHttpClient httpClient;

    static {
        ConnectionSocketFactory plainsf = PlainConnectionSocketFactory.getSocketFactory();
        LayeredConnectionSocketFactory sslsf = SSLConnectionSocketFactory.getSocketFactory();

        Registry<ConnectionSocketFactory> registry =
            RegistryBuilder.<ConnectionSocketFactory>create()
                .register("http", plainsf)
                .register("https", sslsf)
                .build();

        cm = new PoolingHttpClientConnectionManager(registry);
        cm.setMaxTotal(20);
        cm.setDefaultMaxPerRoute(5);

        proxy = new HttpHost(proxyHost, proxyPort, "http");

        // Register the credentials for the proxy endpoint only, so they are never
        // offered to a target website that happens to issue an authentication challenge.
        credsProvider = new BasicCredentialsProvider();
        credsProvider.setCredentials(
            new AuthScope(proxyHost, proxyPort),
            new UsernamePasswordCredentials(proxyUser, proxyPass));

        // All three timeouts are in milliseconds.
        reqConfig = RequestConfig.custom()
            .setConnectionRequestTimeout(5000)
            .setConnectTimeout(5000)
            .setSocketTimeout(5000)
            .setExpectContinueEnabled(false)
            .setProxy(proxy)
            .build();

        httpClient = HttpClients.custom()
            .setConnectionManager(cm)
            .setDefaultCredentialsProvider(credsProvider)
            .build();
    }

    /**
     * Executes the given request through the configured proxy and prints the response
     * status code followed by the response body, line by line, to standard output.
     *
     * <p>The request's headers and {@link RequestConfig} are overwritten by this method.
     * Any exception raised while sending the request or reading the response is printed
     * to standard error and not rethrown.
     *
     * @param httpReq the request to execute
     */
    public static void doRequest(HttpRequestBase httpReq) {
        try {
            setHeaders(httpReq);
            httpReq.setConfig(reqConfig);

            // Seed the auth cache with a Basic scheme keyed by the proxy host; intended
            // to let the client authenticate to the proxy without an extra round trip.
            AuthCache authCache = new BasicAuthCache();
            authCache.put(proxy, new BasicScheme());

            HttpClientContext localContext = HttpClientContext.create();
            localContext.setAuthCache(authCache);

            // Closing the response returns the pooled connection to the connection manager.
            try (CloseableHttpResponse httpResp = httpClient.execute(httpReq, localContext)) {
                System.out.println(httpResp.getStatusLine().getStatusCode());

                // Some responses carry no body; there is nothing more to print then.
                if (httpResp.getEntity() == null) {
                    return;
                }

                // Decode explicitly as UTF-8 instead of relying on the platform default charset.
                try (BufferedReader rd = new BufferedReader(
                        new InputStreamReader(httpResp.getEntity().getContent(), StandardCharsets.UTF_8))) {
                    String line;
                    while ((line = rd.readLine()) != null) {
                        System.out.println(line);
                    }
                }
            }
        } catch (Exception e) {
            // Demo boundary: report the failure and carry on rather than propagate it.
            e.printStackTrace();
        }
    }

    /**
     * Sets the request headers used by this demo.
     *
     * @param httpReq the request whose headers are set
     */
    private static void setHeaders(HttpRequestBase httpReq) {

        // Setting up proxy tunnel
        // Random random = new Random();
        // int tunnel = random.nextInt(10000);
        // httpReq.setHeader("Proxy-Tunnel", String.valueOf(tunnel));

        // NOTE(review): presumably present so that the client does not add its own
        // Accept-Encoding (compression) header to the request - confirm.
        httpReq.setHeader("Accept-Encoding", null);

    }

    /**
     * Sends a GET request for the demo target page through the proxy and prints the result.
     * Any exception is printed to standard error and not rethrown.
     */
    public static void doGetRequest() {
        // Target page to access
        String targetUrl = "https://httpbin.org/ip";

        try {
            HttpGet httpGet = new HttpGet(targetUrl);

            doRequest(httpGet);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Entry point: runs the GET demo once.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        doGetRequest();
    }
}

Topics: Java