Hands-on crawler: downloading comics from copymanga - Python

Posted by redtux on Thu, 23 Dec 2021 15:05:39 +0100

I Capture packets to get links

Take crawling "ex con" as an example

Get search links

https://api.copymanga.com/api/v3/search/comic?limit=5&q=<search term>  (example: q=criminal record)

Get comic detail page

https://api.copymanga.com/api/v3/comic/qiankezhe/group/default/chapters?limit=200

First words link

https://api.copymanga.com/api/v3/comic/qiankezhe/chapter2/52f9d522-0d71-11eb-93ea-00163e0ca5bd

In the process of capturing packets, because different headers have different responses, you need to insert a sentence

headers = {"User-Agent" : "Dart/2.10 (dart:io)", "region" : "1"}

Then, first reference the requests module and write out the request function and create directory function

#request
def request_get(url):
    """GET *url* with the headers the copymanga app sends.

    Returns the requests.Response on success; on a network failure it
    prints a notice and terminates the program.
    """
    # The API answers differently depending on the request headers, so
    # mimic the official app (Dart HTTP client, region 1).
    # Fixed typo: the original spelled the header "Uesr-Agent", so the
    # User-Agent was never actually set.
    headers = {"User-Agent": "Dart/2.10 (dart:io)", "region": "1"}
    try:
        response = requests.get(url, headers=headers)
        # Responses may carry a UTF-8 BOM; utf-8-sig strips it.
        response.encoding = 'utf-8-sig'
        return response
    except requests.RequestException:
        print("Access failed, please check the network")
        sys.exit()
#Create directory
def mkdir(path):
    """Create directory *path* (including parents) if it does not exist.

    Keeps the original behaviour of only printing a notice when the
    directory is already present.
    """
    # Original body mixed tabs and spaces; rewritten with guard-clause
    # structure and consistent 4-space indentation.
    if os.path.exists(path):
        print(path + "directory already exists")
    else:
        os.makedirs(path)

II Processing search links and their parameters

Use the packet capture software to get the search link

https://api.copymanga.com/api/v3/search/comic?limit=5&q=<search term>  (example: q=criminal record)

The parameter limit is the number of cartoon search results displayed. This is fuzzy search, and the parameter q is the search content

Since it is a fuzzy search, of course, you should manually search the app and rank the comics you want

Let's first change the limit parameter to 1, so that the link will only return one cartoon to facilitate observation of the response:

{
    "code": 200,
    "message": "Request succeeded",
    "results": {
        "list": [
            {
                "cover": "https://mirror2.mangafunc.fun/comic/qiankezhe/cover/3c1a6b4c-0d6b-11eb-b49c-00163e0ca5bd.jpg!kb_m_item",
                "img_type": 2,
                "author": [
                    {
                        "name": "Moon Island winter 2",
                        "alias": null,
                        "path_word": "yuedaodonger"
                    },
                    {
                        "name": "Xiangchuanまさひと",
                        "alias": null,
                        "path_word": "xiangchuanirihm"
                    }
                ],
                "name": "Criminal record",
                "alias": "Criminal record,Criminal record",
                "path_word": "qiankezhe",
                "popular": 13042
            }
        ],
        "total": 1816,
        "limit": 1,
        "offset": 0
    }
}

Observe that qiankezhe in the comic detail page link corresponds to the path_word field, and the comic title corresponds to the name field

Then, based on this fuzzy search, we use the json module to extract these two fields and store them in the dictionaries name_words and names, keyed by the rank of each search result. Code:

#Ask the user which comic to fetch and which search result to use.
try:
    name = input("Please enter cartoon name:")
    th = int(input("Please enter the ranking position (number) of cartoon search results:"))
except (ValueError, EOFError):
    # Narrowed from a bare except: only bad/missing input is "Input
    # error"; Ctrl-C now propagates instead of being swallowed.
    print("Input error")
    sys.exit()

#Fuzzy search; limit=10 returns up to ten candidate comics.
search_url = "https://api.copymanga.com/api/v3/search/comic?limit=10&q={}".format(name)
search_str = request_get(search_url)
name_str = json.loads(search_str.text).get("results").get("list")
#Both dicts are keyed by the 1-based rank of the search result.
name_words = {}
names = {}
for rank, comic in enumerate(name_str, start=1):
    name_words[rank] = comic.get("path_word")
    names[rank] = comic.get("name")

III Processing comic detail pages and their parameters

https://api.copymanga.com/api/v3/comic/qiankezhe/group/default/chapters?limit=200

Look at the link: limit is how many chapters to return, and the qiankezhe in the middle comes from the name_words dictionary above

Then change the limit to 1 and take a look at the response (remember to format it with json to make the response look better):

{
	"code": 200,
	"message": "\u8bf7\u6c42\u6210\u529f",
	"results": {
		"list": [{
			"index": 0,
			"uuid": "52f9d522-0d71-11eb-93ea-00163e0ca5bd",
			"count": 28,
			"ordered": 10,
			"size": 41,
			"name": "First words",
			"comic_id": "3c192200-0d6b-11eb-b49c-00163e0ca5bd",
			"comic_path_word": "qiankezhe",
			"group_id": null,
			"group_path_word": "default",
			"type": 1,
			"img_type": 2,
			"datetime_created": "2020-10-14",
			"prev": null,
			"next": "cffb84a2-15d8-11eb-9e11-00163e0ca5bd"
		}],
		"total": 28,
		"limit": 1,
		"offset": 0
	}
}

See that the uuid inside corresponds to the first conversation link 52f9d522-0d71-11eb-93ea-00163e0ca5bd

https://api.copymanga.com/api/v3/comic/qiankezhe/chapter2/52f9d522-0d71-11eb-93ea-00163e0ca5bd

Then we create a directory for this comic, obtain each chapter's uuid and name (the name is the chapter number), and create a per-chapter directory to store its pictures:

#Create comic catalog
name_path = path + "\\" + names[th]
mkdir(name_path)
#Get the uuid of each call
uuid_url = "https://api.copymanga.com/api/v3/comic/{}/group/default/chapters?limit=200".format(name_words[th])
chapter_list = json.loads(request_get(uuid_url).text).get("results").get("list")
for chapter in chapter_list:
    uuid = chapter.get("uuid")
    #chaper_path is also read by down() as a module-level name
    chaper_path = name_path + "\\" + chapter.get("name")
    mkdir(chaper_path) #Create the 'first few words' directory
    down(uuid) #Add the down function to download pictures (the down function is not written yet)

IV Write the down function to download pictures

Let's take a look at the link in the first sentence

https://api.copymanga.com/api/v3/comic/qiankezhe/chapter2/52f9d522-0d71-11eb-93ea-00163e0ca5bd

qiankezhe and 52f9d522-0d71-11eb-93ea-00163e0ca5bd were obtained above: they come from the name_words dictionary and the uuid string respectively

Look at the response (the contents are too long, so I'll delete some):

{
	"code": 200,
	"message": "\u8bf7\u6c42\u6210\u529f",
	"results": {
		"show_app": false,
		"is_lock": false,
		"is_login": false,
		"is_mobile_bind": false,
		"is_vip": false,
		"comic": {
			"name": "\u524d\u79d1\u8005",
			"uuid": "3c192200-0d6b-11eb-b49c-00163e0ca5bd",
			"path_word": "qiankezhe",
			"restrict": {
				"value": 0,
				"display": "\u4e00\u822c\u5411(\u514d\u8cbb)"
			}
		},
		"chapter": {
			"index": 0,
			"uuid": "52f9d522-0d71-11eb-93ea-00163e0ca5bd",
			"count": 28,
			"ordered": 10,
			"size": 41,
			"name": "\u7b2c01\u8bdd",
			"comic_id": "3c192200-0d6b-11eb-b49c-00163e0ca5bd",
			"comic_path_word": "qiankezhe",
			"group_id": null,
			"group_path_word": "default",
			"type": 1,
			"img_type": 2,
			"datetime_created": "2020-10-14",
			"prev": null,
			"next": "cffb84a2-15d8-11eb-9e11-00163e0ca5bd",
			"contents": [{
				"uuid": "8aff0712-0d71-11eb-9be4-00163e0ca5bd",
				"url": "https://mirror2.mangafunc.fun/comic/qiankezhe/420b9/8afbbff8-0d71-11eb-9be4-00163e0ca5bd.jpg!kb_m_read_large"
			}],
			"words": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 30, 38, 39, 21, 31, 27, 32, 34, 23, 26, 28, 24, 40, 36, 20, 25, 33, 29, 22, 35, 37],
			"is_long": false
		}
	}
}

You can see that the url in the contents is the link to each image, so just get the link directly and download it locally

Look at the code:

def down(uuid):
    """Download every page image of chapter *uuid* into chaper_path.

    Relies on module-level state set by the caller: name_words[th] is
    the comic's path_word, chaper_path is the chapter's output folder.
    """
    # BUG FIX: the original formatted the whole name_words dict into the
    # URL; the API expects the comic's path_word, i.e. name_words[th].
    chapter_url = "https://api.copymanga.com/api/v3/comic/{}/chapter2/{}".format(name_words[th], uuid)
    chapter_str = request_get(chapter_url)
    contents = json.loads(chapter_str.text).get("results").get("chapter").get("contents")
    # Pages are saved as 1.jpg, 2.jpg, ... in API order.
    for num, page in enumerate(contents, start=1):
        picture = request_get(page.get("url"))
        # "\\{}" writes an explicit backslash; the original "\{" only
        # worked because Python passes unknown escapes through.
        with open(chaper_path + "\\{}.jpg".format(num), "wb") as fh:
            fh.write(picture.content)

ending:

In this way, we use four libraries: requests, sys, os and json

Integrate all codes:

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import requests
import sys
import os
import json

#############################Configuration area##############################
#Save directory: downloaded comics are written beneath this folder
path = r"D:\0file\Download"
#################################################################

#################################################################
############################Code area###############################
#Ask which comic to fetch and which search result to use.
try:
    name = input("Please enter cartoon name:")
    th = int(input("Please enter the ranking position (number) of cartoon search results:"))
except (ValueError, EOFError):
    # Narrowed from a bare except: only bad/missing input is "Input
    # error"; Ctrl-C now propagates instead of being swallowed.
    print("Input error")
    sys.exit()

#--------------—Function area————————————————————————————#
#Create directory
def mkdir(path):
    """Create directory *path* (including parents) if it does not exist.

    Keeps the original behaviour of only printing a notice when the
    directory is already present.
    """
    # Original body mixed tabs and spaces; rewritten with guard-clause
    # structure and consistent 4-space indentation.
    if os.path.exists(path):
        print(path + "directory already exists")
    else:
        os.makedirs(path)
#request
def request_get(url):
    """GET *url* with the headers the copymanga app sends.

    Returns the requests.Response on success; on a network failure it
    prints a notice and terminates the program.
    """
    # The API answers differently depending on the request headers, so
    # mimic the official app (Dart HTTP client, region 1).
    # Fixed typo: the original spelled the header "Uesr-Agent", so the
    # User-Agent was never actually set.
    headers = {"User-Agent": "Dart/2.10 (dart:io)", "region": "1"}
    try:
        response = requests.get(url, headers=headers)
        # Responses may carry a UTF-8 BOM; utf-8-sig strips it.
        response.encoding = 'utf-8-sig'
        return response
    except requests.RequestException:
        print("Access failed, please check the network")
        sys.exit()
#Next picture
def down(uuid):
    """Download every page image of chapter *uuid* into chaper_path.

    Relies on module-level state set by the caller: name_words[th] is
    the comic's path_word, chaper_path is the chapter's output folder.
    """
    # BUG FIX: the original formatted the whole name_words dict into the
    # URL; the API expects the comic's path_word, i.e. name_words[th].
    chapter_url = "https://api.copymanga.com/api/v3/comic/{}/chapter2/{}".format(name_words[th], uuid)
    chapter_str = request_get(chapter_url)
    contents = json.loads(chapter_str.text).get("results").get("chapter").get("contents")
    # Pages are saved as 1.jpg, 2.jpg, ... in API order.
    for num, page in enumerate(contents, start=1):
        picture = request_get(page.get("url"))
        # "\\{}" writes an explicit backslash; the original "\{" only
        # worked because Python passes unknown escapes through.
        with open(chaper_path + "\\{}.jpg".format(num), "wb") as fh:
            fh.write(picture.content)
#-------------------------------#

#Get search results
#Fuzzy search; limit=10 returns up to ten candidate comics.
search_url = "https://api.copymanga.com/api/v3/search/comic?limit=10&q={}".format(name)
search_str = request_get(search_url)
name_str = json.loads(search_str.text).get("results").get("list")
#Both dicts are keyed by the 1-based rank of the search result.
name_words = {}
names = {}
num = 1
for comic in name_str:
    name_words[num] = comic.get("path_word")
    names[num] = comic.get("name")
    num += 1

#Create comic catalog
name_path = path + "\\" + names[th]
mkdir(name_path)
#Get the uuid of each call
uuid_url = "https://api.copymanga.com/api/v3/comic/{}/group/default/chapters?limit=200".format(name_words[th])
chapter_list = json.loads(request_get(uuid_url).text).get("results").get("list")
for chapter in chapter_list:
    uuid = chapter.get("uuid")
    #chaper_path is read by down() as a module-level name
    chaper_path = name_path + "\\" + chapter.get("name")
    mkdir(chaper_path)
    down(uuid)
#################################################################
#################################################################
    


Topics: Python