Python crawler (crawling pictures)

Posted by ESCForums.com on Mon, 11 Nov 2019 22:55:35 +0100

python crawler picture

Pictures of school flowers

First step

Import the modules the crawler needs

# Import the modules the crawler uses in the later steps
import  re                  # regular expressions: used to pick the picture links out of the page HTML
import  requests               # third-party HTTP client: used to download the pages and the pictures

The second step

Request the xiaohuar.com list page and get its full HTML source (everything the browser shows in the F12 developer tools)

# Request the xiaohuar.com list page and keep its HTML source (what F12 shows in the browser)
import re                           # regular expressions (not needed in this step yet)
import requests                     # third-party HTTP client used to download the page

# Without a timeout requests.get() can block forever if the server never answers.
response = requests.get('http://www.xiaohuar.com/list-1-0.html', timeout=10)
response.raise_for_status()         # stop on an HTTP error instead of keeping an error page
data = response.text                # the page's HTML source as a str

The third step

Extract the picture links from the xiaohuar.com list page

# Request the xiaohuar.com list page and collect the picture links it contains
import re                           # regular expressions, used to find the links in the HTML
import requests                     # third-party HTTP client used to download the page
from urllib.parse import urljoin    # resolves site-relative links against the page address

PAGE_URL = 'http://www.xiaohuar.com/list-1-0.html'

# Without a timeout requests.get() can block forever if the server never answers.
response = requests.get(PAGE_URL, timeout=10)
response.raise_for_status()         # stop on an HTTP error instead of parsing an error page
data = response.text                # the page's HTML source as a str

# (.*?) lazily captures whatever sits between src=" and the closing " /></a>
one_list = re.findall('" src="(.*?)" /></a>', data)

# Some links are site-relative (they start with '/d'). urljoin turns those into
# absolute URLs without producing a double slash after the host name, and it
# leaves links that are already absolute unchanged.
one_list = [urljoin(PAGE_URL, link) for link in one_list]

The fourth step

Download each picture and save it to a file

# Download every picture linked from the xiaohuar.com list page and save it to disk
import os                           # folder creation and path joining
import re                           # regular expressions, used to find the links in the HTML
import requests                     # third-party HTTP client used for all downloads
from urllib.parse import urljoin    # resolves site-relative links against the page address

PAGE_URL = 'http://www.xiaohuar.com/list-1-0.html'
# Raw string: in a normal string '\p' is an invalid escape sequence, which newer
# Python versions warn about.
SAVE_DIR = r'D:\picture'

# Without a timeout requests.get() can block forever if the server never answers.
response = requests.get(PAGE_URL, timeout=10)
response.raise_for_status()         # stop on an HTTP error instead of parsing an error page
data = response.text                # the page's HTML source as a str

# (.*?) lazily captures whatever sits between src=" and the closing " /></a>
one_list = re.findall('" src="(.*?)" /></a>', data)
# Make site-relative links (e.g. those starting with '/d') absolute without a
# double slash; links that are already absolute are left unchanged.
one_list = [urljoin(PAGE_URL, link) for link in one_list]

# open() does not create missing folders, so make sure the target exists first.
os.makedirs(SAVE_DIR, exist_ok=True)

for x in one_list:
    name = x.split('/')[-1]         # last path segment of the link becomes the file name
    if not name:
        continue                    # link ends with '/', so there is no usable file name
    try:
        picture = requests.get(x, timeout=10)
        picture.raise_for_status()  # do not save an HTTP error page as if it were a picture
    except requests.RequestException as err:
        print(f'Skipping {x}: {err}')
        continue
    # 'wb' because picture.content is raw bytes; the with-block flushes and closes the file.
    with open(os.path.join(SAVE_DIR, name), 'wb') as fw:
        fw.write(picture.content)

The fifth step

Repeat the previous steps for every list page of the site, printing a running count each time a picture is saved

# Walk through the xiaohuar.com list pages, download every linked picture and
# print a running count after each picture is saved.
import os                           # folder creation and path joining
import re                           # regular expressions, used to find the links in the HTML
import requests                     # third-party HTTP client used for all downloads
from urllib.parse import urljoin    # resolves site-relative links against the page address

# Raw string: in a normal string '\p' is an invalid escape sequence, which newer
# Python versions warn about.
SAVE_DIR = r'D:\picture'
PAGE_COUNT = 44                     # list pages list-1-0.html ... list-1-43.html are requested
# Compiled once instead of on every page; (.*?) lazily captures the src value.
LINK_PATTERN = re.compile('" src="(.*?)" /></a>')

# open() does not create missing folders, so make sure the target exists first.
os.makedirs(SAVE_DIR, exist_ok=True)

num = 0                             # number of pictures saved so far
for url_name in range(PAGE_COUNT):
    page_url = f'http://www.xiaohuar.com/list-1-{url_name}.html'
    try:
        # Without a timeout requests.get() can block forever on a silent server.
        response = requests.get(page_url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as err:
        # One unreachable page should not abort the whole crawl.
        print(f'Skipping page {page_url}: {err}')
        continue
    data = response.text
    # Make site-relative links (e.g. those starting with '/d') absolute without a
    # double slash; links that are already absolute are left unchanged.
    one_list = [urljoin(page_url, link) for link in LINK_PATTERN.findall(data)]
    for x in one_list:
        name = x.split('/')[-1]     # last path segment of the link becomes the file name
        if not name:
            continue                # link ends with '/', so there is no usable file name
        try:
            picture = requests.get(x, timeout=10)
            picture.raise_for_status()  # do not save an HTTP error page as a picture
        except requests.RequestException as err:
            print(f'Skipping {x}: {err}')
            continue
        # 'wb' because picture.content is raw bytes; the with-block flushes and closes the file.
        with open(os.path.join(SAVE_DIR, name), 'wb') as fw:
            fw.write(picture.content)
        num += 1
        print(f'Crawl{num}Zhang')

Topics: Python