Python crawler: downloading pictures
Downloading the campus-belle ("school flower") pictures from xiaohuar.com
First step
Import the crawler modules
# Load the crawler modules
import re        # regular expressions, used to extract the image links
import requests  # HTTP library, used to fetch pages and images
Second step
Request the xiaohuar.com listing page and grab its full HTML (everything you can see under F12 in the browser's developer tools)
import re        # crawler module
import requests  # crawler module

# Request the listing page; data holds its full HTML as a string,
# the same markup you can see in the browser under F12
response = requests.get('http://www.xiaohuar.com/list-1-0.html')
data = response.text
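If the request fails or the page reports an unusual encoding, response.text can come back empty or garbled. Below is a minimal defensive sketch of the same request; the User-Agent header, the timeout, and the raise_for_status() check are assumptions added for robustness, not part of the original code.

import requests

response = requests.get(
    'http://www.xiaohuar.com/list-1-0.html',
    headers={'User-Agent': 'Mozilla/5.0'},  # assumed browser-like UA; some sites reject bare requests
    timeout=10,                             # do not hang forever on a slow server
)
response.raise_for_status()                 # stop early on 4xx/5xx responses
response.encoding = response.apparent_encoding  # guess the encoding; Chinese pages are often GBK
data = response.text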
Third step
Extract the image links from the page HTML
import re        # crawler module
import requests  # crawler module

response = requests.get('http://www.xiaohuar.com/list-1-0.html')
data = response.text  # the page HTML as a string

# (.*?) captures the part we want: the image URL inside the src attribute
one_list = re.findall('" src="(.*?)" /></a>', data)

for v in range(len(one_list)):  # iterate by index so the links can be rewritten in place
    if one_list[v].startswith('/d'):  # some protected images use a relative path such as /d/...
        one_list[v] = f'http://www.xiaohuar.com/{one_list[v]}'  # prepend the site address to make a full hyperlink
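To see what the pattern captures, here is a tiny self-contained check; the HTML snippet is invented for illustration and only the regular expression comes from the step above.

import re

# made-up markup in the shape the pattern expects
sample = '<a href="/p-1-1.html"><img alt="girl" src="/d/file/20190101/abc.jpg" /></a>'

print(re.findall('" src="(.*?)" /></a>', sample))
# ['/d/file/20190101/abc.jpg']  -- a relative path, which is why the /d check above is needed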
Fourth step
Download each picture and save it to a file
import re        # crawler module
import requests  # crawler module

response = requests.get('http://www.xiaohuar.com/list-1-0.html')
data = response.text  # the page HTML as a string

one_list = re.findall('" src="(.*?)" /></a>', data)  # (.*?) captures the image URL

for v in range(len(one_list)):  # iterate by index so the links can be rewritten in place
    if one_list[v].startswith('/d'):  # turn relative /d/... paths into full hyperlinks
        one_list[v] = f'http://www.xiaohuar.com/{one_list[v]}'

for x in one_list:
    name = x.split('/')[-1]   # the last part of the URL becomes the file name
    dd = requests.get(x)      # download the picture
    dd = dd.content           # the raw bytes of the picture
    # the D:\picture folder must already exist; 'wb' writes the bytes to disk
    with open(f'D:\\picture\\{name}', 'wb') as fw:
        fw.write(dd)
        fw.flush()
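The hard-coded Windows path above requires the D:\picture folder to exist beforehand, and hand-escaping backslashes is easy to get wrong. Below is a small sketch of the same save step using pathlib; the image URL in it is hypothetical and only there to show the calls.

from pathlib import Path

import requests

save_dir = Path(r'D:\picture')               # same target folder as above
save_dir.mkdir(parents=True, exist_ok=True)  # create the folder if it is missing

url = 'http://www.xiaohuar.com/d/file/20190101/abc.jpg'  # hypothetical image link for illustration
name = url.split('/')[-1]

dd = requests.get(url).content               # raw bytes of the picture
(save_dir / name).write_bytes(dd)            # write them to D:\picture\<name>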
Fifth step
Loop over every listing page of the site, download all the pictures, and print a running count
import re        # crawler module
import requests  # crawler module

num = 0  # counts how many pictures have been crawled

for url_name in range(44):  # the listing has 44 pages: list-1-0.html ... list-1-43.html
    response = requests.get(f'http://www.xiaohuar.com/list-1-{url_name}.html')
    data = response.text
    # print(data)

    one_list = re.findall('" src="(.*?)" /></a>', data)
    for v in range(len(one_list)):
        if one_list[v].startswith('/d'):  # turn relative /d/... paths into full hyperlinks
            one_list[v] = f'http://www.xiaohuar.com/{one_list[v]}'

    for x in one_list:
        name = x.split('/')[-1]  # file name taken from the URL
        dd = requests.get(x)     # download the picture
        dd = dd.content          # the raw bytes of the picture
        with open(f'D:\\picture\\{name}', 'wb') as fw:
            fw.write(dd)
            fw.flush()
        num += 1
        print(f'Crawled picture {num}')
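As an optional refinement (not part of the original code), the whole crawl can be wrapped in a function with basic error handling so that one dead link or unreachable page does not stop the run. The page count of 44 and the D:\picture folder are taken from the step above; the timeouts and the skip-on-error behaviour are assumptions.

import re
from pathlib import Path

import requests


def crawl(pages: int = 44, save_dir: str = r'D:\picture') -> int:
    """Download every picture found on the listing pages; return how many were saved."""
    target = Path(save_dir)
    target.mkdir(parents=True, exist_ok=True)
    num = 0
    for page in range(pages):
        try:
            response = requests.get(f'http://www.xiaohuar.com/list-1-{page}.html', timeout=10)
            response.raise_for_status()
        except requests.RequestException:
            continue  # skip pages that fail to load instead of crashing
        links = re.findall('" src="(.*?)" /></a>', response.text)
        for link in links:
            if link.startswith('/d'):  # rewrite relative links into full URLs
                link = f'http://www.xiaohuar.com/{link}'
            try:
                dd = requests.get(link, timeout=10).content
            except requests.RequestException:
                continue  # skip pictures that fail to download
            (target / link.split('/')[-1]).write_bytes(dd)
            num += 1
            print(f'Crawled picture {num}')
    return num


if __name__ == '__main__':
    crawl()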