catalogue
1.2.1 writing regular expressions:
1.3.1 the captured animation playback link is not complete
2.GUI displays crawler content
2.3.1 query by title - fuzzy query
Crawl source link: https://m.agefans.org/rank
target
Crawl out all the information about the ranking, name and popularity of 1 ~ 100 animation in the ranking list, store the information in sqlite3 database, and finally show the crawl out results with GUI.
1. Crawler code
import requests import re import sqlite3 def get_page(url):#Return the HTML source code of the url response = requests.get(url) if response.status_code==200: return response.text else: return 'fail' def parse_html(html):#HTML source code for parsing parameters #Regular expression matching animation ranking, playing link, name, popularity value pattern=re.compile('<li class="row mb-1 rank-item">.*?rank.*?>(\d+)</span>.*?href="(.*?)".*?age-text-blue small.*?>(.*?)</span>.*?col-3 small text-truncate.*?>(.*?)</span>', re.S) result=re.findall(pattern, html) return result def link_html():#Grab source html html = get_page('https://m.agefans.org/rank') result1 = parse_html(html) return result1 def save_db():#Store the crawled data in sqlite result1=link_html() con=sqlite3.connect(r'...\AGE.db') con.execute("""DROP TABLE data""") con.execute("create table data (rank primary key,link,title,rating)") cur=con.cursor() cur.executemany("insert into data(rank,link,title,rating) values(?,?,?,?)",result1) con.commit() cur.close() con.close() if __name__ == '__main__': save_db()
1.1 operation results
Use DB Browser for SQLite to view age DB crawling content (display 1 ~ 35 messages)
1.2 reptile difficulties
1.2.1 writing regular expressions:
Ranking:. *? rank.*?> (\d+)</span>
Link:. *? href="(.*?)"
Name:. *? age-text-blue small.*?> (.*?)</ span>
Heat: *? col-3 small text-truncate.*?> (.*?)</ span>
The composition of page elements corresponding to each website is different. You need to write the corresponding regular expression according to the website you actually need to crawl. CTRL+SHIFT+I enter the inspection page to view the elements.
1.3 deficiencies in reptiles
1.3.1 the captured animation playback link is not complete
Link link because the href in the < a > tag of the website is not an absolute path but a relative path, the crawled link can not jump directly, and there is no way to replace the relative path with an absolute path
2.GUI displays crawler content
import tkinter import tkinter.messagebox from tkinter.messagebox import * import tkinter.ttk import tkinter as tk import sqlite3 from PIL import ImageTk, Image from tkinter import ttk import pymysql win=tkinter.Tk() #Page size win.geometry("1390x750") win.title('AGE Ranking List') #title label=tkinter.Label(win,compound = 'center',text='AGE Animation ranking',font=('Blackbody',40),fg='#db7093',bg='#add8e6',width='500') label.pack() #Background picture imgpath = (r'...\1.jpg')#Background picture path img = Image.open(imgpath) canvas = tk.Canvas(win, width=2500, height=1000, bd=0) photo = ImageTk.PhotoImage(img) canvas.create_image(690, 280, image=photo) canvas.pack() from tkinter import * Label(win, text="Keyword query:",bg='#add8e6',font = (' bold ', 15)) place(x=500, y=80, width=120, height=25) selecttitle = StringVar() Entry(win, textvariable=selecttitle).place(x=650, y=80, width=300, height=25) # Database location database = (r'...\AGE.db') # Display function def showAllInfo(): # Delete previously displayed content x = dataTreeview.get_children() for item in x: dataTreeview.delete(item) # Connect to database con = sqlite3.connect(database) cur = con.cursor() cur.execute("select * from data") lst = cur.fetchall() for item in lst: dataTreeview.insert("", 100, text="line1", values=item) cur.close() con.close() #Query by title def showTitle(): if selecttitle.get() == "": showerror(title='Tips', message='Input cannot be empty') else: x = dataTreeview.get_children() for item in x: dataTreeview.delete(item) con = sqlite3.connect(database) cur = con.cursor() content="'%"+selecttitle.get()+"%'" #Fuzzy query cur.execute("select * from data where title like "+content) lst = cur.fetchall() if len(lst) == 0: #Judge that if the query cannot be found, it will prompt that the window cannot be found showerror(title='Tips', message='This animation is not on the list, or check whether the input information is correct') else:#Otherwise, the query several records window is displayed showinfo(title='Tips', message='Found'+str(len(lst))+"Data bar") for item in lst: dataTreeview.insert("", 100, text="line1", values=item) cur.close() con.close() tkinter.Button(win,text='Query all',width=40,command=showAllInfo,font=(12)).place(x=800, y=125, width=120, height=30) Button(win, text="Query by title", command=showTitle,font=(12)).place(x=550, y=125, width=120, height=30) #List sqlite data dataTreeview = ttk.Treeview(win, show='headings', column=('rank','link', 'title', 'rating')) dataTreeview.column('rank', width=2, anchor="center") dataTreeview.column('link', width=20, anchor="center") dataTreeview.column('title', width=350, anchor="center") dataTreeview.column('rating', width=15, anchor="center") dataTreeview.heading('rank', text='ranking') dataTreeview.heading('link', text='link') dataTreeview.heading('title', text='name') dataTreeview.heading('rating', text='degree of heat') dataTreeview.place(x=200, y=180, width=1000, height=300) #scroll bar s = tkinter.Scrollbar(dataTreeview, command=dataTreeview.yview) s.pack(side="right", fill="y") dataTreeview.config(yscrollcommand=s.set) win.mainloop()
2.1 ideas
According to the database obtained from the previously crawled data, the GUI interface is connected with the SQLite database to view the leaderboard information, and all information can be viewed, or relevant information can be obtained by searching according to the animation name keyword.
In addition to searching by title, it can also expand the functions of searching by heat and link on this basis.
2.2 operation results
2.3 GUI design difficulties
2.3.1 query by title - fuzzy query
content="'%"+selecttitle.get()+"%'"