Python 3 crawler (sqlite3 stores information) -- ranking of AGE animation websites

Posted by discobean on Thu, 13 Jan 2022 22:10:53 +0100

        

catalogue

target

1. Crawler code

1.1 operation results

1.2 reptile difficulties

1.2.1 writing regular expressions:

1.3 deficiencies in reptiles

1.3.1 the captured animation playback link is not complete

2.GUI displays crawler content

2.1 ideas

2.2 operation results

2.3 GUI design difficulties

2.3.1 query by title - fuzzy query

Crawl source link: https://m.agefans.org/rank

target

Crawl out all the information about the ranking, name and popularity of 1 ~ 100 animation in the ranking list, store the information in sqlite3 database, and finally show the crawl out results with GUI.

1. Crawler code

import requests
import re
import sqlite3

def get_page(url):#Return the HTML source code of the url
	response = requests.get(url)
	if response.status_code==200:
	  return response.text
	else:
	  return 'fail'

def parse_html(html):#HTML source code for parsing parameters
    #Regular expression matching animation ranking, playing link, name, popularity value
	pattern=re.compile('<li class="row mb-1 rank-item">.*?rank.*?>(\d+)</span>.*?href="(.*?)".*?age-text-blue small.*?>(.*?)</span>.*?col-3 small text-truncate.*?>(.*?)</span>', re.S)
	result=re.findall(pattern, html)
	return result

def link_html():#Grab source html
    html = get_page('https://m.agefans.org/rank')
    result1 = parse_html(html)
    return result1

def save_db():#Store the crawled data in sqlite
    result1=link_html()
    con=sqlite3.connect(r'...\AGE.db')
    con.execute("""DROP TABLE data""")
    con.execute("create table data (rank primary key,link,title,rating)")
    cur=con.cursor()
    cur.executemany("insert into data(rank,link,title,rating) values(?,?,?,?)",result1)
    con.commit()
    cur.close()
    con.close()

if __name__ == '__main__':
    save_db()

1.1 operation results

 

 

Use DB Browser for SQLite to view age DB crawling content (display 1 ~ 35 messages)

 

1.2 reptile difficulties

1.2.1 writing regular expressions:

Ranking:. *? rank.*?> (\d+)</span>

Link:. *? href="(.*?)"

Name:. *? age-text-blue small.*?> (.*?)</ span>

Heat: *? col-3 small text-truncate.*?> (.*?)</ span>

The composition of page elements corresponding to each website is different. You need to write the corresponding regular expression according to the website you actually need to crawl. CTRL+SHIFT+I enter the inspection page to view the elements.

1.3 deficiencies in reptiles

1.3.1 the captured animation playback link is not complete

Link link because the href in the < a > tag of the website is not an absolute path but a relative path, the crawled link can not jump directly, and there is no way to replace the relative path with an absolute path

 

 

2.GUI displays crawler content

import tkinter
import tkinter.messagebox
from tkinter.messagebox import *
import tkinter.ttk
import tkinter as tk
import sqlite3
from PIL import ImageTk, Image
from tkinter import ttk
import pymysql
win=tkinter.Tk()

#Page size
win.geometry("1390x750")
win.title('AGE Ranking List')

#title
label=tkinter.Label(win,compound = 'center',text='AGE Animation ranking',font=('Blackbody',40),fg='#db7093',bg='#add8e6',width='500')
label.pack()

#Background picture
imgpath = (r'...\1.jpg')#Background picture path
img = Image.open(imgpath)
canvas = tk.Canvas(win, width=2500, height=1000, bd=0)
photo = ImageTk.PhotoImage(img)
canvas.create_image(690, 280, image=photo)
canvas.pack()

from tkinter import *
Label(win, text="Keyword query:",bg='#add8e6',font = (' bold ', 15)) place(x=500, y=80, width=120, height=25)
selecttitle = StringVar()
Entry(win, textvariable=selecttitle).place(x=650, y=80, width=300, height=25)


# Database location
database = (r'...\AGE.db')

# Display function
def showAllInfo():
    # Delete previously displayed content
    x = dataTreeview.get_children()
    for item in x:
        dataTreeview.delete(item)
    # Connect to database
    con = sqlite3.connect(database)
    cur = con.cursor()
    cur.execute("select * from data")
    lst = cur.fetchall()
    for item in lst:
        dataTreeview.insert("", 100, text="line1", values=item)
    cur.close()
    con.close()

#Query by title
def showTitle():
   if selecttitle.get() == "":
       showerror(title='Tips', message='Input cannot be empty')
   else:
       x = dataTreeview.get_children()
       for item in x:
           dataTreeview.delete(item)
       con = sqlite3.connect(database)
       cur = con.cursor()
       content="'%"+selecttitle.get()+"%'"	#Fuzzy query
       cur.execute("select * from data where title like "+content)
       lst = cur.fetchall()
       if len(lst) == 0:  #Judge that if the query cannot be found, it will prompt that the window cannot be found
           showerror(title='Tips', message='This animation is not on the list, or check whether the input information is correct')
       else:#Otherwise, the query several records window is displayed
           showinfo(title='Tips', message='Found'+str(len(lst))+"Data bar")
           for item in lst:
               dataTreeview.insert("", 100, text="line1", values=item)
       cur.close()
       con.close()

tkinter.Button(win,text='Query all',width=40,command=showAllInfo,font=(12)).place(x=800, y=125, width=120, height=30)
Button(win, text="Query by title", command=showTitle,font=(12)).place(x=550, y=125, width=120, height=30)

#List sqlite data
dataTreeview = ttk.Treeview(win, show='headings', column=('rank','link', 'title', 'rating'))
dataTreeview.column('rank', width=2, anchor="center")
dataTreeview.column('link', width=20, anchor="center")
dataTreeview.column('title', width=350, anchor="center")
dataTreeview.column('rating', width=15, anchor="center")

dataTreeview.heading('rank', text='ranking')
dataTreeview.heading('link', text='link')
dataTreeview.heading('title', text='name')
dataTreeview.heading('rating', text='degree of heat')
dataTreeview.place(x=200, y=180, width=1000, height=300)

#scroll bar
s = tkinter.Scrollbar(dataTreeview, command=dataTreeview.yview)
s.pack(side="right", fill="y")
dataTreeview.config(yscrollcommand=s.set)
win.mainloop()

2.1 ideas

According to the database obtained from the previously crawled data, the GUI interface is connected with the SQLite database to view the leaderboard information, and all information can be viewed, or relevant information can be obtained by searching according to the animation name keyword.

In addition to searching by title, it can also expand the functions of searching by heat and link on this basis.

2.2 operation results

 

2.3 GUI design difficulties

2.3.1 query by title - fuzzy query

          content="'%"+selecttitle.get()+"%'"

Topics: Python crawler GUI Sqlite3