Python Crawler Diary 01
A diary recording what I learn while writing web crawlers.
Python is the programming language of choice.
1. Environment preparation
Python 3.6+
MySQL
PyCharm
2. Approach
The goal is to crawl the Maoyan Top 100 movie board.
1. Analyze the URL pattern
Looking at https://maoyan.com/board/4?offset=10, the URL parameter offset acts as the page offset.
All pages can then be crawled automatically by looping over the offsets (see the sketch at the end of this list).
2. Fetch the HTML for each URL
Fetching the HTML may trigger the site's anti-crawling mechanism; configuring request headers avoids most of this (solved, though not completely).
3. Parse the HTML pages
By importing one of Python's parsing packages, the HTML is parsed into a tree, which can then be walked according to its structure. There are many ways to extract data from it:
you can use the APIs provided by the package, regular expressions, or, best of all, a flexible combination of both.
4. Save data
Wrap the data extracted from the HTML pages into records and save them to MySQL.
5. Display the data in a visual interface
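
As a quick check of step 1, here is a minimal sketch of the URL pattern (assuming 10 movies per page, which is what the Maoyan board shows):

# Each board page lists 10 movies, so offset = i * 10 covers the whole Top 100.
base_url = 'https://maoyan.com/board/4?offset='
page_urls = [base_url + str(i * 10) for i in range(10)]
print(page_urls[0])   # https://maoyan.com/board/4?offset=0
print(page_urls[-1])  # https://maoyan.com/board/4?offset=90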
3. Getting to work
1. Create a new Python project
Pick a directory you like and create a new project there.
2. Import packages
from time import sleep                # delay requests to avoid an IP ban from overly frequent access
import pymysql                        # connect to MySQL and store the data
from bs4 import BeautifulSoup         # parse HTML
import re                             # regular expressions
import urllib.request, urllib.error   # fetch web page data from a custom URL
3. Testing the individual pieces
Fetching a page with urllib.request
import urllib.request

url = 'https://maoyan.com/board/4?offset='  # request URL
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
}
req = urllib.request.Request(url, headers=headers)
response = urllib.request.urlopen(req)
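
To confirm the request actually went through, the response can be decoded and inspected; a small follow-up to the snippet above:

# Read and decode the body; Maoyan pages are UTF-8 encoded.
html = response.read().decode('utf-8')
print(response.status)  # 200 means the request succeeded
print(html[:200])       # peek at the first 200 characters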
Connecting to MySQL with pymysql
import pymysql

# Open a database connection
conn = pymysql.connect(
    host='xxx.xx.xx.xx',  # host
    port=3306,            # default port; change to match your setup
    user='root',          # user name
    passwd='123456',      # password
    db='luke_db',         # database name
)
cur = conn.cursor()
print(conn, cur)
cur.close()
conn.close()
If the connection and cursor objects print successfully, the link to MySQL works.
MySQL can be set up locally or remotely; here I connect remotely to an instance I built on a Linux machine.
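
The complete code further down inserts into a table named luke_db.t_movie_top100_maoyan, whose definition never appears in this diary. Here is a sketch of a schema matching the column names used there; the column types are my assumption:

import pymysql

conn = pymysql.connect(host='xxx.xx.xx.xx', port=3306, user='root',
                       passwd='123456', db='luke_db')
cur = conn.cursor()
# Assumed DDL: the column names come from the INSERT in the complete code,
# but the types are guesses, so adjust them to your needs.
cur.execute('''
    create table if not exists t_movie_top100_maoyan (
        xh        int,           -- row number
        m_title   varchar(128),  -- movie title
        m_link    varchar(512),  -- detail page link
        m_pic     varchar(512),  -- poster image link
        m_score   float,         -- score
        m_actor   varchar(256),  -- starring actors
        m_pubdate varchar(64)    -- release time
    )
''')
conn.commit()
cur.close()
conn.close()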
Parsing HTML pages with bs4.BeautifulSoup
import urllib.request
from bs4 import BeautifulSoup

url = 'https://maoyan.com/board/4?offset='  # request URL
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
}
req = urllib.request.Request(url, headers=headers)
response = urllib.request.urlopen(req)
# print(response.read().decode('utf-8'))
html = response.read().decode('utf-8')
html_parser = BeautifulSoup(html, "html.parser")  # parse the returned HTML with the html.parser parser
html_a = html_parser.find_all("a")  # extract the <a> tags
print(html_a)
The output shows that the results are indeed all <a> tags.
Notes on some common BeautifulSoup object types and methods:
# 1. Tag: a tag and its contents; returns the first match it finds
# print(bs.title.string)
# print(type(bs.title.string))
# 2. NavigableString: the string contents inside a tag
# print(bs.a.attrs)
# print(type(bs))
# 3. BeautifulSoup: represents the whole document
# print(bs.name)
# print(bs)
# print(bs.a.string)
# print(type(bs.a.string))
# 4. Comment: a special NavigableString whose output omits the comment markers
# -------------------------------
# Document traversal
# print(bs.head.contents)
# print(bs.head.contents[1])
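
A tiny self-contained demo of those object types, using a throwaway HTML string made up purely for illustration:

from bs4 import BeautifulSoup

bs = BeautifulSoup(
    '<html><head><title>Top 100</title></head>'
    '<body><a href="/films/1">Farewell My Concubine</a><!--comment--></body></html>',
    'html.parser')

print(bs.title.string)   # Tag content as a NavigableString: 'Top 100'
print(bs.a.attrs)        # tag attributes as a dict: {'href': '/films/1'}
print(bs.name)           # the BeautifulSoup object is the whole document: '[document]'
print(bs.head.contents)  # traversal: direct children of <head> as a list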
Regular expressions are often used when picking data out of HTML, so the re package comes in:
import urllib.request
from bs4 import BeautifulSoup
import re

url = 'https://maoyan.com/board/4?offset='  # request URL
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
}
req = urllib.request.Request(url, headers=headers)
response = urllib.request.urlopen(req)
# print(response.read().decode('utf-8'))
html = response.read().decode('utf-8')
html_parser = BeautifulSoup(html, "html.parser")
html_a = html_parser.find_all("a")                 # extract the <a> tags
html_a = str(html_a)                               # convert to string
find_maoyan_link = re.compile(r'.*?href="(.*?)"')  # regex rule
html_href = re.findall(find_maoyan_link, html_a)   # regex matching
for item in html_href:                             # print each match
    print(item)
# print(html_href)
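
A design note: converting the whole tag list to a string and regexing it works, but BeautifulSoup can also read attributes directly off each tag, which is less brittle; a minimal alternative, continuing from html_parser above:

# Alternative to regexing the stringified tag list: ask each Tag for its href.
for a_tag in html_parser.find_all("a"):
    href = a_tag.get("href")  # None when the tag has no href attribute
    if href:
        print(href)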
With that, all the basic building blocks test OK.
4. Complete code
import random
from time import sleep
import pymysql
from bs4 import BeautifulSoup
import re                             # regular expressions
import urllib.request, urllib.error  # fetch web page data from a custom URL


def main():
    parser_url_save()


def parser_url_save():
    print("doing parser url ....")
    # Maoyan: https://maoyan.com/board/4?offset=10
    # Douban: https://movie.douban.com/top250?start=25
    urls = ['https://maoyan.com/board/4?offset=', 'https://movie.douban.com/top250?start=']
    for url in urls:
        if url.find('https://maoyan.com') != -1:
            datalist = parser_html(url)
            # datalist = getData_maoyan(html)
            # print(datalist)
            saveData_maoyan(datalist)
        else:
            # parser_DOUBAN(url)
            print("parser douban ...")


# Regex list
# Maoyan title
find_maoyan_title = re.compile(r'.*?title="(.*?)"')
# Maoyan link
find_maoyan_link = re.compile(r'.*?href="(.*?)"')
# Maoyan picture
find_maoyan_pic = re.compile(r'.*?<img.*?data-src="(.*?)"')
# Maoyan score (integer part and fraction part)
find_maoyan_score1 = re.compile(r'<p class="score"><i class="integer">(.*?)<')
find_maoyan_score2 = re.compile(r'</i><i class="fraction">(.*?)<')
# Starring -- "主演：" is the literal label text on the Maoyan page
find_maoyan_star = re.compile(r'.*主演：(.*)')
# Release time -- "上映时间：" is the literal label text on the Maoyan page
find_maoyan_date = re.compile(r'上映时间：(.*)<')


def parser_html(url):
    cookie = '###'
    # agent = random.choice(user_agent)
    agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
    data_maoyan_list = []
    for i in range(0, 10):
        sleep(3)
        url_tmp = url + str(i * 10)
        headers = {
            "User-Agent": agent,
            "Cookie": cookie
        }
        req = urllib.request.Request(url_tmp, headers=headers)
        response = urllib.request.urlopen(req)
        html = response.read().decode("utf-8")
        data_maoyan_list = getData_maoyan(html, data_maoyan_list)
    return data_maoyan_list


def getData_maoyan(html, data_maoyan_list):
    html_parser = BeautifulSoup(html, "html.parser")
    base_url = 'https://maoyan.com/'
    item_list = html_parser.find_all('dd')
    for item in item_list:
        sleep(1)  # delayed access
        # data set for a single movie
        data = []
        item_a = str(item.a)
        # title
        title = re.findall(find_maoyan_title, item_a)[0]
        # link
        curr_url = base_url + str(re.findall(find_maoyan_link, item_a)[0])
        # picture link
        pic = re.findall(find_maoyan_pic, item_a)[0]
        # score
        item_p = item.select("p[class='score']")
        # if i * 10 == 20:
        #     print(item_p)
        score = "0.0"  # default to 0.0 when there is no score
        if len(re.findall(find_maoyan_score1, str(item_p))) > 0:
            score = float(str(re.findall(find_maoyan_score1, str(item_p))[0]) +
                          str(re.findall(find_maoyan_score2, str(item_p))[0]))
        # starring: <p class="star">
        item_star = item.select("p[class='star']")
        # print(str(item_star))
        star = re.findall(find_maoyan_star, str(item_star))[0]
        # release time: <p class="releasetime">
        item_releasetime = item.select("p[class='releasetime']")
        releasetime = re.findall(find_maoyan_date, str(item_releasetime))[0]
        # add to the dataset: title, curr_url, pic, score, star, releasetime
        data.append(title)
        data.append(curr_url)
        data.append(pic)
        data.append(score)
        data.append(star)
        data.append(releasetime)
        data_maoyan_list.append(data)
    return data_maoyan_list


def saveData_maoyan(data_list):
    conn = pymysql.connect(
        host='xxx.xx.xx.xx',  # host
        port=80,              # port (MySQL's default is 3306; set this to match your server)
        user='root',          # user name
        passwd='123456',      # password
        db='luke_db',         # database name
    )
    cur = conn.cursor()
    print(conn)
    # walk the list of scraped movies
    for id in range(0, len(data_list)):
        # pull out the fields
        ind_id = str(id)
        title = '"' + str(data_list[id][0]) + '"'     # title
        link = '"' + str(data_list[id][1]) + '"'      # link
        pic_link = '"' + str(data_list[id][2]) + '"'  # picture link
        score = str(data_list[id][3])                 # score
        actor = '"' + str(data_list[id][4]) + '"'     # starring
        pub_date = '"' + str(data_list[id][5]) + '"'  # release time
        arr = [ind_id, title, link, pic_link, score, actor, pub_date]
        sql = '''insert into luke_db.t_movie_top100_maoyan
            (xh,m_title,m_link,m_pic,m_score,m_actor,m_pubdate)
            values(%s)''' % ",".join(arr)
        print(sql)
        print(cur.execute(sql))
        conn.commit()  # commit the inserted data
    cur.close()
    conn.close()


if __name__ == '__main__':
    main()
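
One caveat about saveData_maoyan: building the INSERT by concatenating quoted strings breaks as soon as a title contains a double quote, and it is open to SQL injection. A sketch of the insert loop using pymysql's parameter binding instead (reusing the conn and cur from that function):

# Let pymysql bind and escape the values instead of quoting them by hand.
sql = ('insert into luke_db.t_movie_top100_maoyan '
       '(xh, m_title, m_link, m_pic, m_score, m_actor, m_pubdate) '
       'values (%s, %s, %s, %s, %s, %s, %s)')
for idx, row in enumerate(data_list):
    cur.execute(sql, (idx, row[0], row[1], row[2], row[3], row[4], row[5]))
conn.commit()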
And with that, a simple crawler is up and running.
Check the database
You can see that the data has been imported into the database
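
A quick spot check from Python as well (assuming the same placeholder connection parameters as earlier):

import pymysql

conn = pymysql.connect(host='xxx.xx.xx.xx', port=3306, user='root',
                       passwd='123456', db='luke_db')
cur = conn.cursor()
cur.execute('select xh, m_title, m_score from t_movie_top100_maoyan '
            'order by xh limit 5')
for row in cur.fetchall():
    print(row)  # each row comes back as an (xh, m_title, m_score) tuple
cur.close()
conn.close()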
Later, a visualization can be built on top of the data in the database.