Crawling Maoyan (Cat's Eye) Reviews of the Film "Cry Me a Sad River" with Python

Posted by daredevil88 on Mon, 23 Sep 2019 09:36:12 +0200

1. The Maoyan (Cat's Eye) Movie Review Interface

http://maoyan.com/films/1217236
If we visit this page directly in a desktop browser, we can only see the 10 hottest short reviews. How can we get all the short reviews?
(1) Visit the link above, press F12, then click the icon shown in the picture to switch the browser to mobile mode (Responsive Design Mode, Firefox shortcut Ctrl+Shift+M), and refresh the page.

(2) Switch to Google Chrome and repeat the steps above with the developer tools (F12) open. After the page has loaded, scroll down through the short reviews so that more of them are loaded. Find the request whose URL contains offset and startTime; its Response contains the data we want, in JSON format.

2. Get short reviews

(1) Simple analysis

Through the above analysis

Request URL: http://m.maoyan.com/mmdb/comments/movie/1217236.json?_v_=yes&offset=0&startTime=0%2021%3A09%3A31
Request Method: GET

After scrolling down several times, I noticed the following pattern:
Comparing the requests, we can roughly guess that offset is the starting position of the comments returned by the interface, with 15 comments per page (for example, offset=15 returns comments 15-30), and that startTime is the date of the current comments in a fixed format (e.g. 2018-10-06).
In addition, the trailing %2021%3A09%3A31 of the URL (the URL-encoded text " 21:09:31") never changes.
(2) Code acquisition

import requests
from fake_useragent import UserAgent
import json
'''
//The python learning materials prepared by Xiaobian, plus group: 821460695 can be obtained free of charge! ____________
'''
# Request headers: a random User-Agent plus the Host and Referer of the
# mobile comment page.
headers = {
    "User-Agent": UserAgent(verify_ssl=False).random,
    "Host": "m.maoyan.com",
    "Referer": "http://m.maoyan.com/movie/1217236/comments?_v_=yes",
}
# Cat's Eye Movie Short Comment Interface
offset = 0
# The film was released on September 21, 2018.
startTime = '2018-09-21'
# "%2021%3A09%3A31" is the URL-encoded suffix " 21:09:31"; it is kept
# pre-encoded so the query string is sent exactly as written here.
comment_api = 'http://m.maoyan.com/mmdb/comments/movie/1217236.json?_v_=yes&offset={0}&startTime={1}%2021%3A09%3A31'.format(offset, startTime)
# Send get request. Without a timeout, requests waits indefinitely on a
# stalled connection.
response_comment = requests.get(comment_api, headers=headers, timeout=10)
# Fail on HTTP errors instead of trying to parse an error page as JSON.
response_comment.raise_for_status()
json_comment = response_comment.json()
print(json_comment)

Return data:
(3) Brief introduction of data
(4) Data extraction

# Get data and store it
def get_data(self,json_comment):
    """Turn the comments of one API response into rows and store them.

    Each entry of ``json_comment["cmts"]`` becomes one row of the form
    ``[self.time, nickName, gender, cityName, userLevel, score, content]``.
    The complete list of rows is then passed to ``self.file_do``.

    Args:
        json_comment: Decoded JSON response; must contain the key "cmts"
            holding a list of comment dicts.

    Raises:
        KeyError: If "cmts" or any field other than "gender" is missing.
    """
    rows = [
        [
            self.time,
            entry["nickName"],
            # "gender" is the only optional field; 0 is used when absent.
            entry.get("gender", 0),
            entry["cityName"],
            entry["userLevel"],
            entry["score"],
            entry["content"],
        ]
        for entry in json_comment["cmts"]
    ]
    self.file_do(rows)

3. Storage of data

# Storage file
def file_do(list_info, path=r'G:\maoyan\maoyan.csv'):
    """Append comment rows to the CSV file, writing the header row first if needed.

    The header is written when the file does not exist yet or is empty.
    Header and rows go through the same ``csv.writer`` with the same
    encoding, so the file has one consistent encoding and line ending.

    NOTE(review): get_data calls this as ``self.file_do(list_info)``; if both
    live in the same class this must be a staticmethod (or take ``self``) --
    confirm against the full file.

    Args:
        list_info: Iterable of rows; each row is a sequence of the seven
            column values in header order.
        path: Target CSV file. Defaults to the original hard-coded location.

    Raises:
        FileNotFoundError: If the directory containing ``path`` does not exist.
        UnicodeEncodeError: If a value cannot be represented in GBK.
    """
    # Header
    name = ['Commentary date', 'Commentator nickname', 'Gender', 'City where','Cat eye rating','score','Comment content']
    try:
        needs_header = os.path.getsize(path) == 0
    except FileNotFoundError:
        # First run: the file has not been created yet, so it needs a header.
        needs_header = True
    # The encoding is explicit so appended rows match the GBK header instead
    # of depending on the platform default.
    with open(path, 'a', newline='', encoding='gbk') as file_test:
        writer = csv.writer(file_test)
        if needs_header:
            writer.writerow(name)
        # Append to the back of the file
        writer.writerows(list_info)

data display

Visualization of Data Analysis

1. Extracting data

Code:

def read_csv(path=r'G:\maoyan\maoyan.csv'):
    """Load the stored comments into the module-level column lists.

    The first row is treated as the header and skipped. For every other
    non-empty row, columns 0-5 are appended to the module-level lists
    ``time``, ``nickName``, ``gender``, ``cityName``, ``userLevel`` and
    ``score``; column 6 (the comment text) is collected and returned.

    NOTE(review): file_do writes this file as GBK while it is decoded here as
    UTF-8 with BOM -- confirm the file is converted before this reader runs.

    Args:
        path: CSV file to read. Defaults to the original hard-coded location.

    Returns:
        All comment texts concatenated into one string ('' when the file has
        no data rows).

    Raises:
        IndexError: If a non-empty data row has fewer than seven columns.
    """
    comments = []
    row_count = 0
    # Read file content
    with open(path, 'r', encoding='utf_8_sig', newline='') as file_test:
        reader = csv.reader(file_test)
        # Skip the header; the default keeps an empty file from raising
        # StopIteration (and from being reported as -1 rows).
        next(reader, None)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; there is nothing to store.
                continue
            time.append(row[0])
            nickName.append(row[1])
            gender.append(row[2])
            cityName.append(row[3])
            userLevel.append(row[4])
            score.append(row[5])
            comments.append(row[6])
            row_count += 1
    print('There are:' + str(row_count) + 'Bar data')
    # A single join avoids the quadratic cost of repeated string concatenation.
    return ''.join(comments)

Total: 15195 records

Visualization of Gender Distribution of Commentators

# Visualization of Gender Distribution of Commentators
def sex_distribution(gender):
    """Render a pie chart of the commenters' gender distribution to HTML.

    Args:
        gender: Sequence of gender codes as strings: '0' (unknown),
            '1' (male), '2' (female). Any other value is not counted.
    """
    from pyecharts import Pie
    attr = ["Other","male","female"]
    # Codes are listed in the same order as the labels in ``attr``.
    list_num = [gender.count(code) for code in ('0', '1', '2')]
    pie = Pie("Gender pie chart")
    pie.add("", attr, list_num, is_label_show=True)
    # Raw string: "\P", "\s" and "\p" are invalid escape sequences in a normal
    # literal and trigger a warning on current Python versions.
    pie.render(r"H:\PyCoding\spider_maoyan\picture\sex_pie.html")

Visualization of the Distribution of Commentators'Cities

# Visualization of the Distribution of Commentators'Cities
def city_distribution(cityName):
    """Render a bar chart of comment counts per city, largest first, to HTML.

    Args:
        cityName: Iterable of city names, one entry per comment.
    """
    from collections import Counter
    from pyecharts import Bar

    # most_common() returns (city, count) pairs sorted by count, largest
    # first, in one pass instead of one list.count() scan per distinct city.
    ranked = Counter(cityName).most_common()
    city_name = [city for city, _ in ranked]
    city_num = [num for _, num in ranked]

    bar = Bar("Commentator Urban Distribution")
    bar.add("", city_name, city_num, is_label_show=True, is_datazoom_show=True)
    # Raw string so the backslashes in the Windows path are never read as
    # escape sequences.
    bar.render(r"H:\PyCoding\spider_maoyan\picture\city_bar.html")

# Map visualization
def render_city(cities):

Visual Analysis of Total Daily Comments

def time_num_visualization(time):
    """Render a line chart of the number of comments per date to HTML.

    The (date, count) pairs are printed and then plotted in ascending order
    of the date value.

    Args:
        time: Iterable of comment dates, one entry per comment. Presumably
            ISO-style strings such as '2018-09-21', for which ascending string
            order is also chronological order -- verify against the data.
    """
    from collections import Counter
    from pyecharts import Line

    # Count every date in one pass, then order the pairs by date.
    sort_dict = sorted(Counter(time).items(), key=lambda d: d[0], reverse=False)
    print(sort_dict)
    time_name = [day for day, _ in sort_dict]
    time_num = [num for _, num in sort_dict]

    line = Line("Reviews Quantity Date Breakdown")
    line.add(
        "date-Comment number",
        time_name,
        time_num,
        is_fill=True,
        area_color="#000",
        area_opacity=0.3,
        is_smooth=True,
    )
    # Raw string so the backslashes in the Windows path are never read as
    # escape sequences.
    line.render(r"H:\PyCoding\spider_maoyan\picture\c_num_line.html")

Visualization of Cat's Eye Grading and Scoring

def level_score_visualization(userLevel,score):
    """Render two pie charts to HTML: user-level distribution and score distribution.

    Args:
        userLevel: Iterable of user levels, one entry per comment.
        score: Iterable of scores, one entry per comment.
    """
    from collections import Counter
    from pyecharts import Pie

    # One counting pass per column; labels keep first-seen order.
    level_counts = Counter(userLevel)
    userLevel_list = list(level_counts)
    userLevel_num = list(level_counts.values())

    score_counts = Counter(score)
    score_list = list(score_counts)
    score_num = list(score_counts.values())

    pie01 = Pie("Hierarchical ring pie chart", title_pos='center', width=900)
    pie01.add(
        "Grade",
        userLevel_list,
        userLevel_num,
        radius=[40, 75],
        label_text_color=None,
        is_label_show=True,
        legend_orient="vertical",
        legend_pos="left",
    )
    # Raw strings so the backslashes in the Windows paths are never read as
    # escape sequences.
    pie01.render(r"H:\PyCoding\spider_maoyan\picture\level_pie.html")
    pie02 = Pie("Scoring rose pie chart", title_pos='center', width=900)
    pie02.add(
        "score",
        score_list,
        score_num,
        center=[50, 50],
        is_random=True,
        radius=[30, 75],
        rosetype="area",
        is_legend_show=False,
        is_label_show=True,
    )
    pie02.render(r"H:\PyCoding\spider_maoyan\picture\score_pie.html")

Topics: JSON encoding Lambda Firefox

Programmer Think

Python Crawls Cat's Eye Information from Sadness Downstream

1. Cat's Eye Movie Criticism Interface

2. Get short reviews

Hot Topics