"Fierce blade in the snow" did you chase it? Why is he so angry. Is there anything you don't know? Python to tell you.

Posted by cry of war on Tue, 04 Jan 2022 14:33:38 +0100

This issue is a crawler and data analysis of the popular online drama "Fierce Blade in the Snow". The crawler collects about 10,000 comments in total, which makes it very suitable for newcomers to practice. A sentiment analysis of the comment text has also been added to the case for your reference.

Crawler: since Tencent's comment data is encapsulated in json, you only need to find the json file and extract and save the required data.

Video website:

https://v.qq.com/x/cover/mzc0020020cyvqh.html

Comment json data website:

https://video.coral.qq.com/varticle/7579013546/comment/v2

Note: as long as you replace the value of the video digital id, you can crawl the comments of other videos

The content of this article is only for the learning and discussion of programming technology. The relevant code and data cannot be used for commercial purposes, otherwise they will bear the consequences.

How do I find the video id?

Find the request through the Network panel of the Chrome developer tools. For details on how to use it, refer to the earlier article: "A necessary tool for crawlers — mastering it solves half the problem".

Project structure:

I Crawler part:

1. Crawling comment content code:

spiders.py

import requests
import re
import random

def get_html(url, params):
    """Fetch *url* with query *params* using a random desktop User-Agent.

    Returns the response body decoded as UTF-8 text.
    Raises requests.HTTPError on a non-2xx status code.
    """
    # Rotate between a few desktop User-Agents to look less like a bot.
    uapools = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14'
    ]
    headers = {"User-Agent": random.choice(uapools)}
    # timeout keeps the crawler from hanging forever on a dead connection
    r = requests.get(url, headers=headers, params=params, timeout=10)
    r.raise_for_status()
    # Force UTF-8 decoding: the API returns UTF-8 JSON, and the original
    # `r.encoding = r.apparent_encoding` was dead code (immediately
    # overwritten), so it has been removed.
    r.encoding = 'utf-8'
    return r.text

def parse_page(infolist, data):
    """Extract comment bodies and the pagination cursor from a raw JSON page.

    The list of '"content"' values is appended to *infolist*; the '"last"'
    cursor (needed to request the following page) is returned.
    """
    comments = re.compile('"content":"(.*?)"', re.S).findall(data)
    cursor = re.compile('"last":"(.*?)"').findall(data)[0]
    infolist.append(comments)
    return cursor


def print_comment_list(infolist):
    """Print every crawled comment to stdout, grouped by page number."""
    for page_no, comments in enumerate(infolist, start=1):
        print('Number' + str(page_no) + 'page\n')
        for comment in comments:
            print(comment + '\n')


def save_to_txt(infolist, path):
    """Write every crawled comment in *infolist* to *path*, one per line.

    Overwrites any existing file; text is written as UTF-8.
    """
    # 'with' guarantees the file is closed even if a write fails
    # (the original opened the file manually and tracked an unused counter).
    with open(path, 'w', encoding='utf-8') as fw:
        for page in infolist:
            for comment in page:
                fw.write(comment + '\n')


def main(page_num=3000):
    """Crawl *page_num* pages of comments and save them to content.txt.

    page_num: number of 10-comment pages to request (default 3000,
    i.e. up to ~30000 comments).
    """
    infolist = []
    vid = '7579013546'  # video id; swap it to crawl another video's comments
    cid = "0"           # pagination cursor; "0" requests the first page
    url = 'https://video.coral.qq.com/varticle/' + vid + '/comment/v2'

    for _ in range(page_num):
        params = {'orinum': '10', 'cursor': cid}
        html = get_html(url, params)
        # each page's response carries the cursor for the next page
        cid = parse_page(infolist, html)

    print_comment_list(infolist)
    save_to_txt(infolist, 'content.txt')


if __name__ == '__main__':
    # guard so importing this module does not kick off a 3000-request crawl
    main()

2. Crawl comment time code

sp.py

import requests
import re
import random


def get_html(url, params):
    """Download *url* with query *params*, returning the body as UTF-8 text."""
    user_agents = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14'
    ]
    headers = {"User-Agent": random.choice(user_agents)}
    response = requests.get(url, headers=headers, params=params)
    response.raise_for_status()
    response.encoding = response.apparent_encoding
    # Force UTF-8 to avoid mojibake in the comment text
    response.encoding = 'utf-8'
    return response.text


def parse_page(infolist, data):
    """Pull comment timestamps and the next-page cursor out of raw JSON text.

    The list of '"time"' values is appended to *infolist*; the '"last"'
    cursor (used to fetch the following page) is returned.
    """
    time_re = re.compile('"time":"(.*?)"', re.S)
    cursor_re = re.compile('"last":"(.*?)"')
    timestamps = time_re.findall(data)
    next_cursor = cursor_re.findall(data)[0]
    infolist.append(timestamps)
    return next_cursor



def print_comment_list(infolist):
    """Dump every crawled page to stdout with a page-number heading."""
    page_no = 0
    while page_no < len(infolist):
        print('Number' + str(page_no + 1) + 'page\n')
        for entry in infolist[page_no]:
            print(entry + '\n')
        page_no += 1


def save_to_txt(infolist, path):
    """Write every crawled value in *infolist* to *path*, one per line (UTF-8).

    Overwrites any existing file.
    """
    # context manager replaces the original manual open/close pair
    with open(path, 'w', encoding='utf-8') as fw:
        for page in infolist:
            fw.writelines(value + '\n' for value in page)


def main(page_num=3000):
    """Crawl *page_num* pages of comment timestamps and save them to time.txt.

    page_num: number of 10-comment pages to request (default 3000).
    """
    infolist = []
    vid = '7579013546'  # video id; change it to crawl another video
    cid = "0"           # pagination cursor; "0" = first page
    url = 'https://video.coral.qq.com/varticle/' + vid + '/comment/v2'

    for _ in range(page_num):
        params = {'orinum': '10', 'cursor': cid}
        html = get_html(url, params)
        # each response carries the cursor for the next page
        cid = parse_page(infolist, html)

    print_comment_list(infolist)
    save_to_txt(infolist, 'time.txt')


if __name__ == '__main__':
    # guard so importing this module does not kick off the crawl
    main()

II Data processing part

1. The timestamp of the comment is converted to the normal time

time.py

# coding=gbk
import csv
import time

# Convert the raw Unix timestamps in time.txt into human-readable
# "YYYY-MM-DD HH:MM:SS" values and write them to data.csv with the date
# and time in two separate columns.
with open("data.csv", 'w', newline='', encoding='utf-8') as csvFile, \
        open("time.txt", 'r', encoding='utf-8') as f:
    writer = csv.writer(csvFile)
    for line in f:
        timestamp = int(line)
        # localtime() interprets the stamp in the machine's local timezone
        formatted = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(timestamp))
        print(formatted)
        # split() yields [date, time] so they land in separate CSV columns
        writer.writerow(formatted.split())

2. Read comments into csv

CD.py

# coding=gbk
import csv

# Copy content.txt into content.csv: each comment line is whitespace-split
# and written as one CSV row.  Context managers replace the original
# manual open/close pairs.
with open("content.csv", 'w', newline='', encoding='utf-8') as csvFile, \
        open("content.txt", 'r', encoding='utf-8') as f:
    writer = csv.writer(csvFile)
    for line in f:
        writer.writerow(line.split())

3. Count the number of comments in each time period of a day

py.py

# coding=gbk
import csv
from collections import Counter

from pyecharts import options as opts
from sympy.combinatorics import Subset
from wordcloud import WordCloud

# Count how many comments fall in each hour of the day.  Column 1 of
# data.csv is "HH:MM:SS"; its first two characters are the hour.
with open('../Spiders/data.csv') as csvfile:
    reader = csv.reader(csvfile)
    data1 = [str(row[1])[0:2] for row in reader]
    print(data1)
print(type(data1))

# Counter tallies (hour -> occurrences) in one O(n) pass, replacing the
# original O(n^2) list.count loop; sorted() orders the pairs by hour.
rst = sorted(Counter(data1).items())
print(type(rst))
print(rst)

with open("time2.csv", "w+", newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter=',')
    writer.writerows(rst)  # one (hour, count) pair per row

# Read the tally back: x = hour labels, y1 = comment counts
with open('time2.csv') as csvfile:
    reader = csv.reader(csvfile)
    x = [str(row[0]) for row in reader]
    print(x)
with open('time2.csv') as csvfile:
    reader = csv.reader(csvfile)
    y1 = [float(row[1]) for row in reader]
    print(y1)

4. Statistics on recent comments

py1.py

# coding=gbk
import csv
from collections import Counter

from pyecharts import options as opts
from sympy.combinatorics import Subset
from wordcloud import WordCloud

# Tally how many comments were posted on each date.  Column 0 of
# data.csv holds the "YYYY-MM-DD" part of every comment's timestamp.
with open('../Spiders/data.csv') as csvfile:
    reader = csv.reader(csvfile)
    data1 = [str(row[0]) for row in reader]
    #print(data1)
print(type(data1))

# Counter replaces the original O(n^2) list.count loop; sorted() orders
# the (date, count) pairs chronologically.
rst = sorted(Counter(data1).items())
print(type(rst))
print(rst)

with open("time1.csv", "w+", newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter=',')
    writer.writerows(rst)  # one (date, count) pair per row

# Read the tally back: x = date labels, y1 = comment counts
with open('time1.csv') as csvfile:
    reader = csv.reader(csvfile)
    x = [str(row[0]) for row in reader]
    print(x)
with open('time1.csv') as csvfile:
    reader = csv.reader(csvfile)
    y1 = [float(row[1]) for row in reader]
    print(y1)

III Data analysis

Data analysis: it involves word cloud chart, bar, broken line and pie chart. The latter three are the analysis of the proportion of comment time and starring. However, Tencent's comment time is displayed in the form of timestamp, so it needs to be converted, and then count the number of occurrences. Finally, an emotional analysis of comment content is added.

1. Make word cloud picture

wc.py

import numpy as np
import re
import jieba
from wordcloud import WordCloud
from matplotlib import pyplot as plt
from PIL import Image

# Build a word cloud from the crawled comments, shaped by the mask wc.jpg.

# 'with' replaces the original manual open/read/close sequence
with open('content.txt', 'r', encoding='utf-8') as f:
    txt = f.read()  # raw comment text, one comment per line

# Strip ASCII letters, digits and basic punctuation so only the Chinese
# comment text feeds the cloud (raw string avoids escape warnings).
newtxt = re.sub(r"[A-Za-z0-9\!\%\[\]\,\. ]", "", txt)
print(newtxt)
words = jieba.lcut(newtxt)  # jieba segmentation (kept for inspection; cloud is built from newtxt)

img = Image.open(r'wc.jpg')  # mask image that gives the cloud its shape
img_array = np.array(img)

wordcloud = WordCloud(
    background_color="white",
    width=1080,
    height=960,
    font_path="../Wen Yuexin youth.otf",  # a CJK-capable font is required for Chinese text
    max_words=150,
    scale=10,  # render resolution multiplier
    max_font_size=100,
    mask=img_array,
    collocations=False,  # suppress duplicated bigrams in the cloud
).generate(newtxt)

plt.imshow(wordcloud)
plt.axis('off')
plt.show()
wordcloud.to_file('wc.png')

Outline image: wc.jpg

 

Word cloud: result.png (note: English letters are filtered out here)

2. Make a bar chart of recent comments

DrawBar.py

# encoding: utf-8
import csv
import pyecharts.options as opts
from pyecharts.charts import Bar
from pyecharts.globals import ThemeType


class DrawBar(object):
    """Render a bar chart of recent daily comment counts (data from time1.csv)."""

    def __init__(self):
        """Create the Bar instance and set its width, height and theme."""
        self.bar = Bar(init_opts=opts.InitOpts(width='1500px', height='700px', theme=ThemeType.LIGHT))

    def add_x(self):
        """Read the date labels (column 0 of time1.csv) and add them as the X axis."""
        with open('time1.csv') as csvfile:
            reader = csv.reader(csvfile)
            x = [str(row[0]) for row in reader]
            print(x)
        self.bar.add_xaxis(xaxis_data=x)

    def add_y(self):
        """Read the comment counts (column 1 of time1.csv) and add them as a Y series."""
        # NOTE: the original placed this description as a stray string
        # statement in the middle of the function body; it is a docstring now.
        with open('time1.csv') as csvfile:
            reader = csv.reader(csvfile)
            y1 = [float(row[1]) for row in reader]
            print(y1)
        self.bar.add_yaxis(
            series_name="Number of comments",
            y_axis=y1,
            label_opts=opts.LabelOpts(is_show=True, color="black"),  # value labels on bars
            bar_max_width='100px',  # cap the column width
        )

    def set_global(self):
        """Apply title, tooltip and toolbox options to the chart."""
        self.bar.set_global_opts(
            title_opts=opts.TitleOpts(
                title='Recent comments and statistics of fierce sabre in the snow',
                title_textstyle_opts=opts.TextStyleOpts(font_size=35),
            ),
            tooltip_opts=opts.TooltipOpts(
                is_show=True,
                trigger="axis",             # tooltip follows the mouse along the X axis
                axis_pointer_type="cross",  # crosshair pointer
            ),
            toolbox_opts=opts.ToolboxOpts(),  # default toolbox (all tools enabled)
        )

    def draw(self):
        """Assemble the chart and render it to ../Html/DrawBar.html."""
        self.add_x()
        self.add_y()
        self.set_global()
        self.bar.render('../Html/DrawBar.html')

    def run(self):
        """Entry point: draw the chart."""
        self.draw()



if __name__ == '__main__':
    # Both construction and run belong inside the guard: the original
    # called app.run() at module level, which would raise NameError on
    # import because `app` was only defined when run as a script.
    app = DrawBar()
    app.run()

Effect picture: DrawBar.html

3. Make an hourly comment bar chart

DrawBar2.py

# encoding: utf-8
# encoding: utf-8
import csv
import pyecharts.options as opts
from pyecharts.charts import Bar
from pyecharts.globals import ThemeType


class DrawBar(object):
    """Render a bar chart of comment counts per hour of day (data from time2.csv)."""

    def __init__(self):
        """Create the Bar instance and set its width, height and theme."""
        self.bar = Bar(init_opts=opts.InitOpts(width='1500px', height='700px', theme=ThemeType.MACARONS))

    def add_x(self):
        """Read hour labels (column 0 of time2.csv), suffix them with 'spot', and add them as the X axis."""
        str_name1 = 'spot'
        with open('time2.csv') as csvfile:
            reader = csv.reader(csvfile)
            x = [str(row[0] + str_name1) for row in reader]
            print(x)
        self.bar.add_xaxis(xaxis_data=x)

    def add_y(self):
        """Read the comment counts (column 1 of time2.csv) and add them as a Y series."""
        # NOTE: the original placed this description as a stray string
        # statement in the middle of the function body; it is a docstring now.
        with open('time2.csv') as csvfile:
            reader = csv.reader(csvfile)
            y1 = [int(row[1]) for row in reader]
            print(y1)
        self.bar.add_yaxis(
            series_name="Number of comments",
            y_axis=y1,
            label_opts=opts.LabelOpts(is_show=False),  # hide per-bar value labels
            bar_max_width='50px',  # cap the column width
        )

    def set_global(self):
        """Apply title, tooltip and toolbox options to the chart."""
        self.bar.set_global_opts(
            title_opts=opts.TitleOpts(
                title='Comments and statistics of fierce sabre in snow at each time period',
                title_textstyle_opts=opts.TextStyleOpts(font_size=35),
            ),
            tooltip_opts=opts.TooltipOpts(
                is_show=True,
                trigger="axis",             # tooltip follows the mouse along the X axis
                axis_pointer_type="cross",  # crosshair pointer
            ),
            toolbox_opts=opts.ToolboxOpts(),  # default toolbox (all tools enabled)
        )

    def draw(self):
        """Assemble the chart and render it to ../Html/DrawBar2.html."""
        self.add_x()
        self.add_y()
        self.set_global()
        self.bar.render('../Html/DrawBar2.html')

    def run(self):
        """Entry point: draw the chart."""
        self.draw()

if __name__ == '__main__':
    # Both construction and run belong inside the guard: the original
    # called app.run() at module level, which would raise NameError on
    # import because `app` was only defined when run as a script.
    app = DrawBar()
    app.run()

Effect picture: DrawBar2.html

4. Make a pie chart of recent comments

pie_pyecharts.py

import csv
from pyecharts import options as opts
from pyecharts.charts import Pie
from random import randint
from pyecharts.globals import ThemeType

# Load (date, count) pairs from time1.csv in a single pass instead of
# opening the file once per column as the original did.
x, y1 = [], []
with open('time1.csv') as csvfile:
    for row in csv.reader(csvfile):
        x.append(str(row[0]))
        y1.append(float(row[1]))
print(x)
print(y1)

num = y1  # slice sizes (comment counts)
lab = x   # slice labels (dates)
(
    Pie(init_opts=opts.InitOpts(width='1700px', height='450px', theme=ThemeType.LIGHT))  # default is 900x600
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="Recent comments and statistics of fierce sabre in the snow",
            title_textstyle_opts=opts.TextStyleOpts(font_size=27)),
        legend_opts=opts.LegendOpts(
            pos_top="10%", pos_left="1%",  # legend position
        ),
    )
    .add(series_name='', center=[280, 270], data_pair=[(j, i) for i, j in zip(num, lab)])  # pie chart
    .add(series_name='', center=[845, 270], data_pair=[(j, i) for i, j in zip(num, lab)], radius=['40%', '75%'])  # ring chart
    .add(series_name='', center=[1380, 270], data_pair=[(j, i) for i, j in zip(num, lab)], rosetype='radius')  # Nightingale rose
).render('pie_pyecharts.html')

design sketch

5. Make an hourly comment pie chart

pie_pyecharts2.py

import csv
from pyecharts import options as opts
from pyecharts.charts import Pie
from random import randint
from pyecharts.globals import ThemeType

# Load (hour, count) pairs from time2.csv in a single pass instead of
# opening the file once per column as the original did.
str_name1 = 'spot'
x, y1 = [], []
with open('time2.csv') as csvfile:
    for row in csv.reader(csvfile):
        x.append(str(row[0] + str_name1))  # e.g. "08" -> "08spot"
        y1.append(int(row[1]))
print(x)
print(y1)

num = y1  # slice sizes (comment counts)
lab = x   # slice labels (hours)
(
    Pie(init_opts=opts.InitOpts(width='1650px', height='500px', theme=ThemeType.LIGHT,))  # default is 900x600
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="Hourly comment statistics of fierce sabre in snow",
            title_textstyle_opts=opts.TextStyleOpts(font_size=27)),
        legend_opts=opts.LegendOpts(
            pos_top="8%", pos_left="4%",  # legend position
        ),
    )
    .add(series_name='', center=[250, 300], data_pair=[(j, i) for i, j in zip(num, lab)])  # pie chart
    .add(series_name='', center=[810, 300], data_pair=[(j, i) for i, j in zip(num, lab)], radius=['40%', '75%'])  # ring chart
    .add(series_name='', center=[1350, 300], data_pair=[(j, i) for i, j in zip(num, lab)], rosetype='radius')  # Nightingale rose
).render('pie_pyecharts2.html')

design sketch

6. Make a comment statistical pie chart for viewing time interval

pie_pyecharts3.py

# coding=gbk
import csv
from collections import Counter

from pyecharts import options as opts
from pyecharts.globals import ThemeType
from sympy.combinatorics import Subset
from wordcloud import WordCloud
from pyecharts.charts import Pie
from random import randint

# NOTE(review): the original source was mangled here -- the import line and
# the following with-statement were fused into
# "from random import randintwith open(/data.csv') as csvfile:", which does
# not parse.  Reconstructed as an import plus a with-statement reading the
# same ../Spiders/data.csv used by the sibling scripts.
with open('../Spiders/data.csv') as csvfile:
    reader = csv.reader(csvfile)
    # column 1 is "HH:MM:SS"; keep the hour as an int
    data2 = [int(row[1].strip('')[0:2]) for row in reader]
print(type(data2))

# (hour, count) pairs sorted by hour.  Counter replaces the original
# O(n^2) list.count loop, and the name no longer shadows builtin list().
hour_counts = sorted(Counter(data2).items())
print(type(hour_counts))

with open("time2.csv", "w+", newline='', encoding='utf-8') as f:
    writer = csv.writer(f, delimiter=',')
    writer.writerows(hour_counts)

# Preview the day split into 4 equal buckets of (hour, count) pairs.
m = int(len(hour_counts) / 4)
buckets = [hour_counts[i:i + m] for i in range(0, len(hour_counts), m)]
print("before dawn : ", buckets[0])
print("morning : ", buckets[1])
print("afternoon : ", buckets[2])
print("night : ", buckets[3])

with open('time2.csv') as csvfile:
    reader = csv.reader(csvfile)
    y1 = [int(row[1]) for row in reader]
    print(y1)

# Sum every 6 hourly counts into one value per time-of-day bucket.  The
# original loop shadowed its own variables and discarded the sums it
# computed; this version actually produces the four bucket totals.
n = 6
groups = [y1[i:i + n] for i in range(0, len(y1), n)]
print(groups)

x = ['before dawn', 'morning', 'afternoon', 'night']
num = [sum(group) for group in groups]  # slice sizes (comments per bucket)
lab = x                                 # slice labels
(
    Pie(init_opts=opts.InitOpts(width='1500px', height='450px', theme=ThemeType.LIGHT))  # default is 900x600
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="Comment statistics of viewing time interval of fierce knife in snow",
            title_textstyle_opts=opts.TextStyleOpts(font_size=30)),
        legend_opts=opts.LegendOpts(
            pos_top="8%",  # legend position
        ),
    )
    .add(series_name='', center=[260, 270], data_pair=[(j, i) for i, j in zip(num, lab)])  # pie chart
    .add(series_name='', center=[1230, 270], data_pair=[(j, i) for i, j in zip(num, lab)], radius=['40%', '75%'])  # ring chart
    .add(series_name='', center=[750, 270], data_pair=[(j, i) for i, j in zip(num, lab)], rosetype='radius')  # Nightingale rose
).render('pie_pyecharts3.html')

design sketch

7. Make the pie chart of the proportion of fierce knife in the snow

pie_pyecharts4.py

import csv
from pyecharts import options as opts
from pyecharts.charts import Pie
from random import randint
from pyecharts.globals import ThemeType

# Count how often each lead actor is mentioned in the crawled comments
# and draw the shares as pie / ring / rose charts.
with open('content.txt', 'r', encoding='utf-8') as f:  # 'with' replaces manual open/close
    words = f.read()

name = ["Zhang Ruoyun", "Li Gengxi", "Hu Jun"]
print(name)
# one substring count per actor, as floats for pyecharts
count = [float(words.count(actor)) for actor in name]
print(count)

num = count  # slice sizes (mention counts)
lab = name   # slice labels (actor names)
(
    Pie(init_opts=opts.InitOpts(width='1650px', height='450px', theme=ThemeType.LIGHT))  # default is 900x600
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="Proportion of fierce knife in snow",
            title_textstyle_opts=opts.TextStyleOpts(font_size=27)),
        legend_opts=opts.LegendOpts(
            pos_top="3%", pos_left="33%",  # legend position
        ),
    )
    .add(series_name='', center=[280, 270], data_pair=[(j, i) for i, j in zip(num, lab)])  # pie chart
    .add(series_name='', center=[800, 270], data_pair=[(j, i) for i, j in zip(num, lab)], radius=['40%', '75%'])  # ring chart
    .add(series_name='', center=[1300, 270], data_pair=[(j, i) for i, j in zip(num, lab)], rosetype='radius')  # Nightingale rose
).render('pie_pyecharts4.html')

design sketch

8. Emotional analysis of comment content

SnowNLP.py

import numpy as np
from snownlp import SnowNLP
import matplotlib.pyplot as plt

# Score each comment's sentiment with SnowNLP and plot the distribution.
# SnowNLP returns a probability in [0, 1]: closer to 1 = more positive,
# closer to 0 = more negative.
with open('content.txt', 'r', encoding='UTF-8') as f:  # 'with' replaces manual open/close
    comments = f.readlines()  # renamed: the original shadowed builtin `list`

sentimentslist = []
for comment in comments:
    score = SnowNLP(comment).sentiments
    print(score)
    sentimentslist.append(score)

plt.hist(sentimentslist, bins=np.arange(0, 1, 0.01), facecolor='g')
plt.xlabel('Sentiments Probability')
plt.ylabel('Quantity')
plt.title('Analysis of Sentiments')
plt.show()
 

Effect picture (frequency of each score segment of emotion)

SnowNLP emotion analysis is based on the emotion dictionary. It simply divides the text into two categories: positive and negative. The return value is the probability of emotion, that is, the emotion score is between [0,1]. The closer it is to 1, the more positive it is. The closer it is to 0, the more negative it is.

Complete video source code here

summary

The above is how to obtain Tencent video comments and organize them for visual analysis. It belongs to the routine operation of data analysis and visualization. There is no particularly advanced technology. It is also more suitable for beginners of data analysis. Readers can try it by themselves. All codes have been given in this paper.

Topics: Python Programmer crawler