A Python script for parsing questions from an online test-question site (web crawler)

Posted by jon2396 on Wed, 01 Apr 2020 20:24:39 +0200

Preface

Writing a web crawler in Python is common practice. The principle is to download a web page, clean the data with regular expressions, and extract the target resources, which can be text, pictures, or other URLs. The extracted resources are then stored by category. This article covers only simple text extraction.

Main text

The code is written for Python 2.7 and has been tested. The results of running it are shown below.

# -*- coding: UTF-8 -*-
import urllib2
import re

def _read_url(url):
    """Return the body of ``url``, closing the connection even on error."""
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


def ppkao_getanwser(questionURL):
    """Scrape a question page and its answer page, then print the result.

    The page at ``questionURL`` is downloaded and picked apart with regular
    expressions. The answer page it references is downloaded next and the
    answer is extracted from it. Nothing is returned: every scraped field
    is printed to standard output as one labelled line.

    Args:
        questionURL: URL of the question page to download.

    Raises:
        AttributeError: if an expected pattern is missing from a downloaded
            page (``re.search`` returned ``None``).
        urllib2.URLError: if one of the pages cannot be downloaded.
    """
    questionPage = _read_url(questionURL)

    # Parse the question page.
    # A single search supplies both the question-bank link (group 2) and
    # the question-bank name (group 3).
    bankMatch = re.search(
        r'The following questions are from:(.*?)<a target="_blank" href="(.*?)" >(.*?)</a>',
        questionPage, re.S)
    questionTypeURL = bankMatch.group(2)
    questionType = bankMatch.group(3)
    questionTypeID = re.search(
        r"ViewAnswers\('//(.*?)','(\d+)'", questionPage, re.S).group(2)
    questionModel = re.search(
        r'<i class="tx">(.*?)</i>', questionPage, re.S).group(1)
    questionName = re.search(
        r'<strong class="(.*?)">(.*?)</strong>', questionPage, re.S).group(2)

    # Short-answer questions have no list of options to extract.
    isShortAnswer = questionModel == 'Short answer'
    if isShortAnswer:
        questionItems = []
    else:
        optionsHTML = re.search(
            r'</strong>(.*?)</p>', questionPage, re.S).group(1)
        # Strip markup and whitespace noise, then split on the line breaks
        # that separate the individual options.
        questionItems = (optionsHTML.replace('<p>', '')
                         .replace('\t', '')
                         .replace('\r\n', '')
                         .replace(',<br />', '<br />')
                         .split('<br />'))

    # Get the answer. The page embeds the answer address without a scheme
    # ('//host/path'), so 'https://' is prepended before fetching it.
    anwserURL = re.search(
        r"ViewAnswers\('//(.*?)',", questionPage, re.S).group(1)
    anwserLink = 'https://' + anwserURL
    anwserPage = _read_url(anwserLink)
    if isShortAnswer:
        questionAnwser = re.search(
            r'<i class="target">Question answer</i>(.*?)<span>(.*?)<p>(.*?)</p>',
            anwserPage, re.S).group(3).strip()
    else:
        questionAnwser = re.search(
            r'<em class="ture-answer">(.*?)</em>',
            anwserPage, re.S).group(1).strip()

    # Display the scraped fields, one "label value" line each.
    report = [
        ('Question bank:', questionType),
        ('Item bank identification:', questionTypeID),
        ('Question bank link:', questionTypeURL),
        ('Title Link:', questionURL),
        ('Question type:', questionModel),
        ('Title:', questionName),
        ('Topic:', ','.join(questionItems)),
        ('Answer link:', anwserLink),
        ('Answer:', questionAnwser),
    ]
    for label, value in report:
        # A single pre-formatted argument prints the same text as the
        # original "print label, value" statement.
        print('%s %s' % (label, value))

Test results

# Run the scraper on two sample question pages, in order.
for sampleURL in (
        "https://www.ppkao.com/tiku/shiti/1594015.html",
        "https://www.ppkao.com/tiku/shiti/1594014.html",
):
    ppkao_getanwser(sampleURL)

Topics: Python