Crawler processes and Python third-party library usage

Posted by cac_azure03 on Sat, 15 Jun 2019 18:38:15 +0200

Usage of third-party libraries such as requests, pymongo and bs4 (BeautifulSoup)

from __future__ import print_function
print does not need parentheses in Python 2.x, but it does in Python 3.x. With this line at the very top of the file,
print must be called with parentheses even under Python 2.x, exactly as in Python 3.x.

import requests

If requests is not installed yet, see https://pip.pypa.io/en/stable/installing/

         The first two commands on that page download and install pip; then install requests with: pip install requests
                     requests is used to send an HTTP request and fetch the source code of a web page

from bs4 import BeautifulSoup

Install it with: pip install bs4. BeautifulSoup, provided by the bs4 package, is one of Python's third-party libraries.

The BeautifulSoup library is used to parse HTML code; it helps you locate the information you need more easily through tags.

import pymongo
# Install the MongoDB database from source and install pymongo with pip. pymongo is the third-party Python library
# (the driver) for connecting to MongoDB; it is written in Python and lets Python programs use a MongoDB database.

import json
# JSON is a lightweight text data-interchange format: a syntax for storing and exchanging text information.

Install the database

1. Install MongoDB from the source package: download https://fastdl.mongodb.org/linux/mongodb-linux-x86_64-rhel70-3.2.5.tgz, unpack it and place it in /usr/local (the commands below assume it ends up at /usr/local/mongodb)
2. mkdir -p /data/db
3. cd /usr/local/mongodb/bin
./mongod &
./mongo
exit    (leaves the mongo shell)

View the contents of the database:
cd /usr/local/mongodb/bin
./mongo
show dbs

Database: iaaf
use iaaf
show collections
db.athletes.find()

Crawler processes

Step 1: Extract HTML information from your website

    #Required web address

# Ranking pages to crawl; '{}' is replaced by the page number.
url = 'https://www.iaaf.org/records/toplists/jumps/long-jump/outdoor/men/senior/2018?regionType=world&windReading=regular&page={}&bestResultsOnly=true'

# Send a browser-like User-Agent so the request looks like it comes from a browser.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15'}

# (dictionary key, index of the <td> cell it is read from); index 7 is not used.
fields = (
    ('Rank', 0),
    ('Mark', 1),
    ('WIND', 2),
    ('Competitior', 3),
    ('DOB', 4),
    ('Nat', 5),
    ('Pos', 6),
    ('Venue', 8),
    ('Date', 9),
)

# NOTE(review): the original listing is cut off right after the body of its
# `try:`.  The row-skipping and the insert into `db.athletes` below are
# reconstructed from the listing's own comments and from the later steps,
# which read their input from `db.athletes` -- confirm against the real script.
db = pymongo.MongoClient().iaaf

for page in range(1, 23):
    res = requests.get(url.format(page), headers=headers)
    print(page)
    # Parse the page source with BeautifulSoup.
    soup = BeautifulSoup(res.text, 'html.parser')
    record_tables = soup.find_all('table', class_='records-table')
    if len(record_tables) < 3:
        # The code reads the third 'records-table'; skip pages that lack it
        # instead of crashing with an IndexError.
        continue

    # One <tr> per ranking row.  A separate loop variable is used so the page
    # counter is no longer overwritten by the row loop.
    for row in record_tables[2].find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < 10:
            # Too few cells to be a data row (presumably a header row).
            continue

        j_data = {}
        for key, index in fields:
            j_data[key] = cells[index].get_text().strip()

        # The competitor cell carries the link to the athlete's profile page.
        # Rows without a link are still stored, just without 'href'.
        link = cells[3].find('a')
        if link is not None and link.get('href'):
            j_data['href'] = link['href']

        db.athletes.insert_one(j_data)
            #Save the data you want in a dictionary

Step 2: Extract the information we want from HTML

#!/usr/bin/env python
#encoding=utf-8

from __future__ import print_function
import requests
from bs4 import BeautifulSoup as bs

def _parse_progression_rows(tbody, columns):
    """Turn every <tr> of *tbody* into a dict keyed by *columns* (in cell order).

    Returns an empty list when *tbody* is None.  Rows with fewer cells than
    *columns* are skipped rather than aborting the whole table.
    """
    rows = []
    if tbody is None:
        return rows
    for tr in tbody.find_all('tr'):
        cells = tr.find_all('td')
        if len(cells) < len(columns):
            continue
        # Keep the text as returned by BeautifulSoup: wrapping it in str()
        # can raise UnicodeEncodeError on non-ASCII text under Python 2.
        rows.append({name: cell.get_text().strip()
                     for name, cell in zip(columns, cells)})
    return rows


def long_jump(url):
    """Fetch an athlete's profile page and extract the long-jump progression.

    Args:
        url: Absolute URL of the athlete's profile page.  (The original
            listing overwrote this argument with one hard-coded athlete, so
            every call returned the same data.)

    Returns:
        A tuple ``(outdoor, indoor)`` of lists of dicts.  Outdoor entries
        have the keys 'Year', 'Performance', 'Wind', 'Place' and 'Date';
        indoor entries have the same keys without 'Wind'.  A list is empty
        when the corresponding table is not found on the page.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15'}
    res = requests.get(url, headers=headers)
    soup = bs(res.text, 'html.parser')

    div = soup.find('div', id='progression')
    if div is None:
        return [], []

    # Collect the table body that belongs to every "Long Jump" heading.
    tbody_l = []
    for h2 in div.find_all('h2'):
        text = h2.get_text().strip()
        if "Long Jump" in text and "View Graph" in text:
            table = h2.parent.parent.table
            # Keep the position even when the table is missing, so that a
            # later table is not mistaken for an earlier one.
            tbody_l.append(table.tbody if table is not None else None)

    # As in the original code, the first match is treated as the outdoor
    # table and the second as the indoor table -- TODO confirm this ordering
    # holds for athletes who only have indoor results.
    tbody_out = tbody_l[0] if len(tbody_l) > 0 else None
    tbody_in = tbody_l[1] if len(tbody_l) > 1 else None

    # Each table is parsed on its own: a missing indoor table no longer
    # discards the outdoor results (the old bare `except: pass` did).
    outdoor = _parse_progression_rows(
        tbody_out, ('Year', 'Performance', 'Wind', 'Place', 'Date'))
    indoor = _parse_progression_rows(
        tbody_in, ('Year', 'Performance', 'Place', 'Date'))
    return outdoor, indoor
# Manual smoke test: fetch the progression of a single athlete.
if __name__ == '__main__':
    long_jump('https://www.iaaf.org/athletes/cuba/juan-miguel-echevarria-294120')

After getting the HTML code for the whole page, we need to extract the athlete's long jump data from it.

Step 3: Store the extracted data in a database

#!/usr/bin/env python
#coding=utf-8

from __future__ import print_function
import pymongo
import requests
from bs4 import BeautifulSoup
import json
from long_jump import *

db = pymongo.MongoClient().iaaf
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15'}

def get_href():
    """Fetch career data for every stored athlete and save it on the record.

    Reads each document of ``db.athletes``, builds the profile URL from its
    'href' field, calls ``long_jump`` on it and writes the result back to the
    same document under the 'outdoor' and 'indoor' fields.  Documents without
    an 'href' get two empty lists.  Prints a running counter as progress.
    """
    for count, athlete in enumerate(db.athletes.find()):
        print(count)
        href = athlete.get('href')
        outdoor, indoor = [], []
        if href is not None:
            outdoor, indoor = long_jump('https://www.iaaf.org' + str(href))

        # The document is addressed by its _id.  update_one replaces the
        # deprecated Collection.update, which PyMongo 4 removed.
        db.athletes.update_one(
            {'_id': athlete.get('_id')},
            {"$set": {"outdoor": outdoor, "indoor": indoor}},
        )

def get_progression():
    """Placeholder; not implemented, does nothing and returns None."""
    pass

# Run the enrichment step only when this file is executed as a script.
if __name__ == '__main__':
    get_href()

Step 4: Write database contents to excel according to country

#!/usr/bin/env python
#coding=utf-8

from __future__ import print_function
import xlwt
import pymongo

def write_into_xls(cursor):
    """Export the athletes stored in MongoDB to ``iaaf.xls``, grouped by country code.

    Each athlete occupies as many sheet rows as the longer of its 'outdoor'
    and 'indoor' lists (at least one).  The general columns are written on
    the first of those rows; the progression entries are written one per row
    next to them.

    Args:
        cursor: Unused.  Kept so existing callers keep working; the function
            queries ``db.athletes`` itself, once per country code.
    """
    title = ['Rank','Mark','age','Competitior','DOB','Nat','country','Venue','Date','out_year','out_performance','out_wind','out_place','out_date','in_year','in_performance','in_place','in_date']

    # Record fields written to columns 0-8, in column order.
    # NOTE(review): 'age' and 'country' are not among the fields stored by the
    # crawling step shown in this article (it stores 'WIND' and 'Pos'), so
    # these two columns stay empty unless another step adds them -- confirm.
    general_keys = ['Rank', 'Mark', 'age', 'Competitior', 'DOB', 'Nat', 'country', 'Venue', 'Date']
    outdoor_keys = ['Year', 'Performance', 'Wind', 'Place', 'Date']   # columns 9-13
    indoor_keys = ['Year', 'Performance', 'Place', 'Date']            # columns 14-17

    book = xlwt.Workbook(encoding='utf-8',style_compression=0)
    sheet = book.add_sheet('iaaf',cell_overwrite_ok=True)

    # Header row.
    for col, name in enumerate(title):
        sheet.write(0, col, name)

    row = 1   # next free sheet row; row 0 holds the header
    db = pymongo.MongoClient().iaaf
    # Country codes, in the order in which they appear in the sheet.
    country_l = ['CUB', 'RSA', 'CHN', 'USA', 'RUS', 'AUS', 'CZE', 'URU', 'GRE', 'JAM', 'TTO', 'UKR', 'GER', 'IND', 'BRA', 'GBR', 'CAN', 'SRI', 'FRA', 'NGR', 'POL', 'SWE', 'JPN', 'INA', 'GUY', 'TKS', 'KOR', 'TPE', 'BER', 'MAR', 'ALG', 'ESP', 'SUI', 'EST', 'SRB', 'BEL', 'ITA', 'NED', 'FIN', 'CHI', 'BUL', 'CRO', 'ALB', 'KEN', 'POR', 'BAR', 'DEN', 'PER', 'ROU', 'MAS', 'CMR', 'TUR', 'PHI', 'HUN', 'VEN', 'HKG', 'PAN', 'BLR', 'MEX', 'LAT', 'GHA', 'MRI', 'IRL', 'ISV', 'BAH', 'KUW', 'NOR', 'SKN', 'UZB', 'BOT', 'AUT', 'PUR', 'DMA', 'KAZ', 'ARM', 'BEN', 'DOM', 'CIV', 'LUX', 'COL', 'ANA', 'MLT', 'SVK', 'THA', 'MNT', 'ISR', 'LTU', 'VIE', 'IRQ', 'NCA', 'ARU', 'KSA', 'ZIM', 'SLO', 'ECU', 'SYR', 'TUN', 'ARG', 'ZAM', 'SLE', 'BUR', 'NZL', 'AZE', 'GRN', 'OMA', 'CYP', 'GUA', 'ISL', 'SUR', 'TAN', 'GEO', 'BOL', 'ANG', 'QAT', 'TJK', 'MDA', 'MAC']

    for nat in country_l:
        # A distinct loop variable is used for the records so the country
        # code is no longer shadowed by the inner loop.
        for athlete in db.athletes.find({'Nat': nat}):
            print(athlete)
            # Records that were never enriched have no progression lists;
            # treat them as empty instead of raising KeyError.
            outdoor = athlete.get('outdoor') or []
            indoor = athlete.get('indoor') or []

            for col, key in enumerate(general_keys):
                sheet.write(row, col, athlete.get(key))

            for offset, entry in enumerate(outdoor):
                for col, key in enumerate(outdoor_keys, 9):
                    sheet.write(row + offset, col, entry[key])

            for offset, entry in enumerate(indoor):
                for col, key in enumerate(indoor_keys, 14):
                    sheet.write(row + offset, col, entry[key])

            # Advance past all rows used by this athlete (always at least one).
            row += max(len(outdoor), len(indoor), 1)

    book.save(r'iaaf.xls')

# Start fetching data from the database from the first row  

# Run the export only when this file is executed as a script.
if __name__ == '__main__':
    write_into_xls(cursor=None)

After running the above code, what we get is

Topics: Programming MongoDB Database Python pip