Python crawler: scraping the 8684 bus-query website with requests + XPath

Posted by renojim on Thu, 13 Feb 2020 20:39:33 +0100

1. Analyzing the website

url =  'http://xian.8684.cn/'

1. XPath for the second-level navigation links:

# Find all links starting with a number
    number_href_list = tree.xpath('//div[@class="list"][1]/a/@href')
    # Find all links that start with a letter
    char_href_list = tree.xpath('//div[@class="list"][2]/a/@href')

2. XPath for the individual route links:

 route_list = tree.xpath('//div[@class="list clearfix"]/a/@href')

3. Extract the content to be crawled:

3.1 Get the bus line name:

 bus_number = tree.xpath('//div[@class="info"]/h1/text()')[0]

3.2 get running time:

run_time = tree.xpath('//ul[@class="bus-desc"]/li[1]/text()')[0]

3.3 get update time:

    laster_time = tree.xpath('//ul[@class="bus-desc"]/li[4]/text()')[0]

3.4 Get the number of uplink stations (the containing div is the 4th or the 5th, depending on the page layout):

    up_total = tree.xpath('//div[@class="layout-left"]/div[4]/div/div[@class="total"]/text()')[0]
    # or
    up_total = tree.xpath('//div[@class="layout-left"]/div[5]/div/div[@class="total"]/text()')[0]

3.5 Get the names of all uplink stations (the whole result list is kept, so there is no `[0]`):

    up_route = tree.xpath('//div[@class="layout-left"]/div[5]/ol/li/a/text()')
    # or
    up_route = tree.xpath('//div[@class="layout-left"]/div[6]/ol/li/a/text()')

3.6 Get the number of downlink stations (some routes are closed loops with no separate uplink and downlink, in which case the value is left empty):

  down_total = tree.xpath('//div[@class="layout-left"]/div[6]/div/div[@class="total"]/text()')[0]
  # or
  down_total = tree.xpath('//div[@class="layout-left"]/div[7]/div/div[@class="total"]/text()')[0]
  # or
   down_total = ''

3.7 Get the names of all downlink stations (some routes are closed loops with no separate uplink and downlink, in which case the value is left empty):

    down_route = tree.xpath('//div[@class="layout-left"]/div[7]/ol/li/a/text()')
    # or
    down_route = tree.xpath('//div[@class="layout-left"]/div[8]/ol/li/a/text()')
    # or
    down_route = ''

2. Code implementation

import requests
from lxml import etree
# Accumulates one dict per crawled bus line; filled by parse_third_route and
# dumped to disk by parse_second.
items = list()
# Request headers sent with every HTTP call: a desktop Chrome User-Agent.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/74.0.3729.131 Safari/537.36'
    ),
}


# Navigation page crawling (starts with 1, 2, 3...)
def parse_navigation():
    """Fetch the site's home page and collect its navigation links.

    Returns:
        list: the ``href`` values found under the first child ``div``
        (links starting with a number) followed by those under the second
        child ``div`` (links starting with a letter) of the
        ``bus-layer depth w120`` container.

    Raises:
        requests.exceptions.RequestException: if the request fails or the
            server does not answer within the timeout.
    """
    url = 'https://foshan.8684.cn/'
    # requests applies no timeout by default, so without one a stalled
    # connection would block the crawler forever.
    r = requests.get(url, headers=headers, timeout=10)
    # Parse content, get all navigation links
    tree = etree.HTML(r.text)
    # Find all links starting with a number
    number_href_list = tree.xpath('//div[@class="bus-layer depth w120"]/div[1]/div/a/@href')
    # Find all links that start with a letter
    char_href_list = tree.xpath('//div[@class="bus-layer depth w120"]/div[2]/div/a/@href')
    # Return all links as a single list, numbers first
    return number_href_list + char_href_list


def parse_second_route(content):
    """Crawl every bus line linked from one navigation page.

    Each ``<a>`` inside the ``list clearfix`` container is treated as one
    bus line: its page is downloaded and passed to ``parse_third_route``.

    Args:
        content: HTML text of the navigation page.

    Raises:
        requests.exceptions.RequestException: if a line page cannot be
            fetched within the timeout.
    """
    tree = etree.HTML(content)
    # Read the href and the label from the same <a> element. Fetching them as
    # two separate lists and pairing them by position goes out of step (or
    # raises IndexError) as soon as one anchor has no text or several text nodes.
    for anchor in tree.xpath('//div[@class="list clearfix"]/a'):
        href = anchor.get('href')
        if href is None:
            # An anchor without href has no page to crawl.
            continue
        name = ''.join(anchor.xpath('text()'))
        print('Start crawling%s line······' % name)
        # hrefs are appended to the site root to build the request URL
        route = 'https://foshan.8684.cn' + href
        # No default timeout in requests; avoid hanging on a dead connection.
        r = requests.get(url=route, headers=headers, timeout=10)
        # Analyze the content and obtain the detailed information of each bus
        parse_third_route(r.text)
        print('End crawl%s line······' % name)


def parse_third_route(content):
    """Extract one bus line's details from its page and append them to ``items``.

    The route sections sit in consecutive child ``div`` elements of the
    ``layout-left`` container. They start at the 5th ``div`` when that
    ``div`` has the class ``bus-excerpt mb15`` and at the 4th otherwise.
    From that starting index the order is: uplink total, uplink stations,
    downlink total, downlink stations.

    Args:
        content: HTML text of the bus line page.

    Raises:
        IndexError: if the line name, running time, fare, update time or
            uplink total cannot be found on the page.
    """
    tree = etree.HTML(content)
    # -------------------Get content in turn--------------------
    # Bus line name
    bus_number = tree.xpath('//div[@class="info"]/h1/text()')[0]
    # Running time
    run_time = tree.xpath('//ul[@class="bus-desc"]/li[1]/text()')[0]
    # Ticket price information
    ticket_info = tree.xpath('//ul[@class="bus-desc"]/li[2]/text()')[0]
    # Update time
    laster_time = tree.xpath('//ul[@class="bus-desc"]/li[4]/text()')[0]

    left = '//div[@class="layout-left"]'
    # Probe the 5th div to find where the route sections start. A missing
    # div or class attribute selects the 4th-div layout instead of raising.
    probe = tree.xpath(left + '/div[5]/@class')
    first = 5 if probe and probe[0] == 'bus-excerpt mb15' else 4

    # Uplink: total, then the names of all stations
    up_total = tree.xpath('%s/div[%d]/div/div[@class="total"]/text()' % (left, first))[0]
    up_route = tree.xpath('%s/div[%d]/ol/li/a/text()' % (left, first + 1))

    # Downlink: the page may have no downlink section. An explicit emptiness
    # check replaces the former blanket ``except Exception``.
    down_totals = tree.xpath('%s/div[%d]/div/div[@class="total"]/text()' % (left, first + 2))
    if down_totals:
        down_total = down_totals[0]
        down_route = tree.xpath('%s/div[%d]/ol/li/a/text()' % (left, first + 3))
    else:
        down_total = ''
        down_route = ''

    # Store every bus information in the dictionary
    item = {
        'Line name': bus_number,
        'Running time': run_time,
        'Fare information': ticket_info,
        'Update time': laster_time,
        'Number of uplink terminals': up_total,
        'Name of all uplink stations': up_route,
        'Number of downlink terminals': down_total,
        'Name of all down line stations': down_route
    }
    items.append(item)


def parse_second(navi_list):
    """Crawl every navigation page and write the collected lines to disk.

    For each link the navigation page is fetched and passed to
    ``parse_second_route``. After each page the output file is rewritten
    from scratch with everything in ``items``, so a crash part-way through
    still leaves the results gathered up to the previous page on disk.

    Args:
        navi_list: iterable of navigation hrefs; each is appended to the
            site root to build the request URL.

    Raises:
        requests.exceptions.RequestException: if a navigation page cannot
            be fetched within the timeout.
    """
    # Traverse the list, send the requests in turn and crawl every bus route of each page
    for first_url in navi_list:
        first_url = 'https://foshan.8684.cn' + first_url
        print('Start crawling%s All public transport information' % first_url)
        # No default timeout in requests; avoid hanging on a dead connection.
        r = requests.get(url=first_url, headers=headers, timeout=10)
        # Analyze the content and get the detailed url of each bus
        parse_second_route(r.text)
        # Save progress; ``with`` closes the file even if a write fails.
        with open('Foshan public transport.txt', 'w', encoding='utf8') as fp:
            fp.writelines(str(item) + '\n' for item in items)


def main():
    """Run the whole crawl: collect the navigation links, then crawl them."""
    # The first-level page yields the navigation links; every one of them is
    # then crawled for its bus routes.
    parse_second(parse_navigation())


# Start the crawl only when the file is executed as a script.
if __name__ == "__main__":
    main()