Python learning: Scrapy + Selenium crawling Taobao product information

Posted by mwood_2k2 on Mon, 06 Jan 2020 18:19:27 +0100

Learning purpose: to use the Scrapy framework to obtain dynamic website information. Taking Taobao as an example, obtain the [description, price, store, picture link] of goods and save the obtained information to an Excel table, a JSON file, or a database.

Open the Taobao homepage, search for "OnePlus phone", and get the URL of the first page: https://s.taobao.com/search?q=%E4%B8%80%E5%8A%A0%E6%89%8B%E6%9C%BA&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_&ie=utf8

Click the next page to get the URL of the second page: https://s.taobao.com/search?q=%E4%B8%80%E5%8A%A0%E6%89%8B%E6%9C%BA&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_&ie=utf8&bcoffset=4&p4ppushleft=1%2C48&s=44&ntoffset=4

Through comparative analysis, the URL of the next page mainly changes through the s parameter, which increases by 44 each time, so the URL can be simplified as https://s.taobao.com/search?q=%E4%B8%80%E5%8A%A0%E6%89%8B%E6%9C%BA&s=0

Now I'm going to write the code

Create a new project file

scrapy startproject taobao

cd taobao 
scrapy genspider phone s.taobao.com

items.py

import scrapy

class TaobaoItem(scrapy.Item):
    """Container for one scraped Taobao product listing.

    Fields: info (product title), price, shop (store name),
    img_src (list of image URLs, list form for image pipelines).
    """

    info = scrapy.Field()
    price = scrapy.Field()
    shop = scrapy.Field()
    img_src = scrapy.Field()

spiders/phone.py

import scrapy
from selenium import webdriver
from ..items import TaobaoItem
class PhoneSpider(scrapy.Spider):
    """Crawl Taobao search-result pages for 'OnePlus phone' listings.

    Pages are JS-rendered; the actual page fetch is done by the headless
    Chrome driver created here and driven from SeleniumMiddleware. This
    spider only parses the rendered HTML.
    """

    name = 'phone'
    allowed_domains = ['s.taobao.com']

    # Each result page advances the `s` query parameter by 44 items.
    total_page = 20
    start_urls = [
        'https://s.taobao.com/search?q=%E4%B8%80%E5%8A%A0%E6%89%8B%E6%9C%BA&s={}'.format(i * 44)
        for i in range(total_page)
    ]

    def __init__(self):
        super().__init__()

        # Headless Chrome with images disabled (content setting 2 = block)
        # to speed up page loads.
        self.options = webdriver.ChromeOptions()
        self.prefs = {
            'profile.default_content_setting_values': {'images': 2}
        }
        self.options.add_argument('--headless')
        self.options.add_argument('--disable-gpu')
        self.options.add_experimental_option('prefs', self.prefs)

        # `chrome_options=` is deprecated in Selenium; pass `options=`.
        self.driver = webdriver.Chrome(options=self.options)

    def parse(self, response):
        """Yield one TaobaoItem per product on a rendered result page."""
        data_list = response.xpath('//div[@class="item J_MouserOnverReq  "]')

        for data in data_list:
            # Title text is split across nodes; join, strip, and drop '/'
            # so the title is safe to use as a filename downstream.
            info = ''.join(
                data.xpath('.//div[@class="row row-2 title"]/a/text()').extract()
            ).strip().replace('/', '')
            price = data.xpath('.//div[@class="price g_price g_price-highlight"]/strong/text()').extract_first()
            shop = data.xpath('.//a[@class="shopname J_MouseEneterLeave J_ShopInfo"]/span[2]/text()').extract_first()

            # extract_first() may return None for lazy-loaded images;
            # guard before concatenating, or "https:" + None raises.
            src = data.xpath('.//a[@class="pic-link J_ClickStat J_ItemPicA"]/img/@data-src').extract_first()

            item = TaobaoItem()
            item['info'] = info
            item['price'] = price
            item['shop'] = shop
            item['img_src'] = ["https:" + src] if src else []

            yield item

        # Get total pages
        # self.total_page = response.xpath('//div[@class="total"]/text()').re_first('\d+')

    @staticmethod
    def close(spider, reason):
        # quit() (not close()) ends the whole browser session and the
        # chromedriver process; close() only closes the current window
        # and would leak the driver process.
        spider.driver.quit()
        return

pipelines.py 

import os
import sqlite3
from urllib.request import urlretrieve

import scrapy
import xlwt
from scrapy.exporters import JsonItemExporter


class UrllibPipeline(object):
    """Download each item's first image into the local ``imgs`` directory."""

    def process_item(self, item, spider):
        # Items may have no image URL (lazy-loaded src not yet present).
        if item.get("img_src"):
            # urlretrieve does not create directories; make sure the
            # target directory exists before the first download.
            os.makedirs("imgs", exist_ok=True)
            urlretrieve(item["img_src"][0], "imgs/" + item["info"] + ".jpg")
        return item

class JsonFilePipeline(object):
    """Export every item to ``taobao.json`` as one JSON array."""

    def __init__(self):
        # Created lazily in open_spider so no file handle is leaked if
        # the pipeline is instantiated but the spider never starts.
        self.file = None
        self.exporter = None

    def open_spider(self, spider):
        self.file = open('taobao.json', 'wb')
        self.exporter = JsonItemExporter(self.file, ensure_ascii=False, encoding='utf-8')
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # Guard against close without a successful open.
        if self.exporter is not None:
            self.exporter.finish_exporting()
        if self.file is not None:
            self.file.close()


class TaobaoPipeline(object):
    """Collect items into an Excel sheet, one row per item."""

    def __init__(self):
        self.workbook = xlwt.Workbook(encoding='utf-8')
        self.sheet = self.workbook.add_sheet('One plus mobile phone')
        self.info_list = ['info', 'price', 'shop', 'img_src']
        self.row = 1  # row 0 holds the header

    def open_spider(self, spider):
        # Write the header row once, before any items arrive.
        for index, info in enumerate(self.info_list):
            self.sheet.write(0, index, info)

    def close_spider(self, spider):
        # xlwt produces the legacy .xls format; saving it as ".xlsx"
        # yields a mislabeled file that Excel may refuse to open.
        self.workbook.save("Taobao.xls")

    def process_item(self, item, spider):
        # img_src is a list; xlwt cells only accept scalar values,
        # so join the URLs into one string.
        data_list = [
            item["info"],
            item["price"],
            item["shop"],
            ",".join(item["img_src"]),
        ]
        for index, data in enumerate(data_list):
            self.sheet.write(self.row, index, data)
        self.row += 1
        return item


class SqlitePipeline(object):
    """Persist items into a local SQLite database file ``taobaoDB``."""

    def __init__(self):
        self.conn = sqlite3.connect('taobaoDB')
        self.cursor = self.conn.cursor()

    def open_spider(self, spider):
        self.cursor.execute(
            'create table if not exists phone (img text,info text,price text,shop text)'
        )
        self.conn.commit()

    def process_item(self, item, spider):
        # Parameterized query: interpolating scraped text into SQL with
        # an f-string breaks on embedded quotes and allows SQL injection.
        # str() keeps the original stored representation of the URL list.
        self.cursor.execute(
            'insert into phone VALUES (?, ?, ?, ?)',
            (str(item["img_src"]), item["info"], item["price"], item["shop"]),
        )
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.conn.close()

middlewares.py — this is the middleware file. Add a downloader middleware here:

from scrapy import signals
from scrapy.http.response.html import HtmlResponse

class SeleniumMiddleware(object):
    """Render s.taobao.com pages with the spider's Selenium driver.

    Returning an HtmlResponse from process_request short-circuits the
    normal downloader, so Scrapy parses the JS-rendered page source
    instead of the raw HTTP body.
    """

    def process_request(self, request, spider):
        if spider.name == "phone":
            # Local imports keep this module importable by spiders that
            # do not use Selenium.
            from selenium.common.exceptions import TimeoutException
            from selenium.webdriver.support.ui import WebDriverWait

            spider.driver.get(request.url)

            # implicitly_wait() only affects find_element calls, which
            # this middleware never makes — so the original code did not
            # actually wait for rendering. Wait explicitly until the
            # product items appear in the DOM (or 10 s elapse).
            try:
                WebDriverWait(spider.driver, 10).until(
                    lambda d: d.find_elements_by_class_name("item")
                )
            except TimeoutException:
                # Best effort: fall through with whatever has rendered.
                pass

            return HtmlResponse(url=spider.driver.current_url,
                                request=request,
                                body=spider.driver.page_source,
                                encoding='utf-8')

Modify settings.py

# Taobao's robots.txt disallows this crawl; disable robots.txt compliance
# so requests are not filtered out. Use responsibly.
ROBOTSTXT_OBEY = False

DOWNLOADER_MIDDLEWARES = {
   # 'taobao.middlewares.TaobaoDownloaderMiddleware': 543,
   # Very low order: runs before the built-in downloader middlewares, so
   # Selenium renders the page instead of the default HTTP downloader.
   'taobao.middlewares.SeleniumMiddleware': 2,
}

# Lower numbers run first; every pipeline must return the item for the
# next one to receive it.
ITEM_PIPELINES = {
   'taobao.pipelines.TaobaoPipeline': 1,
   'taobao.pipelines.JsonFilePipeline': 2,
   'taobao.pipelines.SqlitePipeline': 4,
   # If you want to download a picture, just uncomment the following line
   # 'taobao.pipelines.UrllibPipeline': 56,
}

IMAGES_STORE = 'imgs'

Create a new main.py file in the spiders directory

from scrapy import cmdline

# Launch the spider exactly as if `scrapy crawl phone` were typed
# on the command line.
cmdline.execute(['scrapy', 'crawl', 'phone'])

Finally, run the main.py file directly.

Source code on GitHub -- > Click jump

 

Topics: encoding JSON Mobile IE