Learning purpose: use the Scrapy framework to scrape information from a dynamic website. Taking Taobao as an example, fetch the [description, price, shop, image link] of each product and save the results to an Excel table, a JSON file, and a database.
Open the Taobao homepage and search for "OnePlus phone" (一加手机); the URL of the first page is https://s.taobao.com/search?q=%E4%B8%80%E5%8A%A0%E6%89%8B%E6%9C%BA&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_&ie=utf8
Click through to the next page; the URL of the second page is https://s.taobao.com/search?q=%E4%B8%80%E5%8A%A0%E6%89%8B%E6%9C%BA&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_&ie=utf8&bcoffset=4&p4ppushleft=1%2c48&s=44&ntoffset=4
Comparing the two, only the s parameter changes from page to page, increasing by 44 each time (44 items per page), so the URL can be simplified to https://s.taobao.com/search?q=%E4%B8%80%E5%8A%A0%E6%89%8B%E6%9C%BA&s=0
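To make the offset pattern concrete, here is a minimal sketch that builds the URL for the first few pages (the spider below uses the same formula for its start_urls):

# Each result page shows 44 items, so page i starts at offset i * 44
base_url = 'https://s.taobao.com/search?q=%E4%B8%80%E5%8A%A0%E6%89%8B%E6%9C%BA&s={}'
for page in range(3):
    print(base_url.format(page * 44))
# prints ...&s=0, ...&s=44, ...&s=88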
Now I'm going to write the code
Create a new project and generate the spider:
scrapy startproject taobao
cd taobao
scrapy genspider phone s.taobao.com
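These commands generate the standard Scrapy project layout (shown here for orientation; main.py is added by hand in a later step):

taobao/
├── scrapy.cfg
└── taobao/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── phone.py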
items.py
import scrapy


class TaobaoItem(scrapy.Item):
    # One field per piece of product information we want to collect
    img_src = scrapy.Field()
    info = scrapy.Field()
    price = scrapy.Field()
    shop = scrapy.Field()
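For context, a scrapy.Item behaves like a dict, which is how the spider fills it in below (a quick illustration with made-up sample values, not part of the project files):

from taobao.items import TaobaoItem

item = TaobaoItem()
item['info'] = 'OnePlus 8 Pro'  # field values are set by key, like a dict
item['price'] = '3999.00'
print(dict(item))  # {'info': 'OnePlus 8 Pro', 'price': '3999.00'}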
spiders/phone.py
import scrapy
from selenium import webdriver

from ..items import TaobaoItem


class PhoneSpider(scrapy.Spider):
    name = 'phone'
    allowed_domains = ['s.taobao.com']
    total_page = 20
    # One URL per result page; the s parameter advances by 44 items per page
    start_urls = ['https://s.taobao.com/search?q=%E4%B8%80%E5%8A%A0%E6%89%8B%E6%9C%BA&s={}'.format(i * 44)
                  for i in range(total_page)]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Configure Chrome to run headless (no window, no GPU) and skip image loading
        self.options = webdriver.ChromeOptions()
        self.prefs = {
            'profile.default_content_setting_values': {'images': 2}
        }
        self.options.add_argument('--headless')
        self.options.add_argument('--disable-gpu')
        self.options.add_experimental_option('prefs', self.prefs)
        self.driver = webdriver.Chrome(options=self.options)  # chrome_options= is deprecated

    def parse(self, response):
        data_list = response.xpath('//div[@class="item J_MouserOnverReq "]')
        for data in data_list:
            # The title text is split across several nodes; join them, then strip
            # slashes so the title can safely be used as a file name later
            info = ''.join(data.xpath('.//div[@class="row row-2 title"]/a/text()').extract()).strip().replace('/', '')
            price = data.xpath('.//div[@class="price g_price g_price-highlight"]/strong/text()').extract_first()
            shop = data.xpath('.//a[@class="shopname J_MouseEneterLeave J_ShopInfo"]/span[2]/text()').extract_first()
            img_src = "https:" + data.xpath('.//a[@class="pic-link J_ClickStat J_ItemPicA"]/img/@data-src').extract_first()
            item = TaobaoItem()
            item['info'] = info
            item['price'] = price
            item['shop'] = shop
            item['img_src'] = [img_src]
            yield item
        # Get the total number of pages
        # self.total_page = response.xpath('//div[@class="total"]/text()').re_first(r'\d+')

    @staticmethod
    def close(spider, reason):
        # quit() ends the whole browser session; close() would only close the window
        spider.driver.quit()
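If the spider yields nothing, it helps to test the headless Chrome setup on its own before debugging Scrapy. A minimal standalone check, assuming chromedriver is on your PATH (the URL and XPath are the same ones the spider uses):

from selenium import webdriver
from selenium.webdriver.common.by import By

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
driver = webdriver.Chrome(options=options)
driver.get('https://s.taobao.com/search?q=%E4%B8%80%E5%8A%A0%E6%89%8B%E6%9C%BA&s=0')
driver.implicitly_wait(10)
items = driver.find_elements(By.XPATH, '//div[@class="item J_MouserOnverReq "]')
# 0 usually means the page structure changed or Taobao served a login page instead
print(len(items), 'items found')
driver.quit()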
pipelines.py
import os
import sqlite3
from urllib.request import urlretrieve

import xlwt
from scrapy.exporters import JsonItemExporter


class UrllibPipeline(object):
    """Download each product image into the imgs/ directory."""

    def open_spider(self, spider):
        os.makedirs('imgs', exist_ok=True)  # urlretrieve fails if the directory is missing

    def process_item(self, item, spider):
        urlretrieve(item["img_src"][0], "imgs/" + item["info"] + ".jpg")
        return item


class JsonFilePipeline(object):
    """Export all items to taobao.json."""

    def __init__(self):
        self.file = open('taobao.json', 'wb')
        self.exporter = JsonItemExporter(self.file, ensure_ascii=False, encoding='utf-8')

    def open_spider(self, spider):
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()


class TaobaoPipeline(object):
    """Write items into an Excel sheet, one row per item."""

    def __init__(self):
        self.workbook = xlwt.Workbook(encoding='utf-8')
        self.sheet = self.workbook.add_sheet('OnePlus phone')
        self.info_list = ['info', 'price', 'shop', 'img_src']
        self.row = 1

    def open_spider(self, spider):
        for index, info in enumerate(self.info_list):
            self.sheet.write(0, index, info)  # header row

    def close_spider(self, spider):
        # xlwt writes the legacy .xls format, so use a matching extension
        self.workbook.save("Taobao.xls")

    def process_item(self, item, spider):
        # img_src is a one-element list; unwrap it so xlwt can write the cell
        data_list = [item["info"], item["price"], item["shop"], item["img_src"][0]]
        for index, data in enumerate(data_list):
            self.sheet.write(self.row, index, data)
        self.row += 1
        return item


class SqlitePipeline(object):
    """Store items in a local SQLite database."""

    def __init__(self):
        self.conn = sqlite3.connect('taobaoDB')
        self.cursor = self.conn.cursor()

    def open_spider(self, spider):
        self.cursor.execute('create table if not exists phone (img text, info text, price text, shop text)')
        self.conn.commit()

    def process_item(self, item, spider):
        # Parameterized query: quotes in the scraped data no longer break the statement
        self.cursor.execute('insert into phone values (?, ?, ?, ?)',
                            (item["img_src"][0], item["info"], item["price"], item["shop"]))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.conn.close()
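After a crawl, a few standalone lines confirm that the SQLite pipeline actually stored the rows (a quick check, not part of the project files):

import sqlite3

conn = sqlite3.connect('taobaoDB')
cursor = conn.cursor()
cursor.execute('select count(*) from phone')
print(cursor.fetchone()[0], 'rows stored')
for row in cursor.execute('select info, price, shop from phone limit 3'):
    print(row)  # peek at a few rows
conn.close()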
middlewares.py is the middleware file. Add a downloader middleware here that fetches each request with Selenium, so the spider sees the JavaScript-rendered page.
from scrapy.http.response.html import HtmlResponse


class SeleniumMiddleware(object):
    def process_request(self, request, spider):
        if spider.name == "phone":
            spider.driver.get(request.url)
            spider.driver.implicitly_wait(10)
            # Returning an HtmlResponse short-circuits Scrapy's own downloader:
            # the spider receives the Selenium-rendered page source instead
            response = HtmlResponse(url=spider.driver.current_url,
                                    request=request,
                                    body=spider.driver.page_source,
                                    encoding='utf-8')
            return response
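One caveat: implicitly_wait only delays find_element lookups, not page_source, so this middleware can capture the page before the AJAX-loaded items have rendered. A sketch of a variant using an explicit wait (SeleniumWaitMiddleware is a hypothetical name; the XPath is the same one parse relies on):

from scrapy.http.response.html import HtmlResponse
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class SeleniumWaitMiddleware(object):
    def process_request(self, request, spider):
        if spider.name == "phone":
            spider.driver.get(request.url)
            # Block until at least one result item is rendered, or fail after 10 s
            WebDriverWait(spider.driver, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, '//div[@class="item J_MouserOnverReq "]')))
            return HtmlResponse(url=spider.driver.current_url, request=request,
                                body=spider.driver.page_source, encoding='utf-8')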
Modify settings.py. Note that items flow through the ITEM_PIPELINES entries in ascending order of their numbers.
ROBOTSTXT_OBEY = False

DOWNLOADER_MIDDLEWARES = {
    # 'taobao.middlewares.TaobaoDownloaderMiddleware': 543,
    'taobao.middlewares.SeleniumMiddleware': 2,
}

ITEM_PIPELINES = {
    'taobao.pipelines.TaobaoPipeline': 1,
    'taobao.pipelines.JsonFilePipeline': 2,
    'taobao.pipelines.SqlitePipeline': 4,
    # To download the product images as well, uncomment the following line
    # 'taobao.pipelines.UrllibPipeline': 56,
}

IMAGES_STORE = 'imgs'
Create a new main.py file in the spiders directory so the spider can be launched directly from an IDE:
from scrapy import cmdline

# Equivalent to running "scrapy crawl phone" from the command line
cmdline.execute('scrapy crawl phone'.split())
Finally, run the main.py file directly.