Daily Star Language:
"Do you know what is the biggest problem of poverty when you are young?"
"What"
"It's a poor mistake to think that everything you can't get is poor. Eyes are fixed on distant places, but can't see the beauty of a flower nearby. "
September 7, 2018 20:30:30 Friday
Today I ran into a problem while defining a crawler's parsing rules for web pages. More than a dozen sites are built from one unified site layout, so in principle each one only needs its `web_site` variable changed. But there are two considerations. First, there are so many sites that copying the code for each one is tedious. Second, and most important, when the structure of the website changes, modifying and maintaining every copy is redundant and complex. So the template inheritance method is used instead!
The code of circHeNan.py is as follows
# encoding: utf-8
from datetime import datetime

from .base_stock import BaseStock


class CircHeNan(BaseStock):
    """Page rules for the "Henan Committee" site (stand-alone version).

    Every field is extracted inside this class and the site name is
    hard-coded in ``parse_detail_url``.
    """

    def is_detail_url(self, dom):
        """Return the ``#ess_mailrightpane`` selection from *dom*.

        NOTE(review): presumably the caller treats a non-empty selection as
        "this page is a detail page" -- confirm against the caller.
        """
        return dom("#ess_mailrightpane")

    def parse_detail_url(self, dom, params):
        """Build the record to store for one detail page.

        Args:
            dom: Callable selector object for the parsed page; it is called
                with CSS selectors (looks like a PyQuery document -- confirm).
            params: Mapping that must contain ``"info:url"``.

        Returns:
            A dict of ``"info:*"`` string fields.

        Raises:
            KeyError: If ``params`` has no ``"info:url"`` entry.
        """
        web_site = "Henan Committee"
        store_json = {
            # The selector used to be "#tab_content)": the stray ")" made it
            # an invalid CSS selector.
            "info:title": dom("#tab_content").text(),
            "info:publish_time": dom("#tab_content > tbody:nth-child(1)").text(),
            "info:source": web_site,
            "info:author": "",
            "info:content": dom("#tab_content > tbody").outerHtml(),
            # A missing <meta> tag yields a falsy value; store "" instead.
            "info:tag": dom("head meta[name=\"keywords\"]").attr("content") or "",
            "info:dese": dom("head meta[name=\"description\"]").attr("content") or "",
            "info:crawl_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "info:url": params["info:url"],
            "info:channel": "",
            "info:laiyuan": web_site,
            # Stored as the strings "True"/"False", not as booleans.
            "info:contain_image": "True" if dom("#tab_content > tbody")("img") else "False",
        }
        return store_json
This class inherits from BaseStock, which is defined as follows:
class BaseStock(object):
    """Common base class for the per-site page-rule classes."""

    def __init__(self):
        """Create the HTML cleaner and start with empty page/site values."""
        cleaner = html_clean.HTMLClean()
        self.html_clean = cleaner
        # Both start out empty; web_site is the site name that subclasses
        # may overwrite.
        self.html = self.web_site = ""
Considering that BaseStock is the parent class: when a subclass overrides `__init__`, the parent's `__init__` is no longer run automatically, so the subclass has to re-establish everything the parent's `__init__` sets up (either by calling it through `super()` or by rewriting it in full).
Define circ.py as a template
# encoding: utf-8
import re
from datetime import datetime

from .base_stock import BaseStock
from components import html_clean


class Circ(BaseStock):
    """Template page rules shared by the sites built on the unified layout.

    A concrete site subclasses this template and only sets ``self.web_site``;
    the selectors and the shape of the stored record live here, so a layout
    change is fixed in one place.
    """

    def __init__(self):
        """Initialise the shared state by delegating to ``BaseStock``."""
        # BaseStock.__init__ sets html_clean, html and web_site ("").
        # Delegating keeps that setup in one place instead of copying it.
        super(Circ, self).__init__()

    def is_detail_url(self, dom):
        """Return the ``.content`` selection from *dom*.

        NOTE(review): presumably the caller treats a non-empty selection as
        "this page is a detail page" -- confirm against the caller.
        """
        return dom(".content")

    def parse_detail_url(self, dom, params):
        """Build the record to store for one detail page.

        Args:
            dom: Callable selector object for the parsed page; it is called
                with CSS selectors (looks like a PyQuery document -- confirm).
            params: Mapping that must contain ``"info:url"``.

        Returns:
            A dict of ``"info:*"`` string fields; the source fields are taken
            from ``self.web_site``.

        Raises:
            KeyError: If ``params`` has no ``"info:url"`` entry.
        """
        store_json = {
            # The selector used to be "#tab_content)": the stray ")" made it
            # an invalid CSS selector.
            "info:title": dom("#tab_content").text(),
            "info:publish_time": dom("#tab_content > tbody:nth-child(1)").text(),
            "info:source": self.web_site,
            "info:author": "",
            "info:content": dom("#tab_content > tbody").outerHtml(),
            # A missing <meta> tag yields a falsy value; store "" instead.
            "info:tag": dom("head meta[name=\"keywords\"]").attr("content") or "",
            "info:dese": dom("head meta[name=\"description\"]").attr("content") or "",
            "info:crawl_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "info:url": params["info:url"],
            "info:channel": "",
            "info:laiyuan": self.web_site,
            # Stored as the strings "True"/"False", not as booleans.
            "info:contain_image": "True" if dom("#tab_content > tbody")("img") else "False",
        }
        return store_json


# Backward-compatible alias: the template was previously named ``Csrc``,
# while the site modules import ``Circ`` from this file.
Csrc = Circ
Modify the contents of circHeNan.py file as follows:
from .circ import Circ
from components import html_clean


class CircHeNan(Circ):
    """Page rules for the "Henan Committee" site, built on the Circ template.

    Only the site name differs from the template; detail-page detection and
    field extraction are inherited unchanged.
    """

    def __init__(self):
        """Run the template's initialisation, then set this site's name."""
        # Delegate rather than repeat the parent's attribute setup, so that
        # anything the parent initialises is never missed here.
        super(CircHeNan, self).__init__()
        self.web_site = "Henan Committee"