Python text parser

Posted by amycrystal123 on Tue, 20 Aug 2019 10:54:59 +0200

I. Project Introduction

This tutorial describes a small program that uses Python to parse plain text and generate an HTML page.

II. Relevant Technology

Python: An object-oriented, interpreted programming language that can be used for web development, graphics processing, text processing, and mathematical computation.

HTML: Hypertext Markup Language, which is mainly used to build web pages.

III. Project Screenshots

Plain text files:

Welcome to hello world

 

The HTML page generated after parsing is shown below.

 


image.png

IV. Project Explanation

1. Text Block Generator

First, we need a text block generator that splits the plain text into blocks, one at a time, so that each block can be parsed separately. The util.py code is as follows:

#!/usr/bin/python
# encoding: utf-8

def lines(file):
    """
    Yield every line of ``file`` unchanged, then one extra newline.

    The extra newline is a sentinel blank line: it guarantees that the
    input always ends with an empty line, whatever the real last line was.
    """
    for text_line in file:
        yield text_line
    # Sentinel blank line appended after the real content.
    yield '\n'

def blocks(file):
    """
    Yield the text blocks of ``file``, one string per block.

    A block is a run of consecutive non-blank lines. The lines are
    concatenated and the result is stripped of surrounding whitespace.
    Blank lines only separate blocks and never produce output themselves.
    """
    pending = []
    for line in lines(file):
        if not line.strip():
            # Blank line: flush the block collected so far, if any.
            if pending:
                yield ''.join(pending).strip()
                pending = []
            continue
        pending.append(line)

 

2. Processing procedures

The text block generator gives us one text block after another. Next, we need handlers that wrap the different kinds of text blocks in HTML tags. The handlers.py code is as follows:

#!/usr/bin/python
# encoding: utf-8

class Handler:
    """
    Base class that dispatches events to methods looked up by name.

    Subclasses provide methods named ``start_<name>``, ``end_<name>`` and
    ``sub_<name>``; events without a matching method are silently ignored.
    """
    def callback(self, prefix, name, *args):
        """
        Call the method named ``prefix + name`` with ``args``.

        Returns that method's result, or None when no callable attribute
        with that name exists on this object.
        """
        target = getattr(self, prefix + name, None)
        if not callable(target):
            return None
        return target(*args)

    def start(self, name):
        """Fire the ``start_<name>`` event; the result is discarded."""
        self.callback('start_', name)

    def end(self, name):
        """Fire the ``end_<name>`` event; the result is discarded."""
        self.callback('end_', name)

    def sub(self, name):
        """
        Build a replacement function for a regular-expression substitution.

        The returned function passes the match object to ``sub_<name>``.
        When that method is missing or returns None, the matched text is
        returned unchanged.
        """
        def substitution(match):
            replacement = self.callback('sub_', name, match)
            return match.group(0) if replacement is None else replacement
        return substitution

class HTMLRenderer(Handler):
    """
    Handler that renders parser events as HTML on standard output.

    Each ``start_*`` / ``end_*`` method prints the opening / closing tag of
    one element, and each ``sub_*`` method returns the inline markup that
    replaces a regular-expression match.

    Output uses ``print(...)`` with exactly one argument. That form is
    valid on Python 3 and prints the same text on Python 2, where the
    parentheses are just a parenthesised expression.
    """
    def start_document(self):
        print('<html><head><title>ShiYanLou</title></head><body>')

    def end_document(self):
        print('</body></html>')

    def start_paragraph(self):
        print('<p style="color: #444;">')

    def end_paragraph(self):
        print('</p>')

    def start_heading(self):
        print('<h2 style="color: #68BE5D;">')

    def end_heading(self):
        print('</h2>')

    def start_list(self):
        print('<ul style="color: #363736;">')

    def end_list(self):
        print('</ul>')

    def start_listitem(self):
        print('<li>')

    def end_listitem(self):
        print('</li>')

    def start_title(self):
        print('<h1 style="color: #1ABC9C;">')

    def end_title(self):
        print('</h1>')

    def sub_emphasis(self, match):
        """Return the first captured group wrapped in an ``<em>`` element."""
        return '<em>%s</em>' % match.group(1)

    def sub_url(self, match):
        """Return a link whose target and text are both the captured URL."""
        url = match.group(1)
        return '<a target="_blank" style="text-decoration: none;color: #BC1A4B;" href="%s">%s</a>' % (url, url)

    def sub_mail(self, match):
        """Return a ``mailto:`` link for the captured e-mail address."""
        address = match.group(1)
        return '<a style="text-decoration: none;color: #BC1A4B;" href="mailto:%s">%s</a>' % (address, address)

    def feed(self, data):
        """Print ``data`` verbatim; no HTML escaping is applied here."""
        print(data)

 

3. Rules

With the handlers and the text block generator in place, we need rules that decide which tags the handler should add to each text block. The rules.py code is as follows:

#!/usr/bin/python
# encoding: utf-8

class Rule:
    """
    Base class for all rules.

    Subclasses are expected to set a ``type`` attribute, which ``action``
    passes to the handler as the event name.
    """
    def action(self, block, handler):
        """
        Emit ``block`` between the start and end events for ``self.type``.

        Always returns True.
        """
        kind = self.type
        handler.start(kind)
        handler.feed(block)
        handler.end(kind)
        return True

class HeadingRule(Rule):
    """
    Rule for headings: a single line of at most 70 characters that does
    not end with a colon.
    """
    type = 'heading'
    def condition(self, block):
        """
        Return True when ``block`` looks like a heading.

        An empty block is never a heading. The original indexed
        ``block[-1]``, which raised IndexError for an empty string.
        """
        if not block:
            return False
        return '\n' not in block and len(block) <= 70 and not block.endswith(':')

class TitleRule(HeadingRule):
    """
    Rule for the document title: only the first block offered to this
    rule can be a title, and only if it also satisfies the heading test.
    """
    type = 'title'
    first = True

    def condition(self, block):
        """
        Apply the heading test to the first block seen; reject all others.

        ``first`` is cleared on the first call whether or not that block
        qualifies, so later blocks can never become the title.
        """
        if self.first:
            self.first = False
            return HeadingRule.condition(self, block)
        return False

class ListItemRule(Rule):
    """
    Rule for list items: blocks that begin with a hyphen.
    """
    type = 'listitem'
    def condition(self, block):
        """
        Return True when ``block`` starts with '-'.

        ``startswith`` is used instead of ``block[0]`` so that an empty
        block yields False rather than raising IndexError.
        """
        return block.startswith('-')

    def action(self, block, handler):
        """
        Emit the block as a list item, without the leading hyphen and
        without surrounding whitespace. Always returns True.
        """
        handler.start(self.type)
        handler.feed(block[1:].strip())
        handler.end(self.type)
        return True

class ListRule(ListItemRule):
    """
    Rule that opens and closes the list surrounding consecutive list items.

    It accepts every block and only tracks transitions: the list is
    opened at the first list item that follows a non-item block, and
    closed at the first non-item block that follows a list item.
    """
    type = 'list'
    inside = False

    def condition(self, block):
        """Accept every block so that ``action`` sees each transition."""
        return True

    def action(self, block, handler):
        """
        Emit the list start or end event when the block crosses a list
        boundary.

        Always returns False. The block itself is never fed to the
        handler by this rule.

        NOTE(review): the end event is only emitted when a later non-item
        block arrives; this rule emits nothing if no such block follows.
        """
        is_item = ListItemRule.condition(self, block)
        if is_item and not self.inside:
            handler.start(self.type)
            self.inside = True
        elif self.inside and not is_item:
            handler.end(self.type)
            self.inside = False
        return False

class ParagraphRule(Rule):
    """
    Catch-all rule: every block is accepted as a paragraph.
    """
    type = 'paragraph'

    def condition(self, block):
        """Return True for any block."""
        return True

 

4. Analysis

Finally, we can parse it. The markup.py code is as follows:

#!/usr/bin/python
# encoding: utf-8

import sys, re
from handlers import *
from util import *
from rules import *

class Parser:
    """
    Base parser: reads text blocks, applies the registered filters to
    each one, and hands it to the registered rules in order.
    """
    def __init__(self, handler):
        """Store the handler and start with no rules and no filters."""
        self.handler = handler
        self.rules = []
        self.filters = []

    def addRule(self, rule):
        """Append ``rule`` to the list of rules; rules are tried in order."""
        self.rules.append(rule)

    def addFilter(self, pattern, name):
        """
        Register an inline filter.

        The filter replaces every match of the regular expression
        ``pattern`` in a block with whatever ``handler.sub(name)``
        returns for that match.
        """
        def apply_filter(block, handler):
            return re.sub(pattern, handler.sub(name), block)
        self.filters.append(apply_filter)

    def parse(self, file):
        """
        Parse ``file`` block by block between the 'document' start and
        end events.

        Each block passes through all filters first. Then every rule
        whose condition accepts the block runs its action, stopping at
        the first action that returns a true value.
        """
        self.handler.start('document')
        for block in blocks(file):
            for apply_filter in self.filters:
                block = apply_filter(block, self.handler)
            for rule in self.rules:
                if not rule.condition(block):
                    continue
                if rule.action(block, self.handler):
                    break
        self.handler.end('document')

class BasicTextParser(Parser):
    """
    Plain-text parser preloaded with the standard rules and inline filters.
    """
    def __init__(self, handler):
        """
        Register the rules and filters for plain text.

        Rule order matters: ListRule goes first so it can open or close a
        list without consuming the block, and ParagraphRule goes last as
        the catch-all.
        """
        Parser.__init__(self, handler)
        self.addRule(ListRule())
        self.addRule(ListItemRule())
        self.addRule(TitleRule())
        self.addRule(HeadingRule())
        self.addRule(ParagraphRule())

        self.addFilter(r'\*(.+?)\*', 'emphasis')
        # The URL pattern accepts https as well as http, and allows digits
        # and hyphens, so links such as https://example.com/page-2 are
        # matched whole. Query-string characters are left out on purpose:
        # the matched text is placed into an href attribute unescaped.
        self.addFilter(r'(https?://[\.a-zA-Z0-9/\-]+)', 'url')
        # The mail pattern allows digits, '_' and '-' in the local part,
        # and digits and '-' in the domain.
        self.addFilter(r'([\.a-zA-Z0-9_\-]+@[\.a-zA-Z0-9\-]+[a-zA-Z]+)', 'mail')

"""
//Running program
"""
# Script entry point: build an HTML renderer, attach it to the plain-text
# parser, and parse standard input. The renderer prints the HTML, so the
# result appears on standard output.
handler = HTMLRenderer()
parser = BasicTextParser(handler)
parser.parse(sys.stdin)

 

Run the program (test.txt is the plain-text input file and test.html is the generated HTML file):

python markup.py < test.txt > test.html

 

V. Summary

In this small program, we use Python to parse a plain text file and generate an HTML file. This is only a simple implementation; building on this example, you can try to parse Markdown files.

Topics: Python encoding Programming Web Development