Code extraction function optimization

Posted by roy on Tue, 01 Feb 2022 11:09:49 +0100

preface

This article builds on the previous post, "Code extraction function optimization (I)". Several optimizations have been made: handling of Python multi-line comments, C++ multi-line comments and Markdown code fences has been added, some small bugs have been fixed, and the extraction accuracy has improved to 92%.

1, Code extraction and construction ideas

2, Introduction to each module

1. Extract the code in markdown

Idea:
1. Check whether ``` appears in the line; if it does, increment count and record the index of that line.
2. Once the line indices are collected, slice the contents between each pair of indices out of the original list.
The idea is easiest to understand by reading the code:

def extract_markdown(data_list):
    """Split a list of lines into fenced markdown code blocks and plain text.

    A line counts as a fence when it contains three consecutive backticks.
    Fences are paired in order of appearance (1st with 2nd, 3rd with 4th, ...).

    Args:
        data_list: Text list (one string per line).
    Returns:
        codes: Extracted code snippets (list of strings, one per fenced
            block, lines joined with a newline).
        texts: The lines outside the fenced blocks joined with a newline
            (string type). When there are fewer than two fences, or the
            fences are unbalanced, ``([], data_list)`` is returned instead,
            i.e. the input list itself.
    """
    fence_idxs = [idx for idx, line in enumerate(data_list) if "```" in line]
    # Nothing to extract without at least one complete, balanced pair.
    if len(fence_idxs) < 2 or len(fence_idxs) % 2 == 1:
        return [], data_list

    codes = []
    text_lines = []
    cursor = 0  # index of the first line not yet consumed
    try:
        for opening, closing in zip(fence_idxs[::2], fence_idxs[1::2]):
            text_lines.extend(data_list[cursor:opening])
            codes.append('\n'.join(data_list[opening + 1:closing]))
            cursor = closing + 1
        text_lines.extend(data_list[cursor:])
        # Join with a newline so that the last line before a fenced block is
        # not fused with the first line after it.
        texts = '\n'.join(text_lines)
    except TypeError:
        # Non-string items cannot be joined; fall back to "no code found".
        return [], data_list
    return codes, texts

2. Extract the line indices of Python multi-line comments

The overall idea is similar to extracting the code in markdown. I won't repeat it here. Just look at the code:

def extract_py_annotation_idx(data_list, annotation="'''"):
    """Find the lines that open or close a Python multi-line string/comment.

    A line is recorded when it contains the delimiter an odd number of times,
    i.e. when it changes the "inside a multi-line block" state. Lines that
    both open and close the delimiter (for example a one-line docstring) are
    skipped, in the same way extract_cpp_annotation_idx skips a line holding
    both ``/*`` and ``*/``; counting such a line once would shift every
    following open/close pair.

    Args:
        data_list: Text list (one string per line).
        annotation: Delimiter to look for, ``'''`` by default.
    Returns:
        idx: List of line indices, to be read as consecutive
            (opening, closing) pairs. Empty when the delimiters do not pair
            up (odd number of recorded lines).
    """
    idxs = [
        idx for idx, line in enumerate(data_list)
        if line.count(annotation) % 2 == 1
    ]
    # An odd number of boundaries means one block is never closed.
    if len(idxs) % 2 == 1:
        return []
    return idxs

3. Extract C++ multi-line comments

This is slightly different from the previous two, but the overall idea is the same

def extract_cpp_annotation_idx(data_list):
    """Collect the line indices that open or close a /* */ block comment.

    Args:
        data_list: Text list (one string per line).
    Returns:
        idx: Indices of the lines containing exactly one of ``/*`` and
            ``*/``, in order of appearance. Lines containing both markers
            are ignored. An empty list is returned when the number of
            collected lines is odd.
    """
    boundary_lines = []
    for line_no, content in enumerate(data_list):
        has_open = '/*' in content
        has_close = '*/' in content
        # Keep the line only when exactly one of the two markers is present.
        if has_open != has_close:
            boundary_lines.append(line_no)
    if len(boundary_lines) % 2 == 1:
        return []
    return boundary_lines

Whatever the language, one thing holds for every kind of multi-line comment: the comment delimiters must come in pairs!

4. Code judgment logic

1. Each line is passed in and judged on its own: the function returns True if the line is code and False otherwise.
2. The flag for an incoming line starts as False and is set to True only after certain conditions are met.

# Punctuation and operator characters whose presence hints at source code.
code_symbol_list = ['~', '#', '$', '%', '^', '&', '*', '(', ')', '-', '+',',',':',
                    '=', '/', '|', ';', '{', '}', '[', ']', '<', '>', '~','?']

# Output/print calls from several languages.
code_print_list = ['print_r', 'var_dump', 'NSLog', 'println', 'console.log', 'print', 'printf',
                    'cout', 'Console.WriteLine', 'Println', 'Response.Write', 'alert', 'echo']

# Comment delimiters; passed to is_code_new as annotation_list.
code_annotation_list = [r'#', r'//', r'<!--', r'-->', r'/*', r'*/', r"'''", r'--', 
                         r'%']

# Keywords that commonly start or appear in code statements.
common_lib_list = ['import', 'from', 'yield', 'public', 'class', 'if', 'while', 'var', 'typedef', 'struct', 'package', 'Dim', 'input']

# String/definition related identifiers; merged into code_print_list below.
def_str_list = ['char', '#define', 'string', 'String', 'concat', 'findall']


# Substrings that mark a line as prose. The second entry is written with
# double quotes because it contains an apostrophe; inside single quotes it
# terminated the string early and made the module a syntax error.
text_keywords_list = ['Reprinted in', "What's the reason?", 'error', 'ERROR', 'Error', ':\\', 'report errors',
                      '？', '!', 'Why?', '[img]', 'caption']


# code_print_list is extended in place, so it also carries def_str_list from
# here on (including where it is appended to code_features).
code_print_list.extend(def_str_list)
code_features = []
# code_features.extend(pl_keywords_no_intersect_list)
code_features.extend(code_symbol_list)
code_features.extend(common_lib_list)
code_features.extend(code_print_list)
code_features.extend(code_annotation_list)


def is_code_new(self, line, feature_list, annotation_list, text_keywords_list, code_print_list):
    """Heuristically decide whether one line of text looks like source code.

    Args:
        self: Not used in the body; the same function appears as a method of
            SimpleCodeExtract in the full listing.
        line: The line to classify (string).
        feature_list: Substrings (symbols, keywords, ...) that suggest code.
        annotation_list: Comment delimiters; a line containing one of them is
            treated as code.
        text_keywords_list: Substrings that suggest prose.
        code_print_list: Print/definition keywords used to rescue lines that
            contain Chinese text.
    Returns:
        True when the line is judged to be code, False otherwise.
    """
    code_flag = False  # Default verdict; only becomes True when a feature matches
    # count = 0
    # cn_num = getnum_of_cn(line)
    # en_num = getnum_of_en(line)
    cn_and_symbol_num = getnum_of_cn_and_symbol(line)  # Number of Chinese characters and counted punctuation marks
    for symbol in feature_list:  # Try every code feature in turn
        if is_sub_string(line, symbol):  # The line contains this feature
            code_flag = True  # Tentatively code; the checks below may revoke it
            for annotation in annotation_list:  # Walk through the comment delimiters
                if is_sub_string(line, annotation):  # A comment delimiter is present
                    code_flag = True  # Treat as code and stop scanning delimiters
                    break
                else:  # This delimiter is absent: run the prose-vs-code checks
                    for text_keyword in text_keywords_list:  # A prose keyword revokes the flag
                        if is_sub_string(line, text_keyword):
                            code_flag = False
                            break
                    if cn_and_symbol_num > 30:
                        # More than 30 Chinese characters/punctuation marks: treat as prose.
                        # A long print statement can still be code, so this check has to
                        # come before the print check below, which may set the flag again.
                        code_flag = False
                    # When the line contains any Chinese text, decide by whether it is
                    # a print statement or a quoted assignment.
                    if cn_and_symbol_num > 0:
                        for print_keyword in code_print_list:
                            if is_sub_string(line, print_keyword):
                                code_flag = True  # A print-like call: code
                                break
                            else:
                                # Not this print keyword; check whether the line looks like a
                                # quoted value following '=' or ':', as in a dictionary literal:
                                # dict = {
                                #     'name': 'Zhang San',
                                #     'gender': 'male'
                                # }
                                # NOTE(review): the regex does not depend on print_keyword, so it
                                # is re-evaluated on every non-matching iteration with the same result.
                                if re.search(r'[\=\:]\s*?[\'\"].*?[\'\"]', line):
                                    code_flag = True
                                else:
                                    code_flag = False

    # Error reports (e.g. traceback lines such as "... line 12 ...") can be
    # mistaken for code, so any line containing "line <number>" is rejected,
    # overriding everything decided above.
    if is_line_num(line):
        code_flag = False
    return code_flag

5. Subsequent optimization

1. Although the accuracy is now above 90%, there are only 400 test samples, so some cases may have been missed, such as SQL code, 51 assembly code and some other key fields. Future work will focus on these points, aiming to raise the accuracy above 95%.
2. We considered adding the reserved words of every language to the code feature list (code_features). Accuracy would improve, but execution efficiency would drop sharply. As a basic module, the code extraction service has to take both efficiency and accuracy into account, so a balance must be struck between the two.

3, All codes

import pandas as pd
import re
from html.parser import HTMLParser
from bs4 import BeautifulSoup

# Punctuation and operator characters whose presence hints at source code.
code_symbol_list = ['~', '#', '$', '%', '^', '&', '*', '(', ')', '-', '+',',',':',
                    '=', '/', '|', ';', '{', '}', '[', ']', '<', '>', '~','?']

# Output/print calls from several languages.
code_print_list = ['print_r', 'var_dump', 'NSLog', 'println', 'console.log', 'print', 'printf',
                    'cout', 'Console.WriteLine', 'Println', 'Response.Write', 'alert', 'echo']

# Comment delimiters; passed to is_code_new as annotation_list.
code_annotation_list = [r'#', r'//', r'<!--', r'-->', r'/*', r'*/', r"'''", r'--', 
                         r'%']

# Keywords that commonly start or appear in code statements.
code_keywords_list = ['import', 'from', 'yield', 'public', 'class', 'if', 'while', 'var', 'typedef', 'struct', 'package', 'Dim', 'input',
                      'return', 'else', 'auto']

# String/definition related identifiers; merged into code_print_list below.
def_str_list = ['char', '#define', 'string', 'String', 'concat', 'findall']


# Substrings that mark a line as prose. The second entry is written with
# double quotes because it contains an apostrophe; inside single quotes it
# terminated the string early and made the module a syntax error.
text_keywords_list = ['Reprinted in', "What's the reason?", 'error', 'ERROR', 'Error', ':\\', 'report errors',
                      '？', '!', 'Why?', '[img]', 'caption']


# code_print_list is extended in place, so it also carries def_str_list from
# here on (including where it is appended to code_features).
code_print_list.extend(def_str_list)
code_features = []
# code_features.extend(pl_keywords_no_intersect_list)
code_features.extend(code_symbol_list)
code_features.extend(code_keywords_list)
code_features.extend(code_print_list)
code_features.extend(code_annotation_list)


def is_line_num(inputdata):
    '''Report whether the string contains "line " followed by at least one digit.'''
    match = re.search(r'line [0-9]{1,}', inputdata)
    return match is not None

def tab_2_spaces(text):
    '''Replace every run of four spaces in text with one tab character.

    Note that, despite the name, the conversion goes from spaces to tabs.
    '''
    indent_unit = ' ' * 4
    return text.replace(indent_unit, '\t')

def pretty_lines(texts):
    '''Flatten text chunks into a list of non-blank lines.

    Each chunk is split on "\\r\\n" and then on "\\n"; lines that are empty
    or whitespace-only are dropped and the remaining ones are passed through
    tab_2_spaces. None entries in texts are skipped.
    '''
    cleaned = []
    for chunk in texts:
        if chunk is None:
            continue
        for crlf_part in chunk.split('\r\n'):
            for raw_line in crlf_part.split('\n'):
                if raw_line.strip() != '':
                    cleaned.append(tab_2_spaces(raw_line))
    return cleaned

def pretty_one_line(texts):
    '''Concatenate all non-blank lines found in texts into one string.

    Each item is split on "\\r\\n" and then on "\\n"; empty or
    whitespace-only pieces are dropped and the rest are joined without any
    separator. Because texts is simply iterated, passing a plain string
    processes it character by character, which removes all whitespace
    characters.
    '''
    pieces = []
    for chunk in texts:
        for crlf_part in chunk.split('\r\n'):
            pieces.extend(
                segment for segment in crlf_part.split('\n')
                if segment.strip() != ''
            )
    return ''.join(pieces)

class CodeHTMLParser(HTMLParser):
    '''Separate the contents of <code> elements from the remaining text.

    After feed(), codes holds one string per outermost <code> element and
    texts holds the data chunks seen outside of any <code> element.
    '''

    def __init__(self):
        super().__init__()
        # Nesting depth of currently open <code> tags.
        self.code_tag_count = 0
        # Data chunks of the <code> element being read.
        self.current_code = []
        # Finished code snippets, one per outermost <code> element.
        self.codes = []
        # Data chunks found outside <code> elements.
        self.texts = []
        # Most recent chunk appended to texts (None until the first one).
        self.last_text = None

    def handle_starttag(self, tag, attrs):
        '''Track <code> nesting; other tags outside code go to handle_text_tag.'''
        if tag == 'code':
            self.code_tag_count += 1

        if self.code_tag_count == 0:
            self.handle_text_tag(tag, attrs)

    def handle_text_tag(self, tag, attrs):
        '''Re-attach a tag name to the preceding text when that text ends with "#include".

        Presumably this restores C/C++ header names such as <stdio.h>, which
        the HTML parser reports as tags rather than text.
        '''
        if self.last_text is None:
            return

        index = self.last_text.find('#include')
        if index == -1:
            return

        # Only act when nothing but whitespace follows '#include'.
        if self.last_text[index:].strip() != '#include':
            return

        cpp_tag = ' <{}>'.format(tag)
        index = len(self.texts)-1
        if index >= 0:
            # NOTE(review): texts[index] is a string, so pretty_one_line
            # walks it character by character and removes all whitespace
            # from the chunk - confirm this is intended.
            pre = pretty_one_line(self.texts[index])
            self.texts[index] = pre + cpp_tag

    def handle_endtag(self, tag):
        '''Close a <code> element; the outermost close stores the snippet.'''
        if tag != 'code':
            return
        if self.code_tag_count == 0:
            # Stray </code> without a matching opening tag. Ignoring it keeps
            # the depth from going negative, which would make the next real
            # <code> element be treated as plain text.
            return
        if self.code_tag_count == 1:
            self.codes.append('\n'.join(self.current_code))
            self.current_code = []
        self.code_tag_count -= 1

    def handle_data(self, data):
        '''Route a data chunk to the current code snippet or to texts.'''
        if self.code_tag_count > 0:
            self.current_code.append(data)
        elif data is not None:
            self.last_text = data
            self.texts.append(data)
        else:
            self.last_text = ''


def filter_html_and_extract_code(text):
    '''Strip HTML tags from text and pull out the <code>...</code> snippets.

    Returns a tuple (code_lines, text_lines), both produced by pretty_lines.
    The code comes from the .string of every <code> element found by
    BeautifulSoup; the text comes from CodeHTMLParser run over the
    prettified document. If anything raises, a message is printed and the
    whole input is returned as text with no code.
    '''
    try:
        document = BeautifulSoup(text, 'html.parser')
        code_strings = [node.string for node in document.find_all('code')]

        html_parser = CodeHTMLParser()
        html_parser.feed(document.prettify())

        # pretty_lines also drops blank lines.
        plain_lines = pretty_lines(html_parser.texts)
        return (pretty_lines(code_strings), plain_lines)
    except Exception as err:
        print('parse text as html failed, text:', str(err))
        return ([], pretty_lines([text]))

def getnum_of_cn(inputdata):
    '''Count the characters of the string that fall in the U+4E00-U+9FFF range.'''
    cjk_pattern = r'[\u4E00-\u9FFF]'  # Unicode range used for Chinese characters
    return len(re.findall(cjk_pattern, inputdata))

def getnum_of_cn_and_symbol(inputdata):
    '''Count Chinese characters plus selected punctuation in the string.

    The result is the number of characters in U+4E00-U+9FFF plus the number
    of occurrences of '，', '. ' (a period followed by a space) and '；'.
    '''
    cjk_total = len(re.findall(r'[\u4E00-\u9FFF]', inputdata))
    punctuation_total = sum(
        inputdata.count(mark) for mark in ('，', '. ', '；')
    )
    return cjk_total + punctuation_total



def getnum_of_en_noblank(inputdata):
    '''Count the ASCII letters (a-z, A-Z) in the string; spaces are not counted.'''
    return sum(1 for _ in re.finditer(r'[a-zA-Z]', inputdata))


def getnum_of_en(inputdata):
    '''Return the number of ASCII letters plus the number of space characters.'''
    letter_total = len(re.findall(r'[a-zA-Z]', inputdata))
    space_total = inputdata.count(' ')
    return letter_total + space_total


def is_sub_string(ori_string, sub_string):
    """Tell whether ori_string contains sub_string.
    Args:
        ori_string: String to search in
        sub_string: String to search for
    Returns:
        True when sub_string occurs in ori_string, False otherwise
    """
    return sub_string in ori_string


def extract_py_annotation_idx(data_list, annotation="'''"):
    """Find the lines that open or close a Python multi-line string/comment.

    A line is recorded when it contains the delimiter an odd number of times,
    i.e. when it changes the "inside a multi-line block" state. Lines that
    both open and close the delimiter (for example a one-line docstring) are
    skipped, in the same way extract_cpp_annotation_idx skips a line holding
    both ``/*`` and ``*/``; counting such a line once would shift every
    following open/close pair.

    Args:
        data_list: Text list (one string per line).
        annotation: Delimiter to look for, ``'''`` by default.
    Returns:
        idx: List of line indices, to be read as consecutive
            (opening, closing) pairs. Empty when the delimiters do not pair
            up (odd number of recorded lines).
    """
    idxs = [
        idx for idx, line in enumerate(data_list)
        if line.count(annotation) % 2 == 1
    ]
    # An odd number of boundaries means one block is never closed.
    if len(idxs) % 2 == 1:
        return []
    return idxs



def extract_cpp_annotation_idx(data_list):
    """Collect the line indices that open or close a /* */ block comment.

    Args:
        data_list: Text list (one string per line).
    Returns:
        idx: Indices of the lines containing exactly one of ``/*`` and
            ``*/``, in order of appearance. Lines containing both markers
            are ignored. An empty list is returned when the number of
            collected lines is odd.
    """
    boundary_lines = []
    for line_no, content in enumerate(data_list):
        has_open = '/*' in content
        has_close = '*/' in content
        # Keep the line only when exactly one of the two markers is present.
        if has_open != has_close:
            boundary_lines.append(line_no)
    if len(boundary_lines) % 2 == 1:
        return []
    return boundary_lines


def repalce_annotation_idx(annotation_idx, code_flags):
    """Mark every line inside the given comment ranges as code, in place.

    Args:
        annotation_idx: Flat list of line indices read as consecutive
            (first, last) pairs; both ends are inclusive.
        code_flags: List of per-line flags; entries inside each range are
            set to True. Nothing is returned.
    """
    for pair_start in range(0, len(annotation_idx), 2):
        first_line = annotation_idx[pair_start]
        last_line = annotation_idx[pair_start + 1]
        for line_no in range(first_line, last_line + 1):
            code_flags[line_no] = True


class SimpleCodeExtract(object):
    """Line-based heuristic extractor of code fragments from plain text."""

    def __init__(self):
        # Emits an empty line on stdout each time an instance is created.
        print()
        # logging.info('init code extract service')

    def is_code_new(self, line, feature_list, annotation_list, text_keywords_list, code_print_list):
        """Heuristically decide whether one line of text looks like source code.

        Args:
            line: The line to classify (string).
            feature_list: Substrings (symbols, keywords, ...) that suggest code.
            annotation_list: Comment delimiters; a line containing one of
                them is treated as code.
            text_keywords_list: Substrings that suggest prose.
            code_print_list: Print/definition keywords used to rescue lines
                that contain Chinese text.
        Returns:
            True when the line is judged to be code, False otherwise.
        """
        code_flag = False  # Default verdict; only becomes True when a feature matches
        count = 0  # Not used below
        cn_num = getnum_of_cn(line)  # Not used below
        en_num = getnum_of_en(line)  # Not used below
        cn_and_symbol_num = getnum_of_cn_and_symbol(line)
        for symbol in feature_list:
            if is_sub_string(line, symbol):
                # The line contains a code feature: tentatively code.
                code_flag = True
                for annotation in annotation_list:
                    if is_sub_string(line, annotation):
                        # A comment delimiter is present: code, stop scanning.
                        code_flag = True
                        break
                    else:
                        # This delimiter is absent: run the prose-vs-code checks.
                        for text_keyword in text_keywords_list:
                            if is_sub_string(line, text_keyword):
                                code_flag = False
                                break
                        # Lots of Chinese text: prose, unless the print check
                        # below sets the flag again.
                        if cn_and_symbol_num > 30:
                            code_flag = False

                        # With any Chinese text present, the verdict depends on
                        # a print-like keyword or a quoted value after '=' / ':'.
                        if cn_and_symbol_num > 0:
                            for print_keyword in code_print_list:
                                if is_sub_string(line, print_keyword):
                                    code_flag = True
                                    break
                                else:
                                    if re.search(r'[\=\:]\s*?[\'\"].*?[\'\"]', line):
                                        code_flag = True
                                    else:
                                        code_flag = False
        # Lines containing "line <number>" (e.g. traceback output) are never
        # code; this overrides everything decided above.
        if is_line_num(line):
            code_flag = False

        return code_flag

    def extract_code_fragment(self, text):
        """Return the runs of consecutive code-like lines found in text.

        Args:
            text: Either a string (split on newlines) or a list of lines.
                Any other type is treated as having no lines.
        Returns:
            List of strings, one per run of consecutive lines flagged as
            code, each line terminated by a newline. Runs without any Chinese
            character or ASCII letter are dropped.
        """
        text_list = []
        if type(text) == type(''):
            text_list = text.split('\n')
        elif type(text) == type([]):
            text_list = text
        else:
            pass
        code_flags = []
        #python comments
        annotation_idx_0 = extract_py_annotation_idx(text_list)
        annotation_idx_1 = extract_py_annotation_idx(text_list, annotation= "\"\"\"")
        #cpp comments
        annotation_idx_cpp = extract_cpp_annotation_idx(text_list)
        # First pass: classify every line on its own.
        for line in text_list:
            code_flag = self.is_code_new(line, code_features, code_annotation_list, text_keywords_list, code_print_list)
            code_flags.append(code_flag)      
        # Second pass: every line inside a paired multi-line comment counts as code.
        repalce_annotation_idx(annotation_idx_0, code_flags)

        repalce_annotation_idx(annotation_idx_1, code_flags)

        repalce_annotation_idx(annotation_idx_cpp, code_flags)

        res = []
        temstr = ''
        tem = []
        # Group consecutive flagged lines; each non-code line closes the
        # current group (which may be empty and is filtered out below).
        for idx, flag in enumerate(code_flags):
            assert len(code_flags) == len(text_list)
            if flag == True:
                tem.append(text_list[idx] + '\n')
            else:
                temstr = ''.join(tem)
                res.append(temstr)
                tem = []
                temstr = ''
        # Flush the group still open at the end of the text.
        res.append(''.join(tem))
        result = []
        # Keep only groups that contain at least one Chinese character or ASCII letter.
        for item in res:
            c_num = getnum_of_cn(item) + getnum_of_en_noblank(item)
            if c_num != 0:
                result.append(item)
        return result

def extract_markdown(data_list):
    """Split a list of lines into fenced markdown code blocks and plain text.

    A line counts as a fence when it contains three consecutive backticks.
    Fences are paired in order of appearance (1st with 2nd, 3rd with 4th, ...).

    Args:
        data_list: Text list (one string per line).
    Returns:
        codes: Extracted code snippets (list of strings, one per fenced
            block, lines joined with a newline).
        texts: The lines outside the fenced blocks joined with a newline
            (string type). When there are fewer than two fences, or the
            fences are unbalanced, ``([], data_list)`` is returned instead,
            i.e. the input list itself.
    """
    fence_idxs = [idx for idx, line in enumerate(data_list) if "```" in line]
    # Nothing to extract without at least one complete, balanced pair.
    if len(fence_idxs) < 2 or len(fence_idxs) % 2 == 1:
        return [], data_list

    codes = []
    text_lines = []
    cursor = 0  # index of the first line not yet consumed
    try:
        for opening, closing in zip(fence_idxs[::2], fence_idxs[1::2]):
            text_lines.extend(data_list[cursor:opening])
            codes.append('\n'.join(data_list[opening + 1:closing]))
            cursor = closing + 1
        text_lines.extend(data_list[cursor:])
        # Join with a newline so that the last line before a fenced block is
        # not fused with the first line after it.
        texts = '\n'.join(text_lines)
    except TypeError:
        # Non-string items cannot be joined; fall back to "no code found".
        return [], data_list
    return codes, texts

def remove_short_codelist(code_list):
    """Drop the code fragments that span fewer than three lines.

    A fragment's line count is the number of pieces obtained by splitting it
    on newlines, so a fragment ending in a newline counts one extra (empty)
    piece. Short fragments are filtered in a single pass instead of being
    removed from the list while it is iterated, which skipped elements and
    left empty-list placeholders in the result.

    Args:
        code_list: List of code fragments (strings).
    Returns:
        New list with the fragments that have at least three lines, in
        their original order.
    """
    return [item for item in code_list if len(item.split('\n')) >= 3]

def extract_code(question):
    """Extract every code snippet from a piece of (possibly HTML) text.

    Three sources are combined, in this order: the contents of <code>
    elements, fenced markdown blocks found in the remaining text, and the
    fragments the line heuristics flag in what is left, after short
    fragments have been filtered out by remove_short_codelist.

    Returns:
        List of code snippets.
    """
    extractor = SimpleCodeExtract()
    codes, text_lines = filter_html_and_extract_code(question)
    markdown_codes, markdown_text = extract_markdown(text_lines)
    codes.extend(markdown_codes)
    fragments = extractor.extract_code_fragment(markdown_text)
    codes.extend(remove_short_codelist(fragments))
    return codes

test

# Sample input: a short Python snippet given as plain text, with no HTML
# tags and no markdown fences.
test = """
for item in testdata:
    result = extract_code(item)
    result_200.append(''.join(result))
"""
# The individual stages can be called on their own ...
(codes, text_lines) = filter_html_and_extract_code(test)
markdown_codes, markdown_text = extract_markdown(text_lines)
# ... or the whole pipeline can be run in one call.
code_list = extract_code(test)
print(extract_code(test))

# Output (the four-space indentation comes back as tabs):
#["for item in testdata:\n\tresult = extract_code(item)\n\tresult_200.append(''.join(result))\n"]

summary

1. After more than 20 days of effort, the accuracy of the code extraction service has finally been raised above 90%, which is a good start.
2. A person may walk fast, but a group of people can go further
3. Thank you for meeting such a good team 🌹🌹🌹
4. Come on

Programmer Think