preface
This article builds on the previous one, Code extraction function optimization (I). It adds handling for Python multi-line comments, C++ multi-line comments and Markdown code fences, fixes several small bugs, and raises the extraction accuracy to 92%.
1, Code extraction and construction ideas
2, Introduction to each module
1. Extract the code in markdown
Idea:
1. Check whether ``` appears in the line; if it does, increment count and record the index of that line
2. Once the fence indices are collected, slice the content between each pair of indices out of the original list
The code below makes the whole procedure easier to follow
def extract_markdown(data_list):
    """Split a list of lines into fenced markdown code blocks and remaining text.

    A line containing "```" is treated as a fence. Fences are paired in order
    (1st with 2nd, 3rd with 4th, ...); the lines strictly between a pair form
    one code snippet, and every line outside the pairs is kept as text.

    Args:
        data_list: Text list (one string per line).

    Returns:
        codes: Extracted code snippets (list of strings, lines joined by "\n").
        texts: Remaining text as a single string with lines joined by "\n".
            When there are fewer than two fences, or an odd number of them,
            nothing is extracted and the result is ``([], data_list)`` with
            the input list returned unchanged.
    """
    fence_idxs = [idx for idx, line in enumerate(data_list) if "```" in line]

    # Fences must come in pairs; otherwise leave the input untouched.
    if len(fence_idxs) < 2 or len(fence_idxs) % 2 != 0:
        return [], data_list

    codes = []
    text_lines = []
    prev_close = -1
    for open_idx, close_idx in zip(fence_idxs[0::2], fence_idxs[1::2]):
        text_lines.extend(data_list[prev_close + 1:open_idx])
        codes.append('\n'.join(data_list[open_idx + 1:close_idx]))
        prev_close = close_idx
    text_lines.extend(data_list[prev_close + 1:])

    # Join with "\n" so the line before a fence is not glued to the line
    # that follows the fence.
    return codes, '\n'.join(text_lines)
2. Extract the index of python multiline annotation
The overall idea is the same as for extracting the code in markdown, so I won't repeat it here. Here is the code:
def extract_py_annotation_idx(data_list, annotation="'''"):
    """Find the lines that open or close a Python multi-line comment.

    A line toggles the "inside a comment" state only when it contains the
    delimiter an odd number of times. A line such as ``'''doc'''`` opens and
    closes the comment on the same line, so it is skipped (the same rule the
    C++ extractor applies to ``/* ... */`` on one line).

    Args:
        data_list: Text list (one string per line).
        annotation: Comment delimiter to look for, "'''" by default.

    Returns:
        idxs: Line indices in order, to be read pairwise as
            (open, close, open, close, ...). An empty list is returned when
            the delimiters are unbalanced.
    """
    idxs = [
        idx for idx, line in enumerate(data_list)
        if line.count(annotation) % 2 == 1
    ]
    # Delimiters must exist in pairs; otherwise report nothing.
    if len(idxs) % 2 != 0:
        return []
    return idxs
3. Extract C + + multiline comments
This differs slightly from the previous two because the opening and closing delimiters are different, but the overall idea is the same
def extract_cpp_annotation_idx(data_list):
    """Collect the indices of lines that open or close a /* */ comment.

    A line is recorded when it contains exactly one of the two delimiters
    "/*" and "*/". Lines containing both (comment opened and closed on the
    same line) and lines containing neither are ignored.

    Args:
        data_list: Text list (one string per line).

    Returns:
        idx: Comment index list, read pairwise by the caller; empty when an
            odd number of delimiter lines was found.
    """
    marker_lines = []
    for pos, text in enumerate(data_list):
        has_open = '/*' in text
        has_close = '*/' in text
        # Exactly one delimiter on the line -> it starts or ends a comment.
        if has_open != has_close:
            marker_lines.append(pos)

    if len(marker_lines) % 2:
        return []
    return marker_lines
Whatever the language, keep one thing in mind when handling multi-line comments: the comment delimiters must occur in pairs!
4. Code judgment logic
1. Each line is passed in and judged on its own: the function returns True if the line is code and False otherwise.
2. The incoming line is first assumed not to be code (False) and is only switched to True after certain conditions are met.
# Punctuation that commonly appears in code.
code_symbol_list = ['~', '#', '$', '%', '^', '&', '*', '(', ')', '-', '+', ',', ':', '=', '/',
                    '|', ';', '{', '}', '[', ']', '<', '>', '~', '?']
# Output / print calls of various languages.
code_print_list = ['print_r', 'var_dump', 'NSLog', 'println', 'console.log', 'print', 'printf',
                   'cout', 'Console.WriteLine', 'Println', 'Response.Write', 'alert', 'echo']
# Comment markers of various languages.
code_annotation_list = [r'#', r'//', r'<!--', r'-->', r'/*', r'*/', r"'''", r'--', r'%']
# Common keywords.
common_lib_list = ['import', 'from', 'yield', 'public', 'class', 'if', 'while', 'var', 'typedef',
                   'struct', 'package', 'Dim', 'input']
# String-related definitions; merged into code_print_list below.
def_str_list = ['char', '#define', 'string', 'String', 'concat', 'findall']
# Phrases that indicate ordinary text rather than code.
text_keywords_list = ['Reprinted in', "What's the reason?", 'error', 'ERROR', 'Error', ':\\',
                      'report errors', '?', '!', 'Why?', '[img]', 'caption']

code_print_list.extend(def_str_list)

# Everything that can make a line a code candidate.
code_features = []
code_features.extend(code_symbol_list)
code_features.extend(common_lib_list)
code_features.extend(code_print_list)
code_features.extend(code_annotation_list)


def is_code_new(self, line, feature_list, annotation_list, text_keywords_list, code_print_list):
    """Decide whether a single line looks like code.

    The line starts out as "not code" and is only accepted when it contains
    at least one code feature and none of the later checks rejects it.

    Args:
        line: The line to judge (string).
        feature_list: Symbols/keywords that make a line a code candidate.
        annotation_list: Comment markers of various languages.
        text_keywords_list: Phrases that mark a line as ordinary text.
        code_print_list: Print/output keywords.

    Returns:
        True if the line is judged to be code, False otherwise.
    """
    # Without any code feature the line is never code.
    if not any(feature in line for feature in feature_list):
        return False

    # Error reports such as "... in line 12" are easily misjudged as code, so
    # any line containing "line <number>" is rejected.
    # NOTE(review): the flattened listing does not show the nesting of this
    # check; it is applied unconditionally here, following the description
    # "set to False as long as the line contains line xxx" - confirm.
    if is_line_num(line):
        return False

    # A comment marker is enough to accept the line.
    if any(annotation in line for annotation in annotation_list):
        return True

    # No comment marker: Chinese characters / punctuation are only accepted
    # when they are printed or are a quoted value, for example
    #     dict = {
    #         'name': 'Zhang San',
    #         'gender': 'male'
    #     }
    # This decision replaces the text-keyword result and the "more than 30
    # Chinese characters" rejection, which the original performed first and
    # then overwrote whenever the count was greater than 0.
    if getnum_of_cn_and_symbol(line) > 0:
        if any(print_keyword in line for print_keyword in code_print_list):
            return True
        return re.search(r'[\=\:]\s*?[\'\"].*?[\'\"]', line) is not None

    # No Chinese content: text keywords turn the line back into plain text.
    return not any(text_keyword in line for text_keyword in text_keywords_list)
5. Subsequent optimization
1. Although the accuracy is already above 90%, there are only 400 test samples, so some cases may have been missed, such as SQL code, 51 assembly code and some other key fields. Future work will focus on these cases, with the goal of raising the accuracy above 95%.
2. I considered adding the reserved words of all languages to code_features. Although this would improve accuracy, it would greatly reduce execution efficiency. As a basic module, the code extraction service has to take both efficiency and accuracy into account, so a balance must be struck between the two.
3, All codes
import pandas as pd  # NOTE(review): not used anywhere in this listing
import re
from html.parser import HTMLParser

try:
    from bs4 import BeautifulSoup
except ImportError:
    # bs4 is only needed to parse HTML input. Without it the module still
    # loads and filter_html_and_extract_code() falls back to plain text.
    BeautifulSoup = None

# Punctuation that commonly appears in code.
code_symbol_list = ['~', '#', '$', '%', '^', '&', '*', '(', ')', '-', '+', ',', ':', '=', '/',
                    '|', ';', '{', '}', '[', ']', '<', '>', '~', '?']
# Output / print calls of various languages.
code_print_list = ['print_r', 'var_dump', 'NSLog', 'println', 'console.log', 'print', 'printf',
                   'cout', 'Console.WriteLine', 'Println', 'Response.Write', 'alert', 'echo']
# Comment markers of various languages.
code_annotation_list = [r'#', r'//', r'<!--', r'-->', r'/*', r'*/', r"'''", r'--', r'%']
# Common keywords.
code_keywords_list = ['import', 'from', 'yield', 'public', 'class', 'if', 'while', 'var',
                      'typedef', 'struct', 'package', 'Dim', 'input', 'return', 'else', 'auto']
# String-related definitions; merged into code_print_list below.
def_str_list = ['char', '#define', 'string', 'String', 'concat', 'findall']
# Phrases that indicate ordinary text rather than code.
text_keywords_list = ['Reprinted in', "What's the reason?", 'error', 'ERROR', 'Error', ':\\',
                      'report errors', '?', '!', 'Why?', '[img]', 'caption']

code_print_list.extend(def_str_list)

# Everything that can make a line a code candidate.
code_features = []
code_features.extend(code_symbol_list)
code_features.extend(code_keywords_list)
code_features.extend(code_print_list)
code_features.extend(code_annotation_list)


def is_line_num(inputdata):
    """Return True if the string contains "line <number>", as error reports do."""
    return re.search(r'line [0-9]{1,}', inputdata) is not None


def tab_2_spaces(text):
    """Replace every run of four spaces with a tab character.

    Despite its name the conversion goes from spaces to tabs; the name is
    kept for compatibility.
    """
    return text.replace('    ', '\t')


def pretty_lines(texts):
    """Flatten text chunks into a list of non-blank lines.

    Args:
        texts: Iterable of strings; None entries are skipped.

    Returns:
        List of lines, split on "\r\n" and "\n", with whitespace-only lines
        removed and four-space indentation converted to tabs.
    """
    text_lines = []
    for text in texts:
        if text is None:
            continue
        for sub_line in text.split('\r\n'):
            text_lines += [tab_2_spaces(l) for l in sub_line.split('\n') if l.strip() != '']
    return text_lines


def pretty_one_line(texts):
    """Concatenate all non-blank lines of the given text chunks into one string."""
    text_lines = []
    for text in texts:
        for sub_line in text.split('\r\n'):
            text_lines += [l for l in sub_line.split('\n') if l.strip() != '']
    return ''.join(text_lines)


class CodeHTMLParser(HTMLParser):
    """Separate the content of <code> elements from the surrounding text.

    Attributes are filled while feeding:
        codes: one string per outermost <code> element.
        texts: data chunks found outside of <code> elements.
    """

    def __init__(self):
        super().__init__()
        self.code_tag_count = 0   # nesting depth of currently open <code> tags
        self.current_code = []    # data chunks of the <code> element being read
        self.codes = []
        self.texts = []
        self.last_text = None     # most recent data chunk outside <code>

    def handle_starttag(self, tag, attrs):
        if tag == 'code':
            self.code_tag_count += 1
        if self.code_tag_count == 0:
            self.handle_text_tag(tag, attrs)

    def handle_text_tag(self, tag, attrs):
        """Re-attach a header name that was parsed as a tag after '#include'.

        When the preceding text ends with '#include', a start tag such as
        <stdio.h> is really part of the source text, so " <tag>" is appended
        to that text chunk.
        """
        if self.last_text is None:
            return
        index = self.last_text.find('#include')
        if index == -1:
            return
        if self.last_text[index:].strip() != '#include':
            return
        if self.texts:
            # Only trailing whitespace is trimmed, so earlier lines in the
            # same chunk keep their spacing and line breaks.
            self.texts[-1] = self.texts[-1].rstrip() + ' <{}>'.format(tag)

    def handle_endtag(self, tag):
        if tag != 'code':
            return
        if self.code_tag_count == 1:
            self.codes.append('\n'.join(self.current_code))
            self.current_code = []
        # A stray </code> must not push the depth below zero, otherwise the
        # text/code bookkeeping of everything that follows is inverted.
        if self.code_tag_count > 0:
            self.code_tag_count -= 1

    def handle_data(self, data):
        if self.code_tag_count > 0:
            self.current_code.append(data)
        elif data is not None:
            self.last_text = data
            self.texts.append(data)
        else:
            self.last_text = ''


def filter_html_and_extract_code(text):
    """Remove HTML tags and extract the <code>...</code> snippets.

    Args:
        text: Raw text, possibly containing HTML.

    Returns:
        (codes, text_lines): lines of the <code> snippets and the remaining
        non-blank text lines. If bs4 is unavailable or parsing fails, a
        message is printed and ``([], pretty_lines([text]))`` is returned.
    """
    if BeautifulSoup is None:
        print('parse text as html failed, text:', 'bs4 is not installed')
        return ([], pretty_lines([text]))
    try:
        soup = BeautifulSoup(text, 'html.parser')
        codes = [tag.string for tag in soup.find_all('code')]
        parser = CodeHTMLParser()
        parser.feed(soup.prettify())
        parser.close()  # flush any data still buffered by the parser
        return (pretty_lines(codes), pretty_lines(parser.texts))
    except Exception as e:
        # Deliberate best effort: treat unparsable input as plain text.
        print('parse text as html failed, text:', str(e))
        return ([], pretty_lines([text]))


def getnum_of_cn(inputdata):
    """Calculate the number of Chinese characters in the string."""
    return len(re.findall(r'[\u4E00-\u9FFF]', inputdata))


def getnum_of_cn_and_symbol(inputdata):
    """Calculate the number of Chinese characters and Chinese punctuation marks.

    NOTE(review): the published listing counted ASCII ',', '. ' and ';',
    which contradicts the stated purpose and would count ordinary code
    punctuation; the full-width marks are assumed here - confirm against the
    original source.
    """
    chinese_chars = re.findall(r'[\u4E00-\u9FFF]', inputdata)
    return (len(chinese_chars) + inputdata.count('，')
            + inputdata.count('。') + inputdata.count('；'))


def getnum_of_en_noblank(inputdata):
    """Calculate the number of ASCII letters in the string."""
    return len(re.findall(r'[a-zA-Z]', inputdata))


def getnum_of_en(inputdata):
    """Calculate the sum of the number of ASCII letters and spaces in the string."""
    return len(re.findall(r'[a-zA-Z]', inputdata)) + inputdata.count(' ')


def is_sub_string(ori_string, sub_string):
    """Judge whether ori_string contains sub_string.

    Args:
        ori_string: String to search in.
        sub_string: String to look for.

    Returns:
        True if contained, False otherwise.
    """
    return sub_string in ori_string


def extract_py_annotation_idx(data_list, annotation="'''"):
    """Find the lines that open or close a Python multi-line comment.

    A line toggles the "inside a comment" state only when it contains the
    delimiter an odd number of times, so a one-line ``'''doc'''`` is skipped.

    Args:
        data_list: Text list (one string per line).
        annotation: Comment delimiter to look for, "'''" by default.

    Returns:
        idxs: Line indices read pairwise as (open, close, ...); empty when
            the delimiters are unbalanced.
    """
    idxs = [
        idx for idx, line in enumerate(data_list)
        if line.count(annotation) % 2 == 1
    ]
    if len(idxs) % 2 != 0:
        return []
    return idxs


def extract_cpp_annotation_idx(data_list):
    """Collect the indices of lines that open or close a /* */ comment.

    Lines containing both "/*" and "*/" are ignored.

    Args:
        data_list: Text list (one string per line).

    Returns:
        idx: Comment index list read pairwise; empty when an odd number of
            delimiter lines was found.
    """
    idxs = [
        idx for idx, line in enumerate(data_list)
        if ('/*' in line) != ('*/' in line)
    ]
    if len(idxs) % 2 != 0:
        return []
    return idxs


def repalce_annotation_idx(annotation_idx, code_flags):
    """Mark every line inside the given comment ranges as code, in place.

    Args:
        annotation_idx: Flat list of indices read pairwise as (start, end),
            both inclusive.
        code_flags: List of booleans, one per line; modified in place.

    The misspelled name is kept so existing callers keep working.
    """
    for start, end in zip(annotation_idx[0::2], annotation_idx[1::2]):
        for i in range(start, end + 1):
            code_flags[i] = True


class SimpleCodeExtract(object):
    """Heuristic, line-based extraction of code fragments from plain text."""

    def __init__(self):
        pass

    def is_code_new(self, line, feature_list, annotation_list,
                    text_keywords_list, code_print_list):
        """Decide whether a single line looks like code.

        Args:
            line: The line to judge (string).
            feature_list: Symbols/keywords that make a line a code candidate.
            annotation_list: Comment markers of various languages.
            text_keywords_list: Phrases that mark a line as ordinary text.
            code_print_list: Print/output keywords.

        Returns:
            True if the line is judged to be code, False otherwise.
        """
        # Without any code feature the line is never code.
        if not any(feature in line for feature in feature_list):
            return False

        # Error reports such as "... in line 12" are rejected.
        # NOTE(review): the flattened listing does not show the nesting of
        # this check; it is applied unconditionally here - confirm.
        if is_line_num(line):
            return False

        # A comment marker is enough to accept the line.
        if any(annotation in line for annotation in annotation_list):
            return True

        # Chinese content is only accepted when it is printed or is a quoted
        # value such as 'name': 'Zhang San'.
        if getnum_of_cn_and_symbol(line) > 0:
            if any(print_keyword in line for print_keyword in code_print_list):
                return True
            return re.search(r'[\=\:]\s*?[\'\"].*?[\'\"]', line) is not None

        # No Chinese content: text keywords turn the line back into text.
        return not any(text_keyword in line for text_keyword in text_keywords_list)

    def extract_code_fragment(self, text):
        """Group consecutive code-looking lines into fragments.

        Args:
            text: A string (split on "\n") or a list of lines; any other
                type is treated as empty input.

        Returns:
            List of fragments; each fragment is its lines joined with a
            trailing "\n" per line. Fragments without any Chinese character
            or ASCII letter are dropped.
        """
        if isinstance(text, str):
            text_list = text.split('\n')
        elif isinstance(text, list):
            text_list = text
        else:
            text_list = []

        code_flags = [
            self.is_code_new(line, code_features, code_annotation_list,
                             text_keywords_list, code_print_list)
            for line in text_list
        ]

        # Everything inside a multi-line comment counts as code.
        repalce_annotation_idx(extract_py_annotation_idx(text_list), code_flags)
        repalce_annotation_idx(
            extract_py_annotation_idx(text_list, annotation="\"\"\""), code_flags)
        repalce_annotation_idx(extract_cpp_annotation_idx(text_list), code_flags)

        fragments = []
        current = []
        for line, flag in zip(text_list, code_flags):
            if flag:
                current.append(line + '\n')
            else:
                fragments.append(''.join(current))
                current = []
        fragments.append(''.join(current))

        return [
            item for item in fragments
            if getnum_of_cn(item) + getnum_of_en_noblank(item) != 0
        ]


def extract_markdown(data_list):
    """Split a list of lines into fenced markdown code blocks and remaining text.

    A line containing "```" is treated as a fence; fences are paired in
    order and the lines strictly between a pair form one code snippet.

    Args:
        data_list: Text list (one string per line).

    Returns:
        codes: Extracted code snippets (list of strings).
        texts: Remaining text as one string with lines joined by "\n". With
            fewer than two fences, or an odd number of them, the result is
            ``([], data_list)`` with the input list returned unchanged.
    """
    fence_idxs = [idx for idx, line in enumerate(data_list) if "```" in line]
    if len(fence_idxs) < 2 or len(fence_idxs) % 2 != 0:
        return [], data_list

    codes = []
    text_lines = []
    prev_close = -1
    for open_idx, close_idx in zip(fence_idxs[0::2], fence_idxs[1::2]):
        text_lines.extend(data_list[prev_close + 1:open_idx])
        codes.append('\n'.join(data_list[open_idx + 1:close_idx]))
        prev_close = close_idx
    text_lines.extend(data_list[prev_close + 1:])

    # Join with "\n" so text before a fence is not glued to text after it.
    return codes, '\n'.join(text_lines)


def remove_short_codelist(code_list):
    """Drop fragments that split into fewer than three "\n"-separated parts."""
    return [item for item in code_list if len(item.split('\n')) >= 3]


def extract_code(question):
    """Extract all code snippets from a piece of text.

    The text goes through three stages: <code> elements of HTML, fenced
    markdown blocks, and finally the line-based heuristic on what is left.

    Args:
        question: Raw text (string).

    Returns:
        List of extracted code snippets.
    """
    extractor = SimpleCodeExtract()
    codes, text_lines = filter_html_and_extract_code(question)
    markdown_codes, markdown_text = extract_markdown(text_lines)
    codes.extend(markdown_codes)
    rest_codes = extractor.extract_code_fragment(markdown_text)
    codes.extend(remove_short_codelist(rest_codes))
    return codes
test
# Sample input: a three-line code fragment indented with four spaces.
test = """
for item in testdata:
    result = extract_code(item)
    result_200.append(''.join(result))
"""

# Run the whole pipeline once and show the extracted fragments.
code_list = extract_code(test)
print(code_list)
# output
# ["for item in testdata:\n\tresult = extract_code(item)\n\tresult_200.append(''.join(result))\n"]
summary
1. After more than 20 days of efforts, the effect of code extraction service is finally improved to 90 +%, which is a good start.
2. A person may walk fast, but a group of people can go further
3. Thank you for meeting such a good team 🌹🌹🌹
4. Come on