Source code for EduNLP.Formula.ast.ast

# coding: utf-8
# 2021/5/20 @ tongshiwei
from typing import List, Dict
from .katex import katex


# Names exported when this module is star-imported (``from ... import *``).
# NOTE(review): ``link_variable`` is not defined in the part of the file visible here —
# confirm it is defined further down.
__all__ = ["str2ast", "get_edges", "ast", "link_variable", "katex_parse"]


def katex_parse(formula):
    """
    Parse ``formula`` with the bundled KaTeX wrapper.

    The parser is invoked with ``displayMode`` and ``trust`` both enabled and
    the parse result is returned after conversion with ``to_list()``.
    """
    options = {'displayMode': True, 'trust': True}
    parsed = katex.katex.__parse(formula, options)
    return parsed.to_list()


def str2ast(formula: str, *args, **kwargs):
    """
    Build the feature tree of a formula given as a string.

    Thin wrapper around ``ast`` that forces ``is_str=True``; every other
    positional or keyword argument is forwarded unchanged.
    """
    return ast(formula, *args, is_str=True, **kwargs)


# KaTeX node types that produce no tree node at all:
# "size"/"internal" (unknown usage), "cr" (new line), "infix",
# "rule" (layout only) and "tag" (not supported by katex yet).
_SKIPPED_TYPES = frozenset(["size", "internal", "cr", "infix", "rule", "tag"])

# Leaf node types whose text is copied verbatim from one field of the KaTeX node.
_LEAF_TEXT_FIELDS = {
    "raw": "string",
    "verb": "body",  # original source text
    "spacing": "text",
    "accent-token": "text",
    "op-token": "text",
    "delimsizing": "delim",  # sized delimiters such as "(" or ")"
    "url": "url",
    "includegraphics": "src",
    "leftright-right": "delim",  # paired with "leftright"
    "middle": "delim",
}

# Inner node types that all share one shape.  Each entry is
# (fixed text, field holding the text, field holding the children, role stamped on the children).
# Exactly one of the first two is used; both are None when the node has no text.
_BRANCH_SPECS = {
    "styling": (None, None, "body", "body"),
    "overline": ("\\overline", None, "body", "body"),
    "underline": ("\\underline", None, "body", None),
    "accent": (None, "label", "base", "base"),
    "accentUnder": (None, "label", "base", "base"),
    "horizBrace": (None, "label", "base", "base"),
    "ordgroup": ("{ }", None, "body", "body"),
    "mclass": (None, "mclass", "body", "body"),
    "cdlabel": (None, "side", "label", "label"),
    "cdlabelparent": ("\\cdlabelparent", None, "fragment", "fragment"),
    "color": ("\\color", None, "body", None),
    "color-token": ("\\color-token", None, "body", None),
    "enclose": (None, "label", "body", None),
    "environment": (None, "name", "nameGroup", "nameGroup"),
    "href": (None, "href", "body", None),
    "html": ("\\html", None, "body", None),
    "font": (None, "font", "body", "body"),
    "hbox": ("\\hbox", None, "body", None),
    "vcenter": ("\\vcenter", None, "body", None),
    "lap": (None, "alignment", "body", None),  # llap | rlap
    "sizing": ("\\sizing", None, "body", None),
    "smash": ("\\smash", None, "body", None),
    "operatorname": ("\\operatorname", None, "body", None),
    "raisebox": ("\\raisebox", None, "body", None),
    "phantom": ("\\phantom", None, "body", None),
    "hphantom": ("\\hphantom", None, "body", None),
    "vphantom": ("\\vphantom", None, "body", None),
}

# Fields probed, in this order, for the children of a node type without a dedicated rule.
_FALLBACK_CHILD_FIELDS = (
    'body', 'base', 'sup', 'sub', 'numer', 'denom', 'index', 'below',
    'nameGroup', 'fragment', 'label', 'other',
)


def _add_branch(tree, tree_node, children):
    """Append ``tree_node`` to ``tree`` as an inner node and parse ``children`` beneath it."""
    first_child_id = tree_node['val']['id'] + 1
    tree_node['structure']['child'] = [first_child_id]
    tree.append(tree_node)
    # The recursive call appends the ids of later siblings to the list set above.
    subtree = ast(children, index=first_child_id, father_tree=tree)
    if subtree:
        tree.extend(subtree)
    else:
        # Nothing was produced (empty body or only skipped nodes): a pointer
        # to ``first_child_id`` would reference an unrelated node.
        tree_node['structure']['child'] = None


def ast(formula: (str, List[Dict]), index=0, forest_begin=0, father_tree=None, is_str=False):
    """
    Convert a formula into a flat, pre-order list of linked feature-tree nodes.

    The origin code author is https://github.com/hxwujinze

    Parameters
    ----------
    formula: str or List[Dict]
        Formula string (when ``is_str`` is true) or the node list obtained
        from the katex parser.  The node dicts are modified in place: a
        ``'role'`` key is written on the children of most node types.
    index: int
        Position of this subtree in the tree; ``forest_begin`` is added to it
        and the sum is the id given to the first node produced.
    forest_begin: int
        Start position of this tree in the forest.
    father_tree: List[Dict]
        Node list of the parent tree.  Its last node is the parent of every
        node produced by this call and has the ids of the second and later
        siblings appended to its ``child`` list.
    is_str: bool
        Whether ``formula`` is a string that still has to be parsed by katex.

    Returns
    ----------
    tree: List[Dict]
        The feature tree.  Each node is
        ``{'val': {'id', 'type', 'text', 'role'}, 'structure': {'bro', 'child', 'father', 'forest'}}``
        where ``bro`` is ``[previous sibling id, next sibling id]``, ``child``
        is a list of child ids or ``None`` and ``forest`` is left ``None``.
        Nodes of type ``array`` and ``leftright`` carry the extra ``val``
        keys ``arraydims`` and ``right``.

    todo: finish all types

    Notes
    ----------
    Some functions are not supportd in ``katex``

    e.g.,

    1. tag

       - ``\\begin{equation} \\tag{tagName} F=ma \\end{equation}``
       - ``\\begin{align} \\tag{1} y=x+z \\end{align}``
       - ``\\tag*{hi} x+y^{2x}``

    2. dddot

       - ``\\frac{ \\dddot y }{ x }``

    For more information, refer to
    `katex support table <https://github.com/KaTeX/KaTeX/blob/master/docs/support_table.md>`_
    """
    tree = []
    index += forest_begin
    json_ast: List[Dict] = (
        katex.katex.__parse(formula, {'displayMode': True, 'trust': True}).to_list() if is_str else formula
    )
    previous = None  # position in ``tree`` of the previous sibling that was kept

    for item in json_ast:
        node_type = item['type']
        if node_type in _SKIPPED_TYPES:
            continue

        position = len(tree)
        tree_node = {
            'val': {'id': position + index, 'type': node_type, 'text': None, 'role': item.get('role')},
            'structure': {'bro': [None, None], 'child': None, 'father': None, 'forest': None},
        }
        val = tree_node['val']
        if index > forest_begin:
            # Every node of a nested call hangs below the node that precedes the subtree.
            tree_node['structure']['father'] = index - 1

        if node_type in ("mathord", "textord"):
            val['text'] = item['text'].replace('\\prime', '’')
            tree.append(tree_node)
        elif node_type == "atom":
            val['text'] = item['text'].replace('\\cdotp', '·')
            val['type'] = item['family']
            tree.append(tree_node)
        elif node_type == "op":
            # Operators without a name keep the generic text instead of raising KeyError.
            val['text'] = item['name'] if 'name' in item else "\\op"
            if item['symbol'] and 'body' in item:
                _add_branch(tree, tree_node, item['body'])
            else:
                tree.append(tree_node)
        elif node_type == "genfrac":
            item['numer']['role'] = 'numer'
            item['denom']['role'] = 'denom'
            val['text'] = '\\frac'
            _add_branch(tree, tree_node, [item['numer'], item['denom']])
        elif node_type == "sqrt":
            val['text'] = '\\sqrt'
            item['body']['role'] = 'body'
            children = [item['body']]
            if item.get('index'):  # optional root index
                item['index']['role'] = 'index'
                children.append(item['index'])
            _add_branch(tree, tree_node, children)
        elif node_type == "array":
            val['arraydims'] = item['arraystretch']
            val['text'] = '\\begin {matrix} \\end {matrix}'
            # Rows are flattened: every cell becomes a direct child of the array node.
            cells = [cell for row in item['body'] for cell in row]
            for cell in cells:
                cell['role'] = 'body'
            _add_branch(tree, tree_node, cells)
        elif node_type == "xArrow":
            val['text'] = item['label']
            item['body']['role'] = 'body'
            children = [item['body']]
            if item.get('below'):  # optional argument, handled like the sqrt index
                item['below']['role'] = 'below'
                children.append(item['below'])
            _add_branch(tree, tree_node, children)
        elif node_type == "supsub":
            val['text'] = "\\supsub"
            children = []
            if item.get('base') is not None:
                item['base']['role'] = 'base'
                children.append(item['base'])
            for part in ('sub', 'sup'):
                if item.get(part):
                    item[part]['role'] = part
                    children.append(item[part])
            if children:
                _add_branch(tree, tree_node, children)
            else:
                tree.append(tree_node)
        elif node_type == "leftright":
            val['text'] = item['left']
            val['right'] = item['right']
            for child in item['body']:
                child['role'] = 'body'
            _add_branch(tree, tree_node, item['body'])
        elif node_type == "kern":  # \quad
            val['text'] = node_type
            val['type'] = "ignore"
            tree.append(tree_node)
        elif node_type == "text":  # \text{}
            val['text'] = "".join(piece['text'] for piece in item['body'])
            tree.append(tree_node)
        elif node_type == "htmlmathml":
            val['text'] = "\\htmlmathml"
            children = [
                {'type': 'nodelist', 'role': 'html', 'body': item['html']},
                {'type': 'nodelist', 'role': 'mathml', 'body': item['mathml']},
            ]
            _add_branch(tree, tree_node, children)
        elif node_type == "mathchoice":
            # Content that depends on the current style, e.g. \mathchoice {#1}{#2}{#3}{#4}
            val['text'] = "\\mathchoice"
            children = [
                {'type': 'nodelist', 'role': choice, 'body': item[choice]}
                for choice in ("display", "text", "script", "scriptscript")
            ]
            _add_branch(tree, tree_node, children)
        elif node_type == "nodelist":
            # Synthetic wrapper created by the "htmlmathml" and "mathchoice" rules above.
            val['text'] = "\\" + item['role']
            _add_branch(tree, tree_node, item['body'])
        elif node_type in _LEAF_TEXT_FIELDS:
            val['text'] = item[_LEAF_TEXT_FIELDS[node_type]]
            tree.append(tree_node)
        elif node_type in _BRANCH_SPECS:
            fixed_text, text_field, child_field, child_role = _BRANCH_SPECS[node_type]
            val['text'] = fixed_text if text_field is None else item[text_field]
            children = item[child_field]
            if not isinstance(children, list):
                # A single child node and a list of child nodes are both accepted.
                children = [children]
            if child_role is not None:
                for child in children:
                    child['role'] = child_role
            # "mclass" goes through here too, so its node is appended before its
            # body is parsed and the body is attached to it, not to the previous node.
            _add_branch(tree, tree_node, children)
        else:
            # Unknown type: keep it as an "other" node and collect whatever
            # child-like fields it has.
            val['text'] = item['text'] if "text" in item else node_type
            val['type'] = "other"
            children = []
            for field in _FALLBACK_CHILD_FIELDS:
                if field not in item:
                    continue
                child = item[field]
                if field == "body" and not isinstance(child, dict):  # \text{}
                    children.extend(child)
                else:
                    child['role'] = field
                    children.append(child)
            _add_branch(tree, tree_node, children)

        # Link siblings by position among the nodes actually kept, so equal
        # dicts and skipped nodes cannot break or misdirect the links.
        if previous is not None:
            node_id = position + index
            tree_node['structure']['bro'][0] = previous + index
            tree[previous]['structure']['bro'][1] = node_id
            father = tree_node['structure']['father']
            if father_tree and father is not None:
                # ``father - index`` is -1 in nested calls, i.e. the last node of the parent tree.
                father_tree[father - index]['structure']['child'].append(node_id)
        previous = position

    return tree
def get_edges(forest):
    """
    Build the edge set of a forest.

    Parameters
    ----------
    forest: List[Dict]
        The forest: a list of nodes, each with ``node['val']['id']`` and a
        ``node['structure']`` dict holding ``bro``, ``child``, ``father`` and
        ``forest`` links.

    Returns
    ----------
    edges: list of tuple(src,dst,type)
        The edge set.  For every node, in this order: a self loop (type 1),
        the next then the previous sibling (type 2), each child (type 3),
        the father (type 4) and each forest link (type 5).  Links that are
        ``None`` yield no edge.
    """
    edges = []
    for node in forest:
        src = node["val"]["id"]
        links = node['structure']

        edges.append((src, src, 1))

        # Next sibling is emitted before the previous one.
        for sibling in (links['bro'][1], links['bro'][0]):
            if sibling is not None:
                edges.append((src, sibling, 2))

        children = links['child']
        if children is not None:
            edges.extend((src, child, 3) for child in children)

        father = links['father']
        if father is not None:
            edges.append((src, father, 4))

        forest_links = links['forest']
        if forest_links is not None:
            edges.extend((src, other, 5) for other in forest_links)
    return edges