d2v_d2

[1]:
import ast
import json

from tqdm import tqdm

from EduNLP.utils import dict2str4sif

def load_items(path="../../../data/OpenLUNA.json"):
    """Lazily yield one parsed JSON value per non-blank line of a JSON Lines file.

    Parameters
    ----------
    path : str or path-like, optional
        File to read, opened as UTF-8 text. Defaults to the OpenLUNA data
        file path used by this notebook, so ``load_items()`` keeps working
        exactly as before.

    Yields
    ------
    object
        The result of ``json.loads`` for each non-blank line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Whitespace-only lines (e.g. a trailing newline at end of file)
            # are not records; json.loads would raise on them.
            if not line.strip():
                continue
            yield json.loads(line)

from EduNLP.Pretrain import GensimSegTokenizer

# Build `sif_items`: one tokenizer result per loaded item, keeping only
# the results that are truthy (non-empty).
tokenizer = GensimSegTokenizer(depth=2)
sif_items = []
for item in tqdm(load_items(), "sifing"):
    # "stem" is always serialized; "options" only when it parses to a
    # non-empty value.
    keys = ["stem"]
    # SECURITY: "options" arrives as a string holding a Python literal.
    # The original code ran eval() on it, which executes arbitrary
    # expressions read from the data file. ast.literal_eval accepts only
    # literals (strings, numbers, tuples, lists, dicts, sets, booleans,
    # None) and raises ValueError/SyntaxError on anything else.
    item["options"] = ast.literal_eval(item["options"])
    if item["options"]:
        keys.append("options")
    try:
        item_str = dict2str4sif(
            item,
            key_as_tag=True,
            add_list_no_tag=False,
            keys=keys,
            tag_mode="head"
        )
    except TypeError:
        # Best-effort: an item that dict2str4sif rejects with TypeError is
        # skipped rather than aborting the whole run.
        continue
    sif_item = tokenizer(
        item_str
    )
    # Drop items for which the tokenizer produced nothing.
    if sif_item:
        sif_items.append(sif_item)
d:\env\python3.8\lib\site-packages\gensim\similarities\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.
  warnings.warn(msg)
sifing: 792it [00:54, 14.51it/s]
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "①" (9312) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "②" (9313) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "③" (9314) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "④" (9315) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ":" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "二" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "项" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "式" used in math mode [unicodeTextInMathMode]'
"LaTeX-incompatible input and strict mode is set to 'warn': Too few columns specified in the {array} column argument. [textEnv]"
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ":" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
"LaTeX-incompatible input and strict mode is set to 'warn': Too few columns specified in the {array} column argument. [textEnv]"
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
"LaTeX-incompatible input and strict mode is set to 'warn': Too few columns specified in the {array} column argument. [textEnv]"
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "<" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ":" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ":" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ":" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ":" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "则" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "所" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "成" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "角" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "的" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "正" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "弦" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "值" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "为" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "则" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "四" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "点" used in math mode [unicodeTextInMathMode]'
"LaTeX-incompatible input and strict mode is set to 'warn': Too few columns specified in the {array} column argument. [textEnv]"
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
"LaTeX-incompatible input and strict mode is set to 'warn': Too few columns specified in the {array} column argument. [textEnv]"
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
"LaTeX-incompatible input and strict mode is set to 'warn': Too few columns specified in the {array} column argument. [textEnv]"
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "<" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "<" used in math mode [unicodeTextInMathMode]'
[2]:
# Display the tokenized result stored at index 1 of `sif_items` as a spot check.
sif_items[1]
[2]:
[['\\SIFTag{stem}'],
 ['[TEXT_BEGIN]', '复数'],
 ['[FORMULA_BEGIN]',
  'mathord',
  '=',
  'textord',
  '+',
  'textord',
  'mathord',
  '+',
  'mathord',
  'textord',
  '{ }',
  '^'],
 ['[TEXT_BEGIN]'],
 ['[FORMULA_BEGIN]', 'textord', 'mathord', 'textord', '='],
 ['\\SIFTag{options}'],
 ['[TEXT_BEGIN]'],
 ['\\SIFSep'],
 ['[TEXT_BEGIN]'],
 ['\\SIFSep'],
 ['[FORMULA_BEGIN]', 'textord', '{ }', '\\sqrt'],
 ['\\SIFSep'],
 ['[TEXT_BEGIN]']]