d2v¶
[1]:
import warnings
from tqdm import tqdm
import json
from EduNLP.utils import dict2str4sif
def load_items(path="../../../data/OpenLUNA.json"):
    """Lazily yield one parsed JSON value per non-blank line of a JSON-lines file.

    Args:
        path: Location of the file to read, one JSON document per line.
            Defaults to the OpenLUNA dump this notebook was written against,
            so existing ``load_items()`` calls behave as before.

    Yields:
        The object produced by ``json.loads`` for each non-blank line,
        in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # A blank line (commonly a trailing newline at end of file) is not
            # a JSON document; skip it instead of letting json.loads raise.
            if not line:
                continue
            yield json.loads(line)
from EduNLP.Pretrain import GensimSegTokenizer

# Tokenizer applied to every SIF-formatted item string built in the loop below.
tokenizer = GensimSegTokenizer(depth=None)

sif_items = []
n_format_errors = 0  # items for which dict2str4sif raised TypeError
n_empty = 0  # items whose tokenization result was empty/falsy

for item in tqdm(load_items(), "sifing"):
    keys = ["stem"]
    # SECURITY NOTE(review): eval() executes whatever expression the data file
    # contains. That is only acceptable while the dataset is fully trusted;
    # if "options" is always a plain list literal, ast.literal_eval would be
    # the safe replacement - confirm against the data before switching.
    item["options"] = eval(item["options"])
    # Only tag the options section when the item actually has options.
    if item["options"]:
        keys.append("options")
    try:
        item_str = dict2str4sif(
            item,
            key_as_tag=True,
            add_list_no_tag=False,
            keys=keys,
            tag_mode="head"
        )
    except TypeError:
        # Best-effort: an item that cannot be serialized is skipped, but it is
        # counted so the drop is visible instead of silent.
        n_format_errors += 1
        continue
    sif_item = tokenizer(item_str)
    if sif_item:
        sif_items.append(sif_item)
    else:
        n_empty += 1

# Account for every item that did not make it into sif_items.
print(
    f"kept {len(sif_items)} items; "
    f"skipped {n_format_errors} (TypeError in dict2str4sif), "
    f"{n_empty} (empty tokenization)"
)
d:\env\python3.8\lib\site-packages\gensim\similarities\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.
warnings.warn(msg)
sifing: 792it [00:51, 15.27it/s]
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "①" (9312) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "②" (9313) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "③" (9314) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "④" (9315) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ":" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "二" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "项" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "式" used in math mode [unicodeTextInMathMode]'
"LaTeX-incompatible input and strict mode is set to 'warn': Too few columns specified in the {array} column argument. [textEnv]"
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ":" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
"LaTeX-incompatible input and strict mode is set to 'warn': Too few columns specified in the {array} column argument. [textEnv]"
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
"LaTeX-incompatible input and strict mode is set to 'warn': Too few columns specified in the {array} column argument. [textEnv]"
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "<" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ":" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ":" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ":" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ":" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "则" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "所" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "成" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "角" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "的" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "正" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "弦" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "值" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "为" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "则" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "四" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "点" used in math mode [unicodeTextInMathMode]'
"LaTeX-incompatible input and strict mode is set to 'warn': Too few columns specified in the {array} column argument. [textEnv]"
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
"LaTeX-incompatible input and strict mode is set to 'warn': Too few columns specified in the {array} column argument. [textEnv]"
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
"LaTeX-incompatible input and strict mode is set to 'warn': Too few columns specified in the {array} column argument. [textEnv]"
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "<" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "<" used in math mode [unicodeTextInMathMode]'
[2]:
# Sanity check: display the first tokenized item (the first entry appended to sif_items).
sif_items[0]
[2]:
[['\\SIFTag{stem}'],
['已知', '集合'],
['mathord',
'=',
'mathord',
'\\mid',
'mathord',
'textord',
'{ }',
'^',
'-',
'textord',
'mathord',
'-',
'textord',
'<',
'textord',
'\\{',
',',
'mathord',
'=',
'\\{',
'-',
'textord',
',',
'textord',
',',
'textord',
',',
'textord',
'\\}',
','],
['mathord', '\\cap', 'mathord', '='],
['\\SIFTag{options}'],
['\\', '{', '\\'],
['\\', '{', '\\'],
['\\', '{', '\\'],
['\\', '{', '\\']]
[3]:
# Number of items kept, i.e. those that were serialized and produced a non-empty tokenization.
len(sif_items)
[3]:
770
[ ]:
# Fit a vector model on the tokenized items via EduNLP's train_vector helper.
from EduNLP.Pretrain import train_vector
# NOTE(review): TaggedDocument is imported but never referenced in this cell.
from gensim.models.doc2vec import TaggedDocument
# NOTE(review): the notebook is titled "d2v", yet no method argument is passed and
# the output prefix lives under "w2v/" - confirm that train_vector's default
# training method is the one intended here.
train_vector(
    sif_items,
    # presumably the path prefix under which the trained model is saved - TODO confirm
    "../../../data/w2v/gensim_luna_stem_tf_",
    # presumably the embedding dimension - TODO confirm against train_vector's signature
    10
)