d2v

[1]:
import ast
import json
import warnings

from tqdm import tqdm

from EduNLP.utils import dict2str4sif

def load_items(path="../../../data/OpenLUNA.json"):
    """Lazily yield one decoded JSON object per line of a JSON-lines file.

    Parameters
    ----------
    path : str
        Location of the JSON-lines file. Defaults to the dataset path this
        notebook has always used, so calling ``load_items()`` with no
        arguments behaves as before.

    Yields
    ------
    object
        The result of ``json.loads`` for each non-blank line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # A blank line (e.g. a trailing newline at the end of the file) is
            # not a record; json.loads would raise on it, so skip it.
            if not line:
                continue
            yield json.loads(line)

from EduNLP.Pretrain import GensimSegTokenizer

# Convert every raw item into SIF text and tokenize it, collecting the
# non-empty token structures in `sif_items`.
tokenizer = GensimSegTokenizer(depth=None)
sif_items = []
skipped_items = 0  # items for which dict2str4sif raised TypeError
for item in tqdm(load_items(), "sifing"):
    # SECURITY: this field comes from an external data file and used to be
    # passed to eval(), which would execute arbitrary code embedded in the
    # dataset. ast.literal_eval only accepts Python literals.
    # NOTE(review): assumes "options" is always the text of a literal such as
    # a list -- a non-literal expression now raises ValueError; confirm
    # against the dataset.
    # .strip() keeps eval()'s tolerance for leading whitespace, which
    # literal_eval lacks before Python 3.10.
    options = ast.literal_eval(item["options"].strip())
    item["options"] = options

    # Only ask for the "options" section when the item actually has options.
    keys = ["stem", "options"] if options else ["stem"]
    try:
        item_str = dict2str4sif(
            item,
            key_as_tag=True,
            add_list_no_tag=False,
            keys=keys,
            tag_mode="head"
        )
    except TypeError:
        # Best-effort: drop items dict2str4sif cannot handle, but keep count
        # so the loss is reported below instead of passing unnoticed.
        skipped_items += 1
        continue

    sif_item = tokenizer(item_str)
    if sif_item:
        sif_items.append(sif_item)

if skipped_items:
    warnings.warn(
        "skipped %d item(s) that dict2str4sif rejected with a TypeError"
        % skipped_items
    )
d:\env\python3.8\lib\site-packages\gensim\similarities\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.
  warnings.warn(msg)
sifing: 792it [00:51, 15.27it/s]
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "①" (9312) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "②" (9313) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "③" (9314) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "④" (9315) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ":" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "二" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "项" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "式" used in math mode [unicodeTextInMathMode]'
"LaTeX-incompatible input and strict mode is set to 'warn': Too few columns specified in the {array} column argument. [textEnv]"
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ":" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
"LaTeX-incompatible input and strict mode is set to 'warn': Too few columns specified in the {array} column argument. [textEnv]"
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
"LaTeX-incompatible input and strict mode is set to 'warn': Too few columns specified in the {array} column argument. [textEnv]"
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "<" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ":" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ":" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ":" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ":" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "则" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "所" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "成" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "角" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "的" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "正" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "弦" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "值" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "为" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "则" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "四" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "点" used in math mode [unicodeTextInMathMode]'
"LaTeX-incompatible input and strict mode is set to 'warn': Too few columns specified in the {array} column argument. [textEnv]"
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
"LaTeX-incompatible input and strict mode is set to 'warn': Too few columns specified in the {array} column argument. [textEnv]"
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
"LaTeX-incompatible input and strict mode is set to 'warn': Too few columns specified in the {array} column argument. [textEnv]"
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "<" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "<" used in math mode [unicodeTextInMathMode]'
[2]:
# Show the first collected item to inspect what the tokenizer produced.
sif_items[0]
[2]:
[['\\SIFTag{stem}'],
 ['已知', '集合'],
 ['mathord',
  '=',
  'mathord',
  '\\mid',
  'mathord',
  'textord',
  '{ }',
  '^',
  '-',
  'textord',
  'mathord',
  '-',
  'textord',
  '<',
  'textord',
  '\\{',
  ',',
  'mathord',
  '=',
  '\\{',
  '-',
  'textord',
  ',',
  'textord',
  ',',
  'textord',
  ',',
  'textord',
  '\\}',
  ','],
 ['mathord', '\\cap', 'mathord', '='],
 ['\\SIFTag{options}'],
 ['\\', '{', '\\'],
 ['\\', '{', '\\'],
 ['\\', '{', '\\'],
 ['\\', '{', '\\']]
[3]:
# Number of items kept: items skipped on TypeError or with an empty tokenizer
# result were never appended to sif_items.
len(sif_items)
[3]:
770
[ ]:
from EduNLP.Pretrain import train_vector
from gensim.models.doc2vec import TaggedDocument

# Train on the tokenized items. The string is handed to train_vector as its
# second positional argument and the integer as its third.
# NOTE(review): this notebook is titled "d2v" but no method argument is passed
# here -- confirm that train_vector's default is the intended model type.
model_prefix = "../../../data/w2v/gensim_luna_stem_tf_"
dim = 10  # presumably the embedding size -- confirm against train_vector's signature
train_vector(sif_items, model_prefix, dim)