d2v_stem_tf
[1]:
import json
from tqdm import tqdm
def load_items(path="../../../data/OpenLUNA.json"):
    """Lazily yield one parsed JSON object per line of a JSON-lines file.

    Parameters
    ----------
    path : str
        Location of the JSON-lines file. Defaults to the OpenLUNA dataset
        path used by this notebook, so ``load_items()`` keeps working as before.

    Yields
    ------
    object
        The result of ``json.loads`` for each non-blank line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # A trailing or stray blank line is not a record; json.loads("")
            # would raise, so skip it instead of aborting the whole load.
            if not line:
                continue
            yield json.loads(line)
from EduNLP.Pretrain import GensimWordTokenizer
# Tokenizer shared by every item; the meaning of symbol="gm" is defined by
# EduNLP (NOTE(review): confirm against the GensimWordTokenizer docs).
tokenizer = GensimWordTokenizer(symbol="gm")

# Collect the token list of every item whose stem tokenizes to a truthy result.
sif_items = []
for record in tqdm(load_items(), "sifing"):
    tokenized = tokenizer(record["stem"])
    if not tokenized:
        # Falsy tokenizer results are dropped, so len(sif_items) may be
        # smaller than the number of records read.
        continue
    sif_items.append(tokenized.tokens)

# Show the first token list as the cell output.
sif_items[0]
d:\env\python3.8\lib\site-packages\gensim\similarities\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.
warnings.warn(msg)
sifing: 792it [00:39, 19.82it/s]
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "①" (9312) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "②" (9313) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "③" (9314) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "④" (9315) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ":" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "二" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "项" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "式" used in math mode [unicodeTextInMathMode]'
"LaTeX-incompatible input and strict mode is set to 'warn': Too few columns specified in the {array} column argument. [textEnv]"
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ":" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
"LaTeX-incompatible input and strict mode is set to 'warn': Too few columns specified in the {array} column argument. [textEnv]"
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
"LaTeX-incompatible input and strict mode is set to 'warn': Too few columns specified in the {array} column argument. [textEnv]"
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "<" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ":" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ":" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ":" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ":" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "则" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "所" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "成" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "角" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "的" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "正" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "弦" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "值" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "为" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "则" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "四" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "点" used in math mode [unicodeTextInMathMode]'
"LaTeX-incompatible input and strict mode is set to 'warn': Too few columns specified in the {array} column argument. [textEnv]"
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
"LaTeX-incompatible input and strict mode is set to 'warn': Too few columns specified in the {array} column argument. [textEnv]"
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
"LaTeX-incompatible input and strict mode is set to 'warn': Too few columns specified in the {array} column argument. [textEnv]"
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "<" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "<" used in math mode [unicodeTextInMathMode]'
[1]:
['已知',
'集合',
'mathord',
'=',
'mathord',
'\\mid',
'mathord',
'textord',
'{ }',
'^',
'-',
'textord',
'mathord',
'-',
'textord',
'<',
'textord',
'\\{',
',',
'mathord',
'=',
'\\{',
'-',
'textord',
',',
'textord',
',',
'textord',
',',
'textord',
'\\}',
',',
'mathord',
'\\cap',
'mathord',
'=']
[2]:
# Number of items kept after tokenization; items whose tokenizer result was
# falsy were skipped when sif_items was built, so this can be lower than the
# number of records iterated.
len(sif_items)
[2]:
788
[3]:
from EduNLP.Pretrain import train_vector
[4]:
# Train 10-dimensional vectors with the "d2v" method (not fastText).
# The second argument is a path prefix: the call returns the saved model path,
# which the logged output shows as the prefix plus "d2v_10.bin".
# NOTE(review): every epoch logs loss-0.0000 — presumably the loss is not
# tracked for this method; confirm before relying on it.
train_vector(sif_items, "../../../data/w2v/gensim_luna_stem_tf_", 10, method="d2v")
EduNLP, INFO Epoch #0: loss-0.0000
EduNLP, INFO Epoch #1: loss-0.0000
EduNLP, INFO Epoch #2: loss-0.0000
EduNLP, INFO Epoch #3: loss-0.0000
EduNLP, INFO Epoch #4: loss-0.0000
EduNLP, INFO Epoch #5: loss-0.0000
EduNLP, INFO Epoch #6: loss-0.0000
EduNLP, INFO Epoch #7: loss-0.0000
EduNLP, INFO Epoch #8: loss-0.0000
EduNLP, INFO Epoch #9: loss-0.0000
EduNLP, INFO model is saved to ../../../data/w2v/gensim_luna_stem_tf_d2v_10.bin
[4]:
'../../../data/w2v/gensim_luna_stem_tf_d2v_10.bin'
[5]:
from EduNLP.Vector import D2V
# Load the model file written by the train_vector call in the previous cell.
d2v = D2V("../../../data/w2v/gensim_luna_stem_tf_d2v_10.bin")
# Apply the model to the first token list; the cell output is its vector
# (a 10-element float32 array in the recorded run).
d2v(sif_items[0])
[5]:
array([-0.16680606, -0.04633714, 0.05006265, 0.2665265 , -0.04968905,
0.00479913, 0.0338746 , 0.04415674, -0.22469515, -0.00274831],
dtype=float32)