w2v_stem_text¶
[1]:
import json
from tqdm import tqdm
def load_items(path="../../../data/OpenLUNA.json"):
    """Lazily yield one decoded JSON value per non-blank line of a JSON-lines file.

    Parameters
    ----------
    path : str, optional
        File to read. Defaults to the dataset path this notebook was written
        against, so calling ``load_items()`` with no arguments reads that file.

    Yields
    ------
    object
        Whatever ``json.loads`` decodes from each line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        for line in f:
            # json.loads raises on an empty string, so a stray or trailing
            # blank line would otherwise abort the whole load.
            if not line.strip():
                continue
            yield json.loads(line)
from EduNLP.Pretrain import train_vector, GensimWordTokenizer
# Tokenizer used for every item stem. NOTE(review): the exact meaning of
# symbol="fgm" is defined by EduNLP and is not visible here; the sample output
# shows formulas appearing as the "[FORMULA]" placeholder token.
tokenizer = GensimWordTokenizer(symbol="fgm")

# One token list per item, built from each item's "stem" field while tqdm
# reports progress under the "sifing" label.
sif_items = [
    tokenizer(record["stem"]).tokens
    for record in tqdm(load_items(), desc="sifing")
]

# Show the first token list as a quick sanity check.
sif_items[0]
d:\env\python3.8\lib\site-packages\gensim\similarities\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.
warnings.warn(msg)
sifing: 792it [00:00, 845.20it/s]
[1]:
['已知', '集合', '[FORMULA]', '[FORMULA]']
[2]:
# Number of tokenized stems: one token list per item yielded by load_items().
len(sif_items)
[2]:
792
[3]:
# Train 100-dimensional word vectors with the skip-gram method on the tokenized stems.
# The second argument is a filename prefix: the log and the returned value show the
# model being saved as "<prefix>sg_100.kv", and the call returns that path.
# NOTE(review): the logged loss is 0.0000 for every epoch — presumably loss
# tracking is not enabled inside train_vector; confirm before relying on it.
train_vector(sif_items, "../../../data/w2v/gensim_luna_stem_t_", 100)
EduNLP, INFO Epoch #0: loss-0.0000
EduNLP, INFO Epoch #1: loss-0.0000
EduNLP, INFO Epoch #2: loss-0.0000
EduNLP, INFO Epoch #3: loss-0.0000
EduNLP, INFO Epoch #4: loss-0.0000
EduNLP, INFO model is saved to ../../../data/w2v/gensim_luna_stem_t_sg_100.kv
[3]:
'../../../data/w2v/gensim_luna_stem_t_sg_100.kv'
[4]:
# Train 50-dimensional word vectors with the CBOW method on the same tokenized stems.
# The returned path shows the model being saved as "<prefix>cbow_50.kv".
train_vector(sif_items, "../../../data/w2v/gensim_luna_stem_t_", 50, method="cbow")
EduNLP, INFO Epoch #0: loss-0.0000
EduNLP, INFO Epoch #1: loss-0.0000
EduNLP, INFO Epoch #2: loss-0.0000
EduNLP, INFO Epoch #3: loss-0.0000
EduNLP, INFO Epoch #4: loss-0.0000
EduNLP, INFO model is saved to ../../../data/w2v/gensim_luna_stem_t_cbow_50.kv
[4]:
'../../../data/w2v/gensim_luna_stem_t_cbow_50.kv'
[5]:
# Train 10-dimensional word vectors with the fastText method on the same tokenized stems.
# Unlike the two runs above, the returned path shows this model being saved with a
# ".bin" extension ("<prefix>fasttext_10.bin").
train_vector(sif_items, "../../../data/w2v/gensim_luna_stem_t_", 10, method="fasttext")
EduNLP, INFO Epoch #0: loss-0.0000
EduNLP, INFO Epoch #1: loss-0.0000
EduNLP, INFO Epoch #2: loss-0.0000
EduNLP, INFO Epoch #3: loss-0.0000
EduNLP, INFO Epoch #4: loss-0.0000
EduNLP, INFO model is saved to ../../../data/w2v/gensim_luna_stem_t_fasttext_10.bin
[5]:
'../../../data/w2v/gensim_luna_stem_t_fasttext_10.bin'
[6]:
from EduNLP.Vector import W2V
# Load the 100-dimensional skip-gram vectors written by the earlier train_vector call.
w2v = W2V("../../../data/w2v/gensim_luna_stem_t_sg_100.kv")
# Look up the embedding of the "[FORMULA]" placeholder token; the displayed result
# is a float32 array of 100 values.
w2v["[FORMULA]"]
[6]:
array([-0.16754825, 0.2707899 , 0.01005908, -0.03040857, 0.10938002,
-0.28348687, 0.19054936, 0.41737646, -0.3885515 , -0.14650987,
0.1157743 , -0.2406684 , -0.11294927, 0.12082661, 0.1759571 ,
0.17807944, 0.07178611, -0.16182491, -0.18266837, -0.52223957,
-0.05876796, 0.0450548 , 0.26906556, 0.02253102, 0.1025768 ,
0.29827935, -0.441235 , -0.06949052, -0.22638813, -0.10846554,
-0.05917242, 0.12802479, 0.21151058, -0.4611071 , -0.16157094,
0.32488874, 0.36630565, -0.36908495, 0.24223483, -0.3510737 ,
-0.15079798, 0.10832163, 0.00392658, -0.20019084, 0.18827583,
-0.17247967, -0.27385622, 0.17878376, 0.05156241, 0.30575123,
-0.16626868, 0.01431947, 0.05540735, 0.03373449, 0.36685058,
-0.05511234, 0.09583379, -0.09495933, 0.01121055, 0.18113017,
0.29060405, 0.06472825, 0.20568778, -0.02780204, -0.17310621,
0.23243082, 0.2480153 , 0.07856195, -0.03825858, 0.10257348,
-0.02105796, 0.4248383 , 0.03114873, -0.09995517, 0.16022007,
0.08843125, 0.06128069, -0.03922344, 0.02587396, 0.03067247,
0.1209543 , -0.05948736, -0.25567266, 0.53167033, -0.4149 ,
0.08551055, 0.42399153, 0.18317291, 0.12455773, -0.10759205,
0.17496923, 0.2781072 , 0.25744784, 0.1921185 , 0.43071204,
0.09138201, -0.37603223, -0.07436363, 0.2961049 , 0.02517671],
dtype=float32)