w2v_stem_text

[1]:
import json
from tqdm import tqdm

def load_items(path="../../../data/OpenLUNA.json"):
    """Lazily yield one parsed object per line of a JSON-lines file.

    Parameters
    ----------
    path : str, optional
        File to read, decoded as UTF-8. Defaults to the OpenLUNA data file
        used by this notebook, so existing ``load_items()`` calls are
        unaffected.

    Yields
    ------
    object
        The result of ``json.loads`` for each non-blank line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        for line in f:
            # A blank line (e.g. a trailing newline at end of file) is not a
            # JSON document; skip it instead of failing the whole load.
            if not line.strip():
                continue
            yield json.loads(line)

from EduNLP.Pretrain import train_vector, GensimWordTokenizer

# Tokenizer applied to every item's "stem" text. In the recorded output,
# formulas appear as the '[FORMULA]' placeholder token.
# NOTE(review): presumably symbol="fgm" selects which element kinds are
# replaced by placeholders -- confirm against the EduNLP documentation.
tokenizer = GensimWordTokenizer(symbol="fgm")

# Build one token list per item, with a tqdm progress bar labelled "sifing".
sif_items = []
for record in tqdm(load_items(), "sifing"):
    sif_items.append(tokenizer(record["stem"]).tokens)

# Show the first token list as a quick sanity check.
sif_items[0]
d:\env\python3.8\lib\site-packages\gensim\similarities\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.
  warnings.warn(msg)
sifing: 792it [00:00, 845.20it/s]
[1]:
['已知', '集合', '[FORMULA]', '[FORMULA]']
[2]:
# Number of tokenized stems (one token list per item yielded by load_items()).
len(sif_items)
[2]:
792
[3]:
# 100-dimensional vectors trained with the skip-gram method.
# `method` is left at its default; the recorded output shows the model saved
# with an "sg_100.kv" suffix appended to the given path prefix.
# NOTE(review): the recorded log reports loss-0.0000 for every epoch --
# confirm whether loss tracking is actually enabled during training.
train_vector(sif_items, "../../../data/w2v/gensim_luna_stem_t_", 100)
EduNLP, INFO Epoch #0: loss-0.0000
EduNLP, INFO Epoch #1: loss-0.0000
EduNLP, INFO Epoch #2: loss-0.0000
EduNLP, INFO Epoch #3: loss-0.0000
EduNLP, INFO Epoch #4: loss-0.0000
EduNLP, INFO model is saved to ../../../data/w2v/gensim_luna_stem_t_sg_100.kv
[3]:
'../../../data/w2v/gensim_luna_stem_t_sg_100.kv'
[4]:
# 50-dimensional vectors trained with the CBOW method.
# The same path prefix is reused; the recorded output shows the method name
# and dimension appended to it ("cbow_50.kv").
train_vector(sif_items, "../../../data/w2v/gensim_luna_stem_t_", 50, method="cbow")
EduNLP, INFO Epoch #0: loss-0.0000
EduNLP, INFO Epoch #1: loss-0.0000
EduNLP, INFO Epoch #2: loss-0.0000
EduNLP, INFO Epoch #3: loss-0.0000
EduNLP, INFO Epoch #4: loss-0.0000
EduNLP, INFO model is saved to ../../../data/w2v/gensim_luna_stem_t_cbow_50.kv
[4]:
'../../../data/w2v/gensim_luna_stem_t_cbow_50.kv'
[5]:
# 10-dimensional vectors trained with the fastText method.
# The recorded output shows this variant saved with a ".bin" extension
# ("fasttext_10.bin") rather than ".kv".
train_vector(sif_items, "../../../data/w2v/gensim_luna_stem_t_", 10, method="fasttext")
EduNLP, INFO Epoch #0: loss-0.0000
EduNLP, INFO Epoch #1: loss-0.0000
EduNLP, INFO Epoch #2: loss-0.0000
EduNLP, INFO Epoch #3: loss-0.0000
EduNLP, INFO Epoch #4: loss-0.0000
EduNLP, INFO model is saved to ../../../data/w2v/gensim_luna_stem_t_fasttext_10.bin
[5]:
'../../../data/w2v/gensim_luna_stem_t_fasttext_10.bin'
[6]:
from EduNLP.Vector import W2V

# Load the skip-gram vectors from the path returned by the 100-dimension
# train_vector call earlier in this notebook.
w2v = W2V("../../../data/w2v/gensim_luna_stem_t_sg_100.kv")
# Look up the embedding of the formula placeholder token; the recorded output
# is a 100-element float32 array.
w2v["[FORMULA]"]
[6]:
array([-0.16754825,  0.2707899 ,  0.01005908, -0.03040857,  0.10938002,
       -0.28348687,  0.19054936,  0.41737646, -0.3885515 , -0.14650987,
        0.1157743 , -0.2406684 , -0.11294927,  0.12082661,  0.1759571 ,
        0.17807944,  0.07178611, -0.16182491, -0.18266837, -0.52223957,
       -0.05876796,  0.0450548 ,  0.26906556,  0.02253102,  0.1025768 ,
        0.29827935, -0.441235  , -0.06949052, -0.22638813, -0.10846554,
       -0.05917242,  0.12802479,  0.21151058, -0.4611071 , -0.16157094,
        0.32488874,  0.36630565, -0.36908495,  0.24223483, -0.3510737 ,
       -0.15079798,  0.10832163,  0.00392658, -0.20019084,  0.18827583,
       -0.17247967, -0.27385622,  0.17878376,  0.05156241,  0.30575123,
       -0.16626868,  0.01431947,  0.05540735,  0.03373449,  0.36685058,
       -0.05511234,  0.09583379, -0.09495933,  0.01121055,  0.18113017,
        0.29060405,  0.06472825,  0.20568778, -0.02780204, -0.17310621,
        0.23243082,  0.2480153 ,  0.07856195, -0.03825858,  0.10257348,
       -0.02105796,  0.4248383 ,  0.03114873, -0.09995517,  0.16022007,
        0.08843125,  0.06128069, -0.03922344,  0.02587396,  0.03067247,
        0.1209543 , -0.05948736, -0.25567266,  0.53167033, -0.4149    ,
        0.08551055,  0.42399153,  0.18317291,  0.12455773, -0.10759205,
        0.17496923,  0.2781072 ,  0.25744784,  0.1921185 ,  0.43071204,
        0.09138201, -0.37603223, -0.07436363,  0.2961049 ,  0.02517671],
      dtype=float32)