w2v_stem_tf

[1]:
import json
from tqdm import tqdm

def load_items(path="../../../data/OpenLUNA.json"):
    """Lazily yield one parsed JSON value per non-blank line of a JSON Lines file.

    Parameters
    ----------
    path : str, optional
        File to read. Defaults to the OpenLUNA dataset location used by this
        notebook, so a bare ``load_items()`` call reads the same file as before.

    Yields
    ------
    object
        The result of ``json.loads`` for each non-blank line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Blank / whitespace-only lines (e.g. an extra empty line at the
            # end of the file) are not JSON documents; json.loads would raise
            # on them, so they are skipped instead of aborting the whole load.
            if not line.strip():
                continue
            yield json.loads(line)


from EduNLP.Pretrain import GensimWordTokenizer

tokenizer = GensimWordTokenizer(symbol="gm")

# Run the tokenizer over the "stem" field of every loaded item and collect the
# token list of each result; falsy tokenizer results are dropped.
sif_items = []
for item in tqdm(load_items(), desc="sifing"):
    tokenized = tokenizer(item["stem"])
    if not tokenized:
        continue
    sif_items.append(tokenized.tokens)

# Show the tokens of the first item as the cell output.
sif_items[0]
d:\env\python3.8\lib\site-packages\gensim\similarities\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.
  warnings.warn(msg)
sifing: 792it [00:42, 18.83it/s]
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "①" (9312) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "②" (9313) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "③" (9314) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "④" (9315) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ":" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "二" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "项" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "式" used in math mode [unicodeTextInMathMode]'
"LaTeX-incompatible input and strict mode is set to 'warn': Too few columns specified in the {array} column argument. [textEnv]"
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ":" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
"LaTeX-incompatible input and strict mode is set to 'warn': Too few columns specified in the {array} column argument. [textEnv]"
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
"LaTeX-incompatible input and strict mode is set to 'warn': Too few columns specified in the {array} column argument. [textEnv]"
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unrecognized Unicode character "﹣" (65123) [unknownSymbol]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "<" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ":" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ":" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ":" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ":" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "则" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "所" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "成" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "角" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "的" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "正" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "弦" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "值" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "为" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "则" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "四" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "点" used in math mode [unicodeTextInMathMode]'
"LaTeX-incompatible input and strict mode is set to 'warn': Too few columns specified in the {array} column argument. [textEnv]"
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
"LaTeX-incompatible input and strict mode is set to 'warn': Too few columns specified in the {array} column argument. [textEnv]"
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "(" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ")" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
"LaTeX-incompatible input and strict mode is set to 'warn': Too few columns specified in the {array} column argument. [textEnv]"
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "," used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character ">" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "<" used in math mode [unicodeTextInMathMode]'
'LaTeX-incompatible input and strict mode is set to \'warn\': Unicode text character "<" used in math mode [unicodeTextInMathMode]'
[1]:
['埃及',
 '胡夫',
 '金字塔',
 '古代',
 '世界',
 '建筑',
 '奇迹',
 '形状',
 '视为',
 '正四',
 '棱锥',
 '以该',
 '四',
 '棱锥',
 '高为',
 '边长',
 '正方形',
 '面积',
 '等于',
 '四',
 '棱锥',
 '侧面',
 '三角形',
 '面积',
 '侧面',
 '三角形',
 '底边',
 '高',
 '底面',
 '正方形',
 '边长',
 '比值',
 '[FIGURE]',
 '[FIGURE]',
 '[FIGURE]',
 '[FIGURE]',
 '[FIGURE]']
[3]:
# Sanity check: how many token lists were collected in sif_items.
len(sif_items)
[3]:
792
[4]:
from EduNLP.Pretrain import train_vector
[5]:
# Train 100-dimensional vectors with the default method. The second argument
# is an output path prefix; per the training log the model is written to
# "<prefix>sg_100.kv", i.e. the default method is skip-gram ("sg").
train_vector(sif_items, "../../../data/w2v/gensim_luna_stem_tf_", 100)
EduNLP, INFO Epoch #0: loss-0.0000
EduNLP, INFO Epoch #1: loss-0.0000
EduNLP, INFO Epoch #2: loss-0.0000
EduNLP, INFO Epoch #3: loss-0.0000
EduNLP, INFO Epoch #4: loss-0.0000
EduNLP, INFO model is saved to ../../../data/w2v/gensim_luna_stem_tf_sg_100.kv
[5]:
'../../../data/w2v/gensim_luna_stem_tf_sg_100.kv'
[6]:
# Train 50-dimensional vectors with the CBOW method. Per the training log the
# model is written to "<prefix>cbow_50.kv".
train_vector(sif_items, "../../../data/w2v/gensim_luna_stem_tf_", 50, method="cbow")
EduNLP, INFO Epoch #0: loss-0.0000
EduNLP, INFO Epoch #1: loss-0.0000
EduNLP, INFO Epoch #2: loss-0.0000
EduNLP, INFO Epoch #3: loss-0.0000
EduNLP, INFO Epoch #4: loss-0.0000
EduNLP, INFO model is saved to ../../../data/w2v/gensim_luna_stem_tf_cbow_50.kv
[6]:
'../../../data/w2v/gensim_luna_stem_tf_cbow_50.kv'
[7]:
# Train 10-dimensional vectors with the fastText method. Per the training log
# the model is written to "<prefix>fasttext_10.bin" (a .bin file, unlike the
# .kv files produced by the sg / cbow runs).
train_vector(sif_items, "../../../data/w2v/gensim_luna_stem_tf_", 10, method="fasttext")
EduNLP, INFO Epoch #0: loss-0.0000
EduNLP, INFO Epoch #1: loss-0.0000
EduNLP, INFO Epoch #2: loss-0.0000
EduNLP, INFO Epoch #3: loss-0.0000
EduNLP, INFO Epoch #4: loss-0.0000
EduNLP, INFO model is saved to ../../../data/w2v/gensim_luna_stem_tf_fasttext_10.bin
[7]:
'../../../data/w2v/gensim_luna_stem_tf_fasttext_10.bin'
[8]:
from EduNLP.Vector import W2V

# Load the 10-dimensional fastText model file written by the fastText training
# run; the method passed here matches the one used for training.
w2v = W2V("../../../data/w2v/gensim_luna_stem_tf_fasttext_10.bin", method="fasttext")
# Look up the embedding of the "[FIGURE]" token that appears in the tokenized stems.
w2v["[FIGURE]"]
[8]:
array([ 0.3322667 , -0.701586  , -0.6528301 , -0.02556002,  0.44070247,
        0.44261315,  0.54466563,  0.8991576 , -1.0600986 ,  0.19438864],
      dtype=float32)