d2v_bow_tfidf¶
1. Load and tokenize test_items
[1]:
from platform import processor
from gensim import corpora,models
# from collections import defaultdict
import json
from tqdm import tqdm
from EduNLP.Pretrain import GensimWordTokenizer,train_vector
from EduNLP.Vector import D2V
from EduNLP.SIF.segment import seg
from EduNLP.SIF.tokenization import tokenize
import time
# Prefix of the raw data file name. The original note lists the intended
# choices as: subject = english | liberal | science | all.
output_file_head = "test"

# Root directory of the d2v data on the author's machine.
baseDir = "E:/Workustc/lunadata/d2v"
# baseDir = "/home/qlh/data_pretrain"

# Location of the raw JSON-lines file; only used by the file-based loader
# that is currently commented out in load_items().
work_file_path = f"{baseDir}/data/{output_file_head}_raw.json"

# Small in-memory corpus: one SIF-style question mixing Chinese text, LaTeX
# formulas and figure/blank markers, followed by nine short English sentences.
# Every entry is a dict with the single key "ques_content".
test_items = [
    {"ques_content": text}
    for text in (
        "有公式$\\FormFigureID{wrong1?}$和公式$\\FormFigureBase64{wrong2?}$,"
        "如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,"
        "若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$",
        "Human machine interface for lab abc computer applications",
        "A survey of user opinion of computer system response time",
        "The EPS user interface management system",
        "System and human system engineering testing of EPS",
        "Relation of user perceived response time to error measurement",
        "The generation of random binary unordered trees",
        "The intersection graph of paths in trees",
        "Graph minors IV Widths of trees and well quasi ordering",
        "Graph minors A survey",
    )
]
def load_items():
    """Yield the question dicts to be tokenized, one at a time.

    The generator currently walks the in-memory ``test_items`` list in
    order. A file-based variant (kept below as a comment) would instead
    read ``work_file_path`` line by line and JSON-decode each line.
    """
    yield from test_items
    # with open(work_file_path, 'r', encoding="utf-8") as f:
    #     for line in f:
    #         yield json.loads(line)
def data2Token():
    """Tokenize the ``"ques_content"`` of every item from ``load_items()``.

    Each question string is passed to a
    ``GensimWordTokenizer(symbol="gmas", general=True)``; for every truthy
    result its ``tokens`` attribute is collected. A summary line with the
    number of collected items is printed once the loop finishes.

    Returns:
        list: one ``tokens`` value per item whose tokenization result was
        truthy, in input order.
    """
    # The tokenizer's constructor arguments never change, so build it once
    # instead of once per item.
    # NOTE(review): the original also carried an unused
    # ``{"formula_params": {"method": "linear"}}`` dict (linear tokenization
    # rather than AST) for a manual ``seg(..., symbol="gmas")`` +
    # ``tokenize(...)`` path that was commented out, in which everything
    # except text and formulas is converted to special marks. Presumably
    # ``general=True`` selects the equivalent setting -- confirm against the
    # EduNLP documentation.
    tokenizer = GensimWordTokenizer(symbol="gmas", general=True)

    token_items = []
    for item in tqdm(load_items(), "sifing"):
        token_item = tokenizer(item["ques_content"])
        # Skip items whose tokenization result is falsy.
        if token_item:
            token_items.append(token_item.tokens)

    print("[data2Token] finish ========================> num = ", len(token_items))
    return token_items
# Tokenize the whole corpus once; the training and inference cells below
# reuse this list.
token_items = data2Token()
# Notebook display: the tokens produced for the first (Chinese/LaTeX) item.
token_items[0]
D:\MySoftwares\Anaconda\envs\data\lib\site-packages\gensim\similarities\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.
warnings.warn(msg)
sifing: 10it [00:00, 18.57it/s]
[data2Token] finish ========================> num = 10
[1]:
['公式',
'[FORMULA]',
'公式',
'[FORMULA]',
'如图',
'[FIGURE]',
'x',
',',
'y',
'约束条件',
'[SEP]',
'z',
'=',
'x',
'+',
'7',
'y',
'最大值',
'[MARK]']
[2]:
# Notebook display: how many tokens the first item produced.
len(token_items[0])
[2]:
19
2. Train and test the model with ‘bow’
[3]:
from EduNLP.Pretrain import train_vector
# Train a bag-of-words ("bow") model on the tokenized items. The second
# argument is a path prefix; per the recorded output the model is saved as
# "<prefix>bow.bin" and that path is returned.
train_vector(token_items, "../../../data/d2v/gensim_luna_stem_tf_", method="bow")
EduNLP, INFO model is saved to ../../../data/d2v/gensim_luna_stem_tf_bow.bin
[3]:
'../../../data/d2v/gensim_luna_stem_tf_bow.bin'
[4]:
from EduNLP.Vector import D2V
# Load the model file written by the "bow" training cell above.
d2v = D2V("../../../data/d2v/gensim_luna_stem_tf_bow.bin", method = "bow")
# Vectorize the second tokenized item; the recorded output is a dense list
# of integer counts.
print(d2v(token_items[1]))
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
3. Train and test the model with ‘tfidf’
[5]:
from EduNLP.Pretrain import train_vector
# Train a TF-IDF model on the tokenized items using the same path prefix.
# Per the recorded output, this run saves both a "bow" model and a "tfidf"
# model and returns the path of the "tfidf" one.
train_vector(token_items, "../../../data/d2v/gensim_luna_stem_tf_", method="tfidf")
EduNLP, INFO model is saved to ../../../data/d2v/gensim_luna_stem_tf_bow.bin
EduNLP, INFO model is saved to ../../../data/d2v/gensim_luna_stem_tf_tfidf.bin
[5]:
'../../../data/d2v/gensim_luna_stem_tf_tfidf.bin'
[6]:
from EduNLP.Vector import D2V
# Load the model file written by the "tfidf" training cell above.
d2v = D2V("../../../data/d2v/gensim_luna_stem_tf_tfidf.bin", method = "tfidf")
# Size of the vector space reported by the loaded model (63 in the recorded run).
vec_size = d2v.vector_size
print("vec_size = ", vec_size)
# Notebook display: the TF-IDF vector of the second item; the recorded output
# is a sparse list of (index, weight) pairs.
d2v(token_items[1])
vec_size = 63
[6]:
[(15, 0.37858374396389033),
(16, 0.37858374396389033),
(17, 0.37858374396389033),
(18, 0.2646186811599866),
(19, 0.37858374396389033),
(20, 0.2646186811599866),
(21, 0.37858374396389033),
(22, 0.37858374396389033)]