d2v_bow_tfidf

1. load and tokenize test_items

[1]:
from platform import processor
from gensim import corpora,models
# from collections import defaultdict
import json
from tqdm import tqdm
from EduNLP.Pretrain import GensimWordTokenizer,train_vector
from EduNLP.Vector import D2V
from EduNLP.SIF.segment import seg
from EduNLP.SIF.tokenization import tokenize
import time

# Prefix of the raw-data file name.
# Values noted by the author for "subject": english | liberal | science | all.
output_file_head = "test"

# Root directory holding the d2v data.
# Alternative location kept from the original: "/home/qlh/data_pretrain"
baseDir = "E:/Workustc/lunadata/d2v"

# Raw corpus file: <baseDir>/data/<output_file_head>_raw.json
work_file_path = f"{baseDir}/data/{output_file_head}_raw.json"

# In-memory sample questions: one item carrying SIF markers
# (\FormFigureID, \FigureID, \SIFSep, \SIFBlank, ...) followed by nine short
# English sentences.
_sample_questions = (
    '有公式$\\FormFigureID{wrong1?}$和公式$\\FormFigureBase64{wrong2?}$,如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$',
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
)

# Each record exposes its text under the "ques_content" key.
test_items = [{"ques_content": question} for question in _sample_questions]

def load_items():
    """Yield the question records to tokenize, one dict at a time.

    Records are taken from the in-memory ``test_items`` list. The original
    also kept a commented-out variant that opens ``work_file_path`` (UTF-8)
    and yields ``json.loads(line)`` for every line of that file.
    """
    yield from test_items

def data2Token():
    """Tokenize every item from ``load_items()`` and collect the token lists.

    The ``"ques_content"`` text of each item is passed to a
    ``GensimWordTokenizer`` created with ``symbol="gmas"`` and
    ``general=True``. Items for which the tokenizer returns a falsy result
    are skipped. A summary line with the number of kept items is printed.

    Returns:
        list: one entry per kept item, each being that result's ``.tokens``.
    """
    # Built once, outside the loop: the constructor arguments never change
    # between items, so re-creating the tokenizer per item was wasted work.
    # Original notes (translated): "linear tokenization, not AST" and
    # "apart from text and formulas, everything else is converted into
    # special marks".
    # NOTE(review): general=True presumably selects the linear formula
    # tokenization those notes describe - confirm against the EduNLP docs.
    tokenizer = GensimWordTokenizer(symbol="gmas", general=True)

    token_items = []
    for item in tqdm(load_items(), "sifing"):
        token_item = tokenizer(item["ques_content"])
        if not token_item:
            # Nothing usable came back for this item; leave it out.
            continue
        token_items.append(token_item.tokens)
    print("[data2Token] finish ========================> num = ",len(token_items))
    return token_items

# Tokenize the sample items once; the later training/vectorizing cells reuse
# this list.
token_items = data2Token()
# Last expression of the cell: displays the tokens of the first item.
token_items[0]
D:\MySoftwares\Anaconda\envs\data\lib\site-packages\gensim\similarities\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.
  warnings.warn(msg)
sifing: 10it [00:00, 18.57it/s]
[data2Token] finish ========================> num =  10

[1]:
['公式',
 '[FORMULA]',
 '公式',
 '[FORMULA]',
 '如图',
 '[FIGURE]',
 'x',
 ',',
 'y',
 '约束条件',
 '[SEP]',
 'z',
 '=',
 'x',
 '+',
 '7',
 'y',
 '最大值',
 '[MARK]']
[2]:
len(token_items[0])
[2]:
19

2. train and test model by ‘bow’

[3]:
from EduNLP.Pretrain import train_vector
# Train a bag-of-words ("bow") model on the tokenized items. The second
# argument is a path prefix: per the logged output, the model is saved to
# that prefix followed by "bow.bin", and that path is returned.
train_vector(token_items, "../../../data/d2v/gensim_luna_stem_tf_", method="bow")
EduNLP, INFO model is saved to ../../../data/d2v/gensim_luna_stem_tf_bow.bin
[3]:
'../../../data/d2v/gensim_luna_stem_tf_bow.bin'
[4]:
from EduNLP.Vector import D2V

# Load the bow model saved by the previous cell and vectorize the second
# tokenized item; the printed result is a flat list of integers.
d2v = D2V("../../../data/d2v/gensim_luna_stem_tf_bow.bin", method = "bow")
print(d2v(token_items[1]))
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

3. train and test model by ‘tfidf’

[5]:
from EduNLP.Pretrain import train_vector
# Train a TF-IDF model on the same tokenized items, using the same path
# prefix. Per the logged output, this run writes both a "...bow.bin" and a
# "...tfidf.bin" file, and the tfidf path is the returned value.
train_vector(token_items, "../../../data/d2v/gensim_luna_stem_tf_", method="tfidf")
EduNLP, INFO model is saved to ../../../data/d2v/gensim_luna_stem_tf_bow.bin
EduNLP, INFO model is saved to ../../../data/d2v/gensim_luna_stem_tf_tfidf.bin
[5]:
'../../../data/d2v/gensim_luna_stem_tf_tfidf.bin'
[6]:
from EduNLP.Vector import D2V

# Load the tfidf model saved by the previous cell.
d2v = D2V("../../../data/d2v/gensim_luna_stem_tf_tfidf.bin", method = "tfidf")
# Report the vector size exposed by the loaded model.
vec_size = d2v.vector_size
print("vec_size = ", vec_size)
# Last expression of the cell: vectorize the second tokenized item. The
# displayed result is a list of (int, float) pairs - presumably
# (token id, tf-idf weight); confirm against the EduNLP/gensim docs.
d2v(token_items[1])
vec_size =  63
[6]:
[(15, 0.37858374396389033),
 (16, 0.37858374396389033),
 (17, 0.37858374396389033),
 (18, 0.2646186811599866),
 (19, 0.37858374396389033),
 (20, 0.2646186811599866),
 (21, 0.37858374396389033),
 (22, 0.37858374396389033)]