d2v_general¶
1. Get token example from item¶
Note: use the general (‘linear’) tokenization method, which means formulas are not parsed.
[1]:
# coding: utf-8
import json
from tqdm import tqdm
from EduNLP.SIF.segment import seg
from EduNLP.SIF.tokenization import tokenize
from EduNLP.Pretrain import GensimWordTokenizer
def load_items():
    """Yield the built-in demo items one at a time.

    Every item is a dict with a single ``'ques_content'`` key holding the raw
    question text.

    To read items from your own JSON-lines file instead, replace the body
    with something like::

        work_file_path = "../../../data/OpenLUNA.json"
        with open(work_file_path, 'r', encoding="utf-8") as f:
            for line in f:
                yield json.loads(line)
    """
    samples = (
        {'ques_content': '有公式$\\FormFigureID{wrong1?}$和公式$\\FormFigureBase64{wrong2?}$,如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'},
        {'ques_content': '如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'},
        # NOTE(review): `\1` is an octal escape (chr(1)), so this HTML sample
        # ends in a control character and looks truncated -- confirm intent.
        {'ques_content': '<div>Below is a discussion on a website.<br><table border=\1'},
    )
    yield from samples
# Build the tokenizer once, outside the loop: its arguments never change
# between items.
# symbol="gmas" transforms content into special marks ('g', 'm', 'a', 's'),
# leaving text ('t') and formula ('f') untouched.
# general=True symbolizes formulas in figure format and uses the 'linear'
# method for formula segmentation.
tokenizer = GensimWordTokenizer(symbol="gmas", general=True)

token_items = []
for item in tqdm(load_items(), "sifing"):
    token_item = tokenizer(item["ques_content"])
    # Keep only items for which the tokenizer returned a truthy result.
    if token_item:
        token_items.append(token_item.tokens)

# Notebook cell result: the tokens of the first item.
token_items[0]
D:\MySoftwares\Anaconda\envs\data\lib\site-packages\gensim\similarities\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.
warnings.warn(msg)
sifing: 3it [00:00, 5.07it/s]
[1]:
['公式',
'[FORMULA]',
'公式',
'[FORMULA]',
'如图',
'[FIGURE]',
'x',
',',
'y',
'约束条件',
'[SEP]',
'z',
'=',
'x',
'+',
'7',
'y',
'最大值',
'[MARK]']
[2]:
len(token_items)
[2]:
3
2. Load Model and test item¶
[7]:
from urllib.request import urlopen
import os,os.path
import zipfile
def down_file(subject):
    """Download the ``general_<subject>_256.zip`` model archive.

    The archive is fetched from the model zoo URL built below and written to
    ``../../../data/d2v/zip/`` (relative to the current working directory).
    The directory is created when it does not exist yet.

    Args:
        subject: subject name inserted into the archive file name
            (e.g. ``"science"``).
    """
    url = "http://base.ustc.edu.cn/data/model_zoo/EduNLP/d2v/general_" + subject + "_256.zip"
    file_name = "../../../data/d2v/zip/" + url.split('/')[-1]
    # open(..., 'wb') does not create missing parent directories.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    block_sz = 8192
    # Context managers close both the HTTP response and the output file even
    # when the transfer fails half-way.
    with urlopen(url) as u, open(file_name, 'wb') as f:
        file_info = u.getheaders()
        print("[down file] file info : ", file_info)
        # Copy in fixed-size chunks so the archive is never held in memory.
        while True:
            buffer = u.read(block_sz)
            if not buffer:
                break
            f.write(buffer)
    print("[down file] finish !")
def unzip_file(subject):
    """Extract ``general_<subject>_256.zip`` into the models directory.

    Reads ``../../../data/d2v/zip/general_<subject>_256.zip`` and writes every
    file member below ``../../../data/d2v/models/`` (both relative to the
    current working directory). Missing directories are created as needed.

    Args:
        subject: subject name that is part of the archive file name.

    Raises:
        ValueError: if an archive member would be written outside the
            target directory.
    """
    import shutil  # local import: only this function needs it

    zipfilename = "../../../data/d2v/zip/general_" + subject + "_256.zip"
    unziptodir = "../../../data/d2v/models/"
    print("[unzip file] start ...")
    # makedirs also creates missing parents and nested member directories.
    os.makedirs(unziptodir, exist_ok=True)
    target_root = os.path.abspath(unziptodir)

    with zipfile.ZipFile(zipfilename) as zfobj:
        for member in zfobj.namelist():
            # Normalise separators for the output path only; the archive is
            # still read through the stored member name.
            name = member.replace('\\', '/')
            if name.endswith('/'):
                continue  # directory entry, nothing to write
            ext_filename = os.path.join(unziptodir, name).replace('\\', '/')
            # Refuse members such as "../x" or "/abs/x" that would escape
            # the target directory.
            resolved = os.path.abspath(ext_filename)
            if os.path.commonpath([target_root, resolved]) != target_root:
                raise ValueError("unsafe path in archive: " + member)
            print("save ======> ", ext_filename)
            os.makedirs(os.path.dirname(ext_filename), exist_ok=True)
            # Stream the member to disk instead of loading it all into memory.
            with zfobj.open(member) as src, open(ext_filename, 'wb') as outfile:
                shutil.copyfileobj(src, outfile)
    print("[unzip file] finish !")
def getData(subject = "english"):
    """Download, then unpack, the model archive for *subject*.

    subject = english | liberal | science | all
    """
    # Order matters: the archive must be on disk before it can be unzipped.
    for step in (down_file, unzip_file):
        step(subject)
# Subject whose model archive is fetched; getData's docstring lists
# english | liberal | science | all.
work_subject = "science"
# Runs down_file and then unzip_file for that subject (network and disk I/O).
getData(work_subject)
[down file] file info : [('Server', 'nginx'), ('Date', 'Thu, 08 Jul 2021 14:05:55 GMT'), ('Content-Type', 'application/zip'), ('Content-Length', '2035517115'), ('Connection', 'close'), ('Last-Modified', 'Thu, 08 Jul 2021 13:24:26 GMT'), ('ETag', '"60e6fc8a-795386bb"'), ('Accept-Ranges', 'bytes')]
[down file] finish !
[unzip file] start ...
save ======> ../../../data/d2v/models/general_science_256/general_science_256.bin
save ======> ../../../data/d2v/models/general_science_256/general_science_256.bin.dv.vectors.npy
save ======> ../../../data/d2v/models/general_science_256/general_science_256.bin.syn1neg.npy
save ======> ../../../data/d2v/models/general_science_256/general_science_256.bin.wv.vectors.npy
[unzip file] finish !
[4]:
# Show the tokens of the first item; the same list is passed to the model below.
print(token_items[0])
['如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '[SEP]', 'z', '=', 'x', '+', '7', 'y', '最大值', '[MARK]']
[2]:
from EduNLP.Vector import D2V
# Subject of the model to load; it selects both the folder and the file name.
work_subject = "science"
# Load the model file unpacked under ../../../data/d2v/models/.
d2v = D2V("../../../data/d2v/models/general_{0}_256/general_{0}_256.bin".format(work_subject))
print(d2v.vector_size)
# Notebook cell result: the model's output for the first item's tokens.
d2v(token_items[0])
256
[2]:
array([ 6.68359101e-02, -6.85622962e-03, 1.71755534e-03, -9.45999995e-02,
5.71297631e-02, -1.14749409e-01, -1.06426410e-01, -5.48244826e-02,
-1.01055816e-01, 6.82074800e-02, -3.01527120e-02, 1.88328531e-02,
-5.40650599e-02, 1.96987823e-01, 7.23450258e-02, -7.86591992e-02,
2.52593309e-02, -8.93113762e-02, 5.15675824e-03, 1.25454620e-01,
1.75611585e-01, 7.01171979e-02, -4.82840873e-02, 5.61073385e-02,
4.38053571e-02, 8.21266770e-02, 2.25354582e-02, 2.86612101e-02,
6.49044961e-02, 4.38563228e-02, -5.53747378e-02, 3.68891433e-02,
4.41701710e-02, -1.57279179e-01, -1.71185300e-01, -9.53545198e-02,
-3.68149281e-02, 1.03217609e-01, -4.01013494e-02, 1.34829208e-02,
-3.90383117e-02, 4.31797989e-02, -1.31486431e-01, -6.81887381e-03,
-3.09619904e-02, 1.09645449e-01, 9.19818357e-02, 1.05142176e-01,
-8.25446919e-02, -1.10780641e-01, -7.99699128e-02, 4.87378612e-03,
5.09812087e-02, -1.88464615e-02, 4.43719625e-02, -2.79577565e-03,
5.48942536e-02, 7.99279436e-02, -1.14065006e-01, -6.10431209e-02,
2.25610659e-02, -3.98695990e-02, -6.11394234e-02, -5.44755235e-02,
7.43018761e-02, -4.14421707e-02, -1.59866199e-01, -6.57487512e-02,
-1.21370479e-01, 5.41980937e-02, 5.50763076e-03, 5.59395552e-03,
1.20198451e-01, 6.72993287e-02, -8.41371343e-02, -3.98931094e-02,
-5.98041154e-02, -6.74210638e-02, -8.08542073e-02, 4.32682643e-03,
3.98905091e-02, -5.25522307e-02, -8.63379464e-02, 5.52122667e-02,
-1.91897918e-02, 6.72513470e-02, 1.63677037e-02, -4.64263670e-02,
4.26646275e-03, 2.88309101e-02, -3.19259726e-02, 6.76017851e-02,
1.18875027e-01, 4.90449667e-02, -7.11180866e-02, 2.42015105e-02,
6.09337091e-02, 9.15575251e-02, 7.54630268e-02, -5.30363468e-04,
5.33818686e-03, 2.14987155e-02, 1.37690797e-01, -8.63378346e-02,
5.83221130e-02, -3.59287485e-02, 7.56779611e-02, 2.51492225e-02,
1.17275678e-02, 9.37244594e-02, 3.03551462e-02, -1.35064060e-02,
6.28025606e-02, -1.67514980e-01, -1.24259945e-02, -1.95242167e-02,
6.93811625e-02, 7.72726461e-02, 7.74716437e-02, -1.47965118e-01,
-4.22228361e-04, 1.83783751e-02, -1.19136199e-01, -3.13477665e-02,
6.60038590e-02, 2.46255528e-02, 2.11933651e-03, -9.49578434e-02,
-2.49075815e-02, 1.01346388e-01, -5.71207069e-02, -4.76290993e-02,
2.79998290e-03, -8.29489976e-02, 4.29078564e-02, 4.00602221e-02,
1.03404291e-01, 7.92418346e-02, -3.14001106e-02, 2.04087533e-02,
-9.57951397e-02, -7.60837719e-02, -1.74582575e-03, -4.40510325e-02,
6.49931505e-02, -1.44915171e-02, 3.33687216e-02, -2.45348830e-02,
-4.90438566e-03, 8.16997364e-02, 1.56976636e-02, -2.20130035e-03,
-3.88220809e-02, 4.17613201e-02, 1.23736160e-02, 2.39638099e-03,
7.04660639e-02, -8.40025023e-03, 8.84754434e-02, 4.73559313e-02,
1.60846859e-02, 6.38007149e-02, -8.88152346e-02, -5.36189489e-02,
-3.58884176e-03, -7.97238126e-02, -2.48845778e-02, 6.67371228e-02,
-1.27798110e-01, 5.20749278e-02, -1.03058614e-01, -9.93425995e-02,
6.30614609e-02, 6.55593872e-02, 2.47250423e-02, 1.01459853e-01,
8.41867253e-02, 1.90107450e-02, -5.06304689e-02, 9.08671319e-03,
-1.11649349e-01, 4.15530279e-02, 3.82142738e-02, 7.48702586e-02,
1.00878365e-01, 7.18154162e-02, 2.41982359e-02, 4.45286110e-02,
2.29161587e-02, -6.85874224e-02, -6.66044280e-02, 5.26503660e-02,
1.44319907e-02, 7.72640528e-03, 4.93934080e-02, -4.20203842e-02,
-1.19266249e-02, -3.40296179e-02, -5.05692326e-02, -1.01971209e-01,
5.03124930e-02, 1.07444279e-01, 2.78240931e-03, -6.46820664e-02,
2.53117532e-02, 1.04838371e-01, -5.48670478e-02, -8.49981084e-02,
-1.75488254e-04, -7.08199888e-02, -8.43240973e-03, 9.51339304e-02,
-1.88117087e-01, 1.78130921e-02, 2.86972504e-02, -5.94706945e-02,
4.38547023e-02, 4.58841883e-02, -3.49672660e-02, -6.55051991e-02,
-7.90929198e-02, 3.29272039e-02, 2.99417619e-02, 1.12901134e-02,
-6.14368394e-02, -2.01964248e-02, 3.12223360e-02, 8.69451910e-02,
-1.85837403e-01, -1.25434086e-01, 1.11888051e-02, -1.12750731e-01,
4.47746105e-02, -6.38351589e-02, 2.88816690e-02, -2.45125685e-02,
3.97114865e-02, 8.87534320e-02, 1.15282401e-01, -6.65650517e-02,
-9.49165039e-03, 4.97242734e-02, 1.17295712e-01, -1.91902611e-02,
-3.20644900e-02, 1.36362026e-02, -3.73102799e-02, 8.89487471e-03,
-2.56872289e-02, -7.46497372e-03, -1.25288516e-02, -1.08435608e-01,
-3.12000625e-02, -1.22699983e-01, 4.24938798e-02, -1.87821351e-02],
dtype=float32)