d2v_general

1. Get token example from item

Note: this uses the general ('linear') tokenization method, which means formulas are not parsed.

[1]:
# coding: utf-8
import json
from tqdm import tqdm
from EduNLP.SIF.segment import seg
from EduNLP.SIF.tokenization import tokenize
from EduNLP.Pretrain import GensimWordTokenizer

def load_items():
    """Yield the built-in demo items, one dict per question.

    Each yielded dict has a single key, ``'ques_content'``, holding the raw
    question text (SIF-style markup in the first two items, an HTML fragment
    in the third).
    """
    demo_contents = (
        '有公式$\\FormFigureID{wrong1?}$和公式$\\FormFigureBase64{wrong2?}$,如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$',
        '如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$',
        '<div>Below is a discussion on a website.<br><table border=\1',
    )
    for content in demo_contents:
        yield {'ques_content': content}
    # Alternative: stream items from your own JSON-lines file instead, e.g.
    # work_file_path = "../../../data/OpenLUNA.json"
    # with open(work_file_path, 'r', encoding="utf-8") as f:
    #     for line in f:
    #         yield json.loads(line)


# Build the tokenizer once: its configuration is the same for every item, so
# there is no need to re-create it on each loop iteration.
# symbol="gmas": transform content into special marks ('g', 'm', 'a', 's'),
#   leaving text ('t') and formula ('f') segments as they are.
# general=True: symbolize formulas given in figure format and use the
#   'linear' method for formula segmentation.
tokenizer = GensimWordTokenizer(symbol="gmas", general=True)

token_items = []
for item in tqdm(load_items(), "sifing"):
    token_item = tokenizer(item["ques_content"])
    # Keep only items for which the tokenizer returned a truthy result.
    if token_item:
        token_items.append(token_item.tokens)

# Display the tokens of the first item as the cell output.
token_items[0]
D:\MySoftwares\Anaconda\envs\data\lib\site-packages\gensim\similarities\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.
  warnings.warn(msg)
sifing: 3it [00:00,  5.07it/s]
[1]:
['公式',
 '[FORMULA]',
 '公式',
 '[FORMULA]',
 '如图',
 '[FIGURE]',
 'x',
 ',',
 'y',
 '约束条件',
 '[SEP]',
 'z',
 '=',
 'x',
 '+',
 '7',
 'y',
 '最大值',
 '[MARK]']
[2]:
len(token_items)
[2]:
3

2. Load Model and test item

[7]:
from urllib.request import urlopen
import os,os.path
import zipfile


def down_file(subject):
    """Download the ``general_<subject>_256.zip`` d2v model archive.

    The archive is fetched from the EduNLP model zoo and stored under
    ``../../../data/d2v/zip/`` (relative to the current working directory),
    using the last path component of the URL as the file name.

    Parameters
    ----------
    subject : str
        Subject part of the archive name, e.g. ``"science"``.

    Raises
    ------
    urllib.error.URLError
        If the URL cannot be opened.
    OSError
        If the destination directory or file cannot be created or written.
    """
    import shutil  # local import: not part of the notebook's import cell

    url = "http://base.ustc.edu.cn/data/model_zoo/EduNLP/d2v/general_" + subject + "_256.zip"
    file_name = "../../../data/d2v/zip/" + url.split('/')[-1]

    # The target directory may not exist yet on a fresh checkout.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)

    # Context managers close the HTTP response and the output file even if
    # the transfer fails part-way through.
    with urlopen(url) as u:
        print("[down file] file info : ", u.getheaders())
        with open(file_name, 'wb') as f:
            # Stream in 8 KiB chunks so the archive is never held in memory.
            shutil.copyfileobj(u, f, 8192)
    print("[down file] finish !")


def unzip_file(subject):
    """Extract ``general_<subject>_256.zip`` into the d2v models directory.

    Reads ``../../../data/d2v/zip/general_<subject>_256.zip`` and writes every
    file entry below ``../../../data/d2v/models/`` (both relative to the
    current working directory), creating intermediate directories as needed.

    Parameters
    ----------
    subject : str
        Subject part of the archive name, e.g. ``"science"``.

    Raises
    ------
    ValueError
        If an archive entry would be written outside the models directory.
    OSError
        If the archive cannot be read or a target file cannot be written.
    """
    import shutil  # local import: not part of the notebook's import cell

    zipfilename = "../../../data/d2v/zip/general_" + subject + "_256.zip"
    unziptodir = "../../../data/d2v/models/"
    print("[unzip file] start ...")
    # makedirs (not mkdir) so missing parent directories are created too.
    os.makedirs(unziptodir, exist_ok=True)
    root = os.path.realpath(unziptodir)

    with zipfile.ZipFile(zipfilename) as zfobj:
        for member in zfobj.namelist():
            # Normalise separators for the output path only; the archive must
            # still be read with the member's original name.
            name = member.replace('\\', '/')
            if name.endswith('/'):
                continue  # directory entry: nothing to write
            ext_filename = os.path.join(unziptodir, name).replace('\\', '/')

            # The archive is downloaded over plain HTTP, so refuse entries
            # (absolute paths, '..' components) that escape the target dir.
            target = os.path.realpath(ext_filename)
            if os.path.commonpath([root, target]) != root:
                raise ValueError("unsafe path in archive: " + member)

            print("save ======> ", ext_filename)
            os.makedirs(os.path.dirname(ext_filename), exist_ok=True)
            # Stream the member to disk instead of loading it fully into
            # memory; both handles are closed even if the copy fails.
            with zfobj.open(member) as src, open(ext_filename, 'wb') as outfile:
                shutil.copyfileobj(src, outfile)
    print("[unzip file] finish !")

def getData(subject = "english"):
    """Fetch a pretrained model archive and unpack it.

    Runs ``down_file`` and then ``unzip_file`` with the same ``subject``.
    Documented subject values: english | liberal | science | all.
    """
    for step in (down_file, unzip_file):
        step(subject)


work_subject = "science"

# The science archive is about 2 GB (Content-Length 2035517115 in the recorded
# response headers), so skip the download and extraction when the extracted
# model file is already present. Delete that file to force a fresh download.
model_bin_path = (
    "../../../data/d2v/models/general_" + work_subject + "_256/general_" + work_subject + "_256.bin"
)
if os.path.exists(model_bin_path):
    print("[get data] model already present, skip download : ", model_bin_path)
else:
    getData(work_subject)
[down file] file info :  [('Server', 'nginx'), ('Date', 'Thu, 08 Jul 2021 14:05:55 GMT'), ('Content-Type', 'application/zip'), ('Content-Length', '2035517115'), ('Connection', 'close'), ('Last-Modified', 'Thu, 08 Jul 2021 13:24:26 GMT'), ('ETag', '"60e6fc8a-795386bb"'), ('Accept-Ranges', 'bytes')]
[down file] finish !
[unzip file] start ...
save ======>  ../../../data/d2v/models/general_science_256/general_science_256.bin
save ======>  ../../../data/d2v/models/general_science_256/general_science_256.bin.dv.vectors.npy
save ======>  ../../../data/d2v/models/general_science_256/general_science_256.bin.syn1neg.npy
save ======>  ../../../data/d2v/models/general_science_256/general_science_256.bin.wv.vectors.npy
[unzip file] finish !
[4]:
# Print the token list stored at index 0 of token_items.
print(token_items[0])
['如图', '[FIGURE]', 'x', ',', 'y', '约束条件', '[SEP]', 'z', '=', 'x', '+', '7', 'y', '最大值', '[MARK]']
[2]:
from EduNLP.Vector import D2V
# Locate the extracted model: <models dir>/general_<subject>_256/general_<subject>_256.bin
work_subject = "science"
model_name = "general_" + work_subject + "_256"
model_path = "../../../data/d2v/models/" + model_name + "/" + model_name + ".bin"

# Load the model and report the size of the vectors it produces.
d2v = D2V(model_path)
print(d2v.vector_size)

# Embed the first tokenized item; the resulting vector is the cell output.
d2v(token_items[0])
256
[2]:
array([ 6.68359101e-02, -6.85622962e-03,  1.71755534e-03, -9.45999995e-02,
        5.71297631e-02, -1.14749409e-01, -1.06426410e-01, -5.48244826e-02,
       -1.01055816e-01,  6.82074800e-02, -3.01527120e-02,  1.88328531e-02,
       -5.40650599e-02,  1.96987823e-01,  7.23450258e-02, -7.86591992e-02,
        2.52593309e-02, -8.93113762e-02,  5.15675824e-03,  1.25454620e-01,
        1.75611585e-01,  7.01171979e-02, -4.82840873e-02,  5.61073385e-02,
        4.38053571e-02,  8.21266770e-02,  2.25354582e-02,  2.86612101e-02,
        6.49044961e-02,  4.38563228e-02, -5.53747378e-02,  3.68891433e-02,
        4.41701710e-02, -1.57279179e-01, -1.71185300e-01, -9.53545198e-02,
       -3.68149281e-02,  1.03217609e-01, -4.01013494e-02,  1.34829208e-02,
       -3.90383117e-02,  4.31797989e-02, -1.31486431e-01, -6.81887381e-03,
       -3.09619904e-02,  1.09645449e-01,  9.19818357e-02,  1.05142176e-01,
       -8.25446919e-02, -1.10780641e-01, -7.99699128e-02,  4.87378612e-03,
        5.09812087e-02, -1.88464615e-02,  4.43719625e-02, -2.79577565e-03,
        5.48942536e-02,  7.99279436e-02, -1.14065006e-01, -6.10431209e-02,
        2.25610659e-02, -3.98695990e-02, -6.11394234e-02, -5.44755235e-02,
        7.43018761e-02, -4.14421707e-02, -1.59866199e-01, -6.57487512e-02,
       -1.21370479e-01,  5.41980937e-02,  5.50763076e-03,  5.59395552e-03,
        1.20198451e-01,  6.72993287e-02, -8.41371343e-02, -3.98931094e-02,
       -5.98041154e-02, -6.74210638e-02, -8.08542073e-02,  4.32682643e-03,
        3.98905091e-02, -5.25522307e-02, -8.63379464e-02,  5.52122667e-02,
       -1.91897918e-02,  6.72513470e-02,  1.63677037e-02, -4.64263670e-02,
        4.26646275e-03,  2.88309101e-02, -3.19259726e-02,  6.76017851e-02,
        1.18875027e-01,  4.90449667e-02, -7.11180866e-02,  2.42015105e-02,
        6.09337091e-02,  9.15575251e-02,  7.54630268e-02, -5.30363468e-04,
        5.33818686e-03,  2.14987155e-02,  1.37690797e-01, -8.63378346e-02,
        5.83221130e-02, -3.59287485e-02,  7.56779611e-02,  2.51492225e-02,
        1.17275678e-02,  9.37244594e-02,  3.03551462e-02, -1.35064060e-02,
        6.28025606e-02, -1.67514980e-01, -1.24259945e-02, -1.95242167e-02,
        6.93811625e-02,  7.72726461e-02,  7.74716437e-02, -1.47965118e-01,
       -4.22228361e-04,  1.83783751e-02, -1.19136199e-01, -3.13477665e-02,
        6.60038590e-02,  2.46255528e-02,  2.11933651e-03, -9.49578434e-02,
       -2.49075815e-02,  1.01346388e-01, -5.71207069e-02, -4.76290993e-02,
        2.79998290e-03, -8.29489976e-02,  4.29078564e-02,  4.00602221e-02,
        1.03404291e-01,  7.92418346e-02, -3.14001106e-02,  2.04087533e-02,
       -9.57951397e-02, -7.60837719e-02, -1.74582575e-03, -4.40510325e-02,
        6.49931505e-02, -1.44915171e-02,  3.33687216e-02, -2.45348830e-02,
       -4.90438566e-03,  8.16997364e-02,  1.56976636e-02, -2.20130035e-03,
       -3.88220809e-02,  4.17613201e-02,  1.23736160e-02,  2.39638099e-03,
        7.04660639e-02, -8.40025023e-03,  8.84754434e-02,  4.73559313e-02,
        1.60846859e-02,  6.38007149e-02, -8.88152346e-02, -5.36189489e-02,
       -3.58884176e-03, -7.97238126e-02, -2.48845778e-02,  6.67371228e-02,
       -1.27798110e-01,  5.20749278e-02, -1.03058614e-01, -9.93425995e-02,
        6.30614609e-02,  6.55593872e-02,  2.47250423e-02,  1.01459853e-01,
        8.41867253e-02,  1.90107450e-02, -5.06304689e-02,  9.08671319e-03,
       -1.11649349e-01,  4.15530279e-02,  3.82142738e-02,  7.48702586e-02,
        1.00878365e-01,  7.18154162e-02,  2.41982359e-02,  4.45286110e-02,
        2.29161587e-02, -6.85874224e-02, -6.66044280e-02,  5.26503660e-02,
        1.44319907e-02,  7.72640528e-03,  4.93934080e-02, -4.20203842e-02,
       -1.19266249e-02, -3.40296179e-02, -5.05692326e-02, -1.01971209e-01,
        5.03124930e-02,  1.07444279e-01,  2.78240931e-03, -6.46820664e-02,
        2.53117532e-02,  1.04838371e-01, -5.48670478e-02, -8.49981084e-02,
       -1.75488254e-04, -7.08199888e-02, -8.43240973e-03,  9.51339304e-02,
       -1.88117087e-01,  1.78130921e-02,  2.86972504e-02, -5.94706945e-02,
        4.38547023e-02,  4.58841883e-02, -3.49672660e-02, -6.55051991e-02,
       -7.90929198e-02,  3.29272039e-02,  2.99417619e-02,  1.12901134e-02,
       -6.14368394e-02, -2.01964248e-02,  3.12223360e-02,  8.69451910e-02,
       -1.85837403e-01, -1.25434086e-01,  1.11888051e-02, -1.12750731e-01,
        4.47746105e-02, -6.38351589e-02,  2.88816690e-02, -2.45125685e-02,
        3.97114865e-02,  8.87534320e-02,  1.15282401e-01, -6.65650517e-02,
       -9.49165039e-03,  4.97242734e-02,  1.17295712e-01, -1.91902611e-02,
       -3.20644900e-02,  1.36362026e-02, -3.73102799e-02,  8.89487471e-03,
       -2.56872289e-02, -7.46497372e-03, -1.25288516e-02, -1.08435608e-01,
       -3.12000625e-02, -1.22699983e-01,  4.24938798e-02, -1.87821351e-02],
      dtype=float32)