Commit fc402f95 authored by sanlu

for excel

parent 7e18ef67
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
<option name="projectConfiguration" value="Nosetests" />
<option name="PROJECT_TEST_RUNNER" value="Nosetests" />
</component>
</module>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6.3 (~/anaconda3/bin/python)" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/illness_entity_recognize.iml" filepath="$PROJECT_DIR$/.idea/illness_entity_recognize.iml" />
</modules>
</component>
</project>
\ No newline at end of file
# MedicalNamedEntityRecognition
Medical named entity recognition implemented with a bidirectional LSTM + CRF model over character embeddings. This project targets the CCKS2017 Chinese electronic medical record NER task and implements a four-layer character-embedding-based bidirectional LSTM network with a CRF layer. It provides the original training data samples (general items, discharge status, medical history, history features, diagnosis and treatment course) together with a converted version, training scripts, and a pre-trained model, and can be used for sequence labeling research, experimentation, and benchmarking.
# Project Introduction
Structuring electronic medical records is the foundation for computers to understand and apply them. With structured records, the relations and probabilities among symptoms, diseases, drugs, tests, and other knowledge points can be computed, supporting a medical knowledge graph that further streamlines physicians' work.
The CCKS2018 electronic medical record NER evaluation task asks participants to identify and extract clinically relevant entities from a set of plain-text records and classify them into predefined categories. For this task the organizers provided 600 annotated records covering five entity types: anatomical site, independent symptom, symptom description, operation, and drug.
Domain named entity recognition is a classic sequence labeling problem in natural language processing; this project is an attempt to solve it with deep learning.
# Experimental Data
1. Target label set
O: non-entity; TREATMENT: treatment; BODY: body part; SIGNS: signs and symptoms; CHECK: medical examination; DISEASE: disease entity
2. Labeling scheme
BIO tagging is used (B = entity begin, I = inside, O = outside), giving the following class dictionary (a worked example follows the dictionary):
self.class_dict = {
    'O': 0,
    'TREATMENT-I': 1,
    'TREATMENT-B': 2,
    'BODY-B': 3,
    'BODY-I': 4,
    'SIGNS-I': 5,
    'SIGNS-B': 6,
    'CHECK-B': 7,
    'CHECK-I': 8,
    'DISEASE-I': 9,
    'DISEASE-B': 10
}
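For example, under this scheme the annotated fragment from the sample below (右髋部 annotated as a body part, 疼痛 as a sign) is labeled character by character:

右 BODY-B
髋 BODY-I
部 BODY-I
疼 SIGNS-B
痛 SIGNS-I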
3. Data conversion
The organizers provided four directories (general items, discharge status, history features, diagnosis and treatment course), each containing txtoriginal files and txt annotation files formatted as follows:
一般项目-1.txtoriginal.txt
女性,88岁,农民,双滦区应营子村人,主因右髋部摔伤后疼痛肿胀,活动受限5小时于2016-10-29;11:12入院。
一般项目-1.txt:
右髋部 21 23 身体部位
疼痛 27 28 症状和体征
肿胀 29 30 症状和体征
Conversion script function:
def transfer(self):
    f = open(self.train_filepath, 'w+')
    for root, dirs, files in os.walk(self.origin_path):
        for file in files:
            filepath = os.path.join(root, file)
            if 'original' not in filepath:
                continue
            label_filepath = filepath.replace('.txtoriginal', '')
            print(filepath, '\t\t', label_filepath)
            content = open(filepath).read().strip()
            res_dict = {}
            # map each character position covered by an annotation to its B/I tag
            for line in open(label_filepath):
                res = line.strip().split(' ')
                start = int(res[1])
                end = int(res[2])
                label = res[3]
                label_id = self.label_dict.get(label)
                for i in range(start, end + 1):
                    if i == start:
                        label_cate = label_id + '-B'
                    else:
                        label_cate = label_id + '-I'
                    res_dict[i] = label_cate
            # emit one "char<TAB>label" line per character, O for non-entities
            for indx, char in enumerate(content):
                char_label = res_dict.get(indx, 'O')
                print(char, char_label)
                f.write(char + '\t' + char_label + '\n')
    f.close()
    return
Sample of the converted output:
, O
男 O
, O
双 O
塔 O
山 O
人 O
, O
主 O
因 O
咳 SIGNS-B
嗽 SIGNS-I
、 O
少 SIGNS-B
痰 SIGNS-I
1 O
个 O
月 O
, O
加 O
重 O
3 O
天 O
, O
抽 SIGNS-B
搐 SIGNS-I
# Model Architecture
The model feeds pre-trained character embeddings into an embedding layer, encodes the sequence with two bidirectional LSTM layers, adds a dense layer, and finally sends the result to a CRF layer for sequence labeling.
'''build the model: pre-trained embeddings, two BiLSTM layers, dense, CRF'''
def tokenvec_bilstm2_crf_model(self):
    model = Sequential()
    embedding_layer = Embedding(self.VOCAB_SIZE + 1,
                                self.EMBEDDING_DIM,
                                weights=[self.embedding_matrix],
                                input_length=self.TIME_STAMPS,
                                trainable=False,
                                mask_zero=True)
    model.add(embedding_layer)
    model.add(Bidirectional(LSTM(128, return_sequences=True)))
    model.add(Dropout(0.5))
    model.add(Bidirectional(LSTM(64, return_sequences=True)))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(self.NUM_CLASSES)))
    crf_layer = CRF(self.NUM_CLASSES, sparse_target=True)
    model.add(crf_layer)
    model.compile('adam', loss=crf_layer.loss_function, metrics=[crf_layer.accuracy])
    model.summary()
    return model
# Results
1. Training:

| Model | Train size | Test size | Train accuracy | Test accuracy | Notes |
| :--- | :---: | :---: | :--- | :--- | :--- |
| Medical NER | 6268 | 1571 | 0.9649 | 0.8451 | 5 epochs |

2. Testing:
Run python lstm_predict.py to test the trained entity recognition model. Sample results:
enter an sent:他最近头痛,流鼻涕,估计是发烧了
[('他', 'O'), ('最', 'O'), ('近', 'O'), ('头', 'SIGNS-B'), ('痛', 'SIGNS-I'), (',', 'O'), ('流', 'O'), ('鼻', 'O'), ('涕', 'O'), (',', 'O'), ('估', 'O'), ('计', 'O'), ('是', 'O'), ('发', 'SIGNS-B'), ('烧', 'SIGNS-I'), ('了', 'SIGNS-I')]
enter an sent:口腔溃疡可能需要多吃维生素
[('口', 'BODY-B'), ('腔', 'BODY-I'), ('溃', 'O'), ('疡', 'O'), ('可', 'O'), ('能', 'O'), ('需', 'O'), ('要', 'O'), ('多', 'O'), ('吃', 'O'), ('维', 'CHECK-B'), ('生', 'CHECK-B'), ('素', 'TREATMENT-I')]
enter an sent:他骨折了,可能需要拍片
[('他', 'O'), ('骨', 'SIGNS-B'), ('折', 'SIGNS-I'), ('了', 'O'), (',', 'O'), ('可', 'O'), ('能', 'O'), ('需', 'O'), ('要', 'O'), ('拍', 'O'), ('片', 'CHECK-I')]
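lstm_predict.py is not reproduced in full in this commit, so the following is only a minimal sketch of the decoding step it performs, assuming the model, word_dict, class_dict, and TIME_STAMPS produced by the training script below (the method name predict matches the call visible in the diff further down):

def predict(self, sentence):
    # map characters to vocabulary ids, falling back to the UNK entry
    ids = [self.word_dict.get(char, self.word_dict['UNK']) for char in sentence]
    # left-pad to the fixed sequence length used at training time
    x = pad_sequences([ids], self.TIME_STAMPS)
    # keep the tag scores for the real (non-padded) positions only
    raw = self.model.predict(x)[0][-len(sentence):]
    id2label = {idx: label for label, idx in self.class_dict.items()}
    tags = [id2label[int(np.argmax(row))] for row in raw]
    return list(zip(sentence, tags))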
# Summary
1. For the Chinese electronic medical record NER task, this project implements a named entity recognition model based on BiLSTM + CRF.
2. Using character embeddings as the raw features, the model reaches 0.9649 accuracy on the training set and 0.8451 on the test set.
3. More features could be added to the training; other approaches will be explored step by step in later work.
# Contact
For questions or collaboration on natural language processing, knowledge graphs, event graphs, social computing, or language resource construction, contact me:
Email: lhy_in_blcu@126.com
CSDN: https://blog.csdn.net/lhy2014
My NLP projects: https://liuhuanyong.github.io/
Liu Huanyong, Institute of Software, Chinese Academy of Sciences
File added
@@ -122,7 +122,7 @@ class LSTMNER:
         del self.word_dict
 
     def param_extract(self, sentence):
-        senetnce = sentence.strip()
+        senetnce = sentence.strip().upper()
         param_extract = self.predict(sentence)
         param_dict = {}
         for i in range(len(param_extract)):
@@ -143,7 +143,8 @@ class LSTMNER:
                 elif param_extract[j][1].split('-')[0] != 'I':
                     param_end = j
                     break
-            param_dict[param_key] = sentence[param_start:param_end].strip()
+            if not param_key in param_dict.keys():
+                param_dict[param_key] = sentence[param_start:param_end].strip()
         return param_dict
 
 if __name__ == '__main__':
#!/usr/bin/env python3
# coding: utf-8
# File: lstm_train.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-12-24
import numpy as np
from keras import backend as K
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dense, TimeDistributed, Dropout
from keras_contrib.layers.crf import CRF
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import os
import pickle
from dict_creator import dict_create
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
class LSTMNER:
    def __init__(self, categorycode):
        cur = '/'.join(os.path.abspath(__file__).split('/')[:-1])
        self.train_path = os.path.join(cur, f'data/{categorycode}_biaozhu.txt')
        self.vocab_path = os.path.join(cur, f'model/{categorycode}_vocab.txt')
        self.embedding_file = os.path.join(cur, f'model/{categorycode}_vec_300.bin')
        self.model_path = os.path.join(cur, f'model/{categorycode}_bilstm2_crf_model.h5')
        self.datas, self.word_dict = self.build_data()
        if os.path.exists(f'data/param_dict/{categorycode}_param_dict.txt'):
            self.class_dict = pickle.load(open(f'data/param_dict/{categorycode}_param_dict.txt', 'rb'))
        else:
            self.class_dict = dict_create(categorycode)
        self.EMBEDDING_DIM = 300
        self.EPOCHS = 5
        self.BATCH_SIZE = 128
        self.NUM_CLASSES = len(self.class_dict)
        self.VOCAB_SIZE = len(self.word_dict)
        self.TIME_STAMPS = 150
        self.embedding_matrix = self.build_embedding_matrix()
    '''build the dataset: one (char sequence, label sequence) pair per sentence'''
    def build_data(self):
        datas = []
        sample_x = []
        sample_y = []
        vocabs = {'UNK'}
        for line in open(self.train_path, 'r', encoding='utf-8'):
            line = line.rstrip().split(' ')
            if not line:
                continue
            char = line[0]
            if not char:
                continue
            # normalize BMES-style tags to BIO
            cate = line[-1].replace('M', 'I').replace('E', 'I')
            sample_x.append(char)
            sample_y.append(cate)
            vocabs.add(char)
            # sentence-ending punctuation closes the current sample
            if char in ['。', '?', '!', '!', '?', ';']:
                datas.append([sample_x, sample_y])
                sample_x = []
                sample_y = []
        word_dict = {wd: index for index, wd in enumerate(list(vocabs))}
        self.write_file(list(vocabs), self.vocab_path)
        return datas, word_dict
    '''convert the data into the format keras expects'''
    def modify_data(self):
        x_train = [[self.word_dict[char] for char in data[0]] for data in self.datas]
        y_train = [[self.class_dict[label] for label in data[1]] for data in self.datas]
        x_train = pad_sequences(x_train, self.TIME_STAMPS)
        y = pad_sequences(y_train, self.TIME_STAMPS)
        # CRF with sparse_target=True expects labels of shape (batch, timesteps, 1)
        y_train = np.expand_dims(y, 2)
        return x_train, y_train
    '''save the vocabulary file'''
    def write_file(self, wordlist, filepath):
        with open(filepath, 'w+', encoding='utf-8') as f:
            f.write('\n'.join(wordlist))
    '''load the pre-trained character vectors'''
    def load_pretrained_embedding(self):
        embeddings_dict = {}
        with open(self.embedding_file, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.strip().split(' ')
                if len(values) < 300:
                    continue
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_dict[word] = coefs
        print('Found %s word vectors.' % len(embeddings_dict))
        return embeddings_dict
    '''build the embedding matrix'''
    def build_embedding_matrix(self):
        embedding_dict = self.load_pretrained_embedding()
        embedding_matrix = np.zeros((self.VOCAB_SIZE + 1, self.EMBEDDING_DIM))
        for word, i in self.word_dict.items():
            embedding_vector = embedding_dict.get(word)
            # characters without a pre-trained vector keep the all-zero row
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
        return embedding_matrix
    '''build the model: pre-trained embeddings, two BiLSTM layers, dense, CRF'''
    def tokenvec_bilstm2_crf_model(self):
        model = Sequential()
        embedding_layer = Embedding(self.VOCAB_SIZE + 1,
                                    self.EMBEDDING_DIM,
                                    weights=[self.embedding_matrix],
                                    input_length=self.TIME_STAMPS,
                                    trainable=False,
                                    mask_zero=True)
        model.add(embedding_layer)
        model.add(Bidirectional(LSTM(128, return_sequences=True)))
        model.add(Dropout(0.5))
        model.add(Bidirectional(LSTM(64, return_sequences=True)))
        model.add(Dropout(0.5))
        model.add(TimeDistributed(Dense(self.NUM_CLASSES)))
        crf_layer = CRF(self.NUM_CLASSES, sparse_target=True)
        model.add(crf_layer)
        model.compile('adam', loss=crf_layer.loss_function, metrics=[crf_layer.accuracy])
        model.summary()
        return model
    '''train the model'''
    def train_model(self):
        x_train, y_train = self.modify_data()
        model = self.tokenvec_bilstm2_crf_model()
        #model.load_weights(self.model_path)
        # StratifiedKFold is used here only to rotate a validation fold:
        # all labels are identical, so each split is one outer training round
        kfold = StratifiedKFold(n_splits=self.EPOCHS, shuffle=True)
        kfold_index = len(x_train) * ['']
        epoch_num = 0
        for train, test in kfold.split(kfold_index, kfold_index):
            epoch_num += 1
            print(f'cross-validation training round {epoch_num}')
            model.fit(x_train[train], y_train[train], validation_data=(x_train[test], y_train[test]), batch_size=self.BATCH_SIZE, epochs=5)
            model.save(self.model_path)
        print('model training finished!')
        #self.draw_train(history)
        #model.save(self.model_path)
        return model
    '''plot the training curves'''
    def draw_train(self, history):
        # Plot training accuracy values
        plt.plot(history.history['acc'])
        plt.title('Model accuracy')
        plt.ylabel('Accuracy')
        plt.xlabel('Epoch')
        plt.legend(['Train'], loc='upper left')
        plt.show()
        # Plot training loss values
        plt.plot(history.history['loss'])
        plt.title('Model loss')
        plt.ylabel('Loss')
        plt.xlabel('Epoch')
        plt.legend(['Train'], loc='upper left')
        plt.show()
# 7836/7836 [==============================] - 205s 26ms/step - loss: 17.1782 - acc: 0.9624
'''
6268/6268 [==============================] - 145s 23ms/step - loss: 18.5272 - acc: 0.7196 - val_loss: 15.7497 - val_acc: 0.8109
6268/6268 [==============================] - 142s 23ms/step - loss: 17.8446 - acc: 0.9099 - val_loss: 15.5915 - val_acc: 0.8378
6268/6268 [==============================] - 136s 22ms/step - loss: 17.7280 - acc: 0.9485 - val_loss: 15.5570 - val_acc: 0.8364
6268/6268 [==============================] - 133s 21ms/step - loss: 17.6918 - acc: 0.9593 - val_loss: 15.5187 - val_acc: 0.8451
6268/6268 [==============================] - 144s 23ms/step - loss: 17.6723 - acc: 0.9649 - val_loss: 15.4944 - val_acc: 0.8451
'''
if __name__ == '__main__':
    ner = LSTMNER('0101')
    ner.train_model()
\ No newline at end of file
import pandas as pd
from lstm_predict import LSTMNER
from function import *
import time
import os

# load the data
data_table = pd.read_excel('combine.xlsx', converters={'SKU': str, '指数子类编码': str, '指数品牌编码': str})
channel = 'LXWL'
now_time = time.strftime("%Y-%m-%d", time.localtime())
sql_ZIdatabase = sql_find('ZI_DataBase', False)
if os.path.isfile(f'{channel}_参数对应表_{now_time}.xlsx'):
    merge_data = pd.read_excel(f'{channel}_参数对应表_{now_time}.xlsx', converters={'ZI_SubCategoryCode': str})
else:
    sql_ZIdatabase.cursor.execute(f"select * from Product_Relation_Attribute_SubTitle where Source = '{channel}'")
    match_data = sql_ZIdatabase.cursor.fetchall()
    match_data = pd.DataFrame(match_data, columns=[col[0] for col in sql_ZIdatabase.cursor.description])
    sql_ZIdatabase.cursor.execute('select * from vw_relation_property')
    param_data_table = sql_ZIdatabase.cursor.fetchall()
    param_data_table = pd.DataFrame(param_data_table, columns=[col[0] for col in sql_ZIdatabase.cursor.description])
    merge_data = pd.merge(param_data_table, match_data[['ZI_SubTitle', 'Other_SubTitle', 'ZI_SubCategoryCode']], left_on=['SubCategoryCode', 'SubTitle'], right_on=['ZI_SubCategoryCode', 'ZI_SubTitle'])
    merge_data = merge_data.drop_duplicates().reset_index()
    merge_data = merge_data[(merge_data['ZI_SubTitle'].isnull() == False) | (merge_data['ISimportant'] == 1) | (merge_data['ispeijian'] == 1)]
    merge_data.to_excel(f'{channel}_参数对应表_{now_time}.xlsx')
category_list = set(data_table['指数子类编码'].tolist())
for categorycode in category_list:
    model = LSTMNER(categorycode)
    sql_ZIdatabase.cursor.execute(f"select SubTitle from VW_Relation_Property where SubCategoryCode='{categorycode}' and (Isimportant = '1' or Ispeijian = '1')")
    param_standard_lyst = [x[0] for x in sql_ZIdatabase.cursor.fetchall()]
    param_standard_dict = {}
    for param_standard in param_standard_lyst:
        param_standard_dict[param_standard] = []
    param_table = merge_data[(merge_data['ZI_SubCategoryCode'] == categorycode) & ((merge_data['ISimportant'] == 1) | (merge_data['ispeijian'] == 1))]
    param_dict = {}
    ZI_subtitle_list = []
    for other_subtitle, ZI_subtitle in zip(param_table['Other_SubTitle'], param_table['ZI_SubTitle']):
        param_dict[other_subtitle] = ZI_subtitle  # mapping from non-standard to standard parameter names
        ZI_subtitle_list.append(ZI_subtitle)
    data_categorycode = data_table[data_table['指数子类编码'] == categorycode]
    for name, params in zip(data_categorycode['产品名称'], data_categorycode['参数项']):
        product_param_dict = {}
        params = eval(params)  # the cell stores a Python dict literal
        # iterate over a copy of the keys: the dict is modified inside the loop
        for param_key in list(params.keys()):
            if '\t' in param_key or ' ' in param_key:
                params[param_key.replace('\t', '').replace(' ', '')] = params.pop(param_key)  # strip special characters from parameter names
        for param_key_2 in params.keys():
            if param_key_2 in param_dict.keys():
                product_param_dict[param_dict[param_key_2]] = params[param_key_2]
            elif param_key_2 in ZI_subtitle_list:
                product_param_dict[param_key_2] = params[param_key_2]
        # parameters extracted from the product name take precedence
        name_param_dict = model.param_extract(name)
        for name_param_key in name_param_dict.keys():
            product_param_dict[name_param_key] = name_param_dict[name_param_key]
        for key in param_standard_dict:
            param_standard_dict[key].append(product_param_dict.get(key, ''))
    model.clean()
    for key in param_standard_dict:
        data_categorycode[key] = param_standard_dict[key]
    #data_categorycode['整合参数项'] = product_param_dict_lyst
    data_categorycode.to_excel(f'out/{categorycode}_param_extract.xlsx')
from lstm_predict import LSTMNER
model = LSTMNER('0101')
model.param_extract('联想 ThinkPad E580 ThinkPad E580(02CD)15.6英寸轻薄窄边框笔记本电脑(i5-8250U 8G 256G PCIeSSD+1T 2G独显 FHD)黑色(计价单位:台)')
\ No newline at end of file
#!/usr/bin/env python3
# coding: utf-8
# File: transfer_data.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-12-24
import os
from collections import Counter
class TransferData:
    def __init__(self):
        cur = '/'.join(os.path.abspath(__file__).split('/')[:-1])
        self.label_dict = {
            '检查和检验': 'CHECK',
            '症状和体征': 'SIGNS',
            '疾病和诊断': 'DISEASE',
            '治疗': 'TREATMENT',
            '身体部位': 'BODY'}
        self.cate_dict = {
            'O': 0,
            'TREATMENT-I': 1,
            'TREATMENT-B': 2,
            'BODY-B': 3,
            'BODY-I': 4,
            'SIGNS-I': 5,
            'SIGNS-B': 6,
            'CHECK-B': 7,
            'CHECK-I': 8,
            'DISEASE-I': 9,
            'DISEASE-B': 10
        }
        self.origin_path = os.path.join(cur, 'data_origin')
        self.train_filepath = os.path.join(cur, 'train.txt')
        return
    def transfer(self):
        f = open(self.train_filepath, 'w+')
        for root, dirs, files in os.walk(self.origin_path):
            for file in files:
                filepath = os.path.join(root, file)
                if 'original' not in filepath:
                    continue
                label_filepath = filepath.replace('.txtoriginal', '')
                print(filepath, '\t\t', label_filepath)
                content = open(filepath).read().strip()
                res_dict = {}
                # map each character position covered by an annotation to its B/I tag
                for line in open(label_filepath):
                    res = line.strip().split(' ')
                    start = int(res[1])
                    end = int(res[2])
                    label = res[3]
                    label_id = self.label_dict.get(label)
                    for i in range(start, end + 1):
                        if i == start:
                            label_cate = label_id + '-B'
                        else:
                            label_cate = label_id + '-I'
                        res_dict[i] = label_cate
                # emit one "char<TAB>label" line per character, O for non-entities
                for indx, char in enumerate(content):
                    char_label = res_dict.get(indx, 'O')
                    print(char, char_label)
                    f.write(char + '\t' + char_label + '\n')
        f.close()
        return
if __name__ == '__main__':
    handler = TransferData()
    train_datas = handler.transfer()
\ No newline at end of file
# -*- coding: utf-8 -*-
import pymssql
import pandas as pd
from lxml import etree
import requests
import json
from function import *
import pickle
import decimal
import time
from w2v import *
def index_of_str(seq, sub_seq):
    '''return the start index of every occurrence of sub_seq in seq'''
    seq = ''.join(seq)
    index = []
    n1 = len(seq)
    n2 = len(sub_seq)
    for i in range(n1 - n2 + 1):
        if seq[i:i + n2] == sub_seq:
            index.append(i)
    return index
SubCategoryCode = '0101'
sql_ZIdatabase = sql_find('ZI_DataBase', False)
sql_ZIdatabase.cursor.execute(f"select SubTitle from VW_Relation_Property where ispeijian = '1' and SubCategoryCode = '{SubCategoryCode}'")
subtitle_lyst = [str(f"'{x[0]}'") for x in sql_ZIdatabase.cursor.fetchall()]
subtitle_lyst = ','.join(subtitle_lyst)
sql_ZIdatabase.cursor.execute(f"select ProductName,参数名称,参数值 from vw_productValue where SubCategoryCode = '{SubCategoryCode}' and 参数名称 in ({subtitle_lyst}) and ProductName not like '%wrong'")
data = sql_ZIdatabase.cursor.fetchall()
data = pd.DataFrame(data, columns=[col[0] for col in sql_ZIdatabase.cursor.description])
################################################################
f = open('Temporaryfolder/name_data.txt', 'w', encoding='utf-8')
for name in set(data['ProductName'].tolist()):
    f.write(f'{name}\n')
f.close()
###############################################################
data = data.drop(['ProductName'], axis=1)
data = data.drop_duplicates()
data.to_excel(f'Temporaryfolder/{SubCategoryCode}_train_data.xlsx')
################################################################
# split product names into space-separated characters for word2vec training
f = open('Temporaryfolder/name_data.txt', 'r', encoding='utf-8')
g = open('Temporaryfolder/name_data_w2v.txt', 'w', encoding='utf-8')
for line in f:
    line = line.replace(' ', '')
    line = ' '.join(list(line))
    g.write(line)
f.close()
g.close()
################################################################
f = open('Temporaryfolder/name_data.txt', 'r', encoding='utf-8')
g = open(f'model/{SubCategoryCode}_vec_300.txt', 'w', encoding='utf-8')
# train char-level word2vec vectors and dump one "char vec..." line per character
w2v_train('Temporaryfolder/name_data_w2v.txt', f'{SubCategoryCode}.bin')
model_w2v = load_wordVectors(f'{SubCategoryCode}.bin')
word_data = []
for line in f:
    line = line.replace(' ', '')
    line = list(line)
    for word in line:
        word_data.append(word)
print(len(word_data))
word_data = set(word_data)
word_data.discard('\u3000')  # discard() does not raise if the char is absent
word_data.discard('\n')
print(len(word_data))
for word in word_data:
    g.write(f"{word} {' '.join([str(x) for x in model_w2v[word].tolist()])}\n")
g.close()
f.close()
################################################################
f = open('Temporaryfolder/name_data.txt', 'r', encoding='utf-8')
g = open(f'data/{SubCategoryCode}_train.txt', 'w', encoding='utf-8')
model_w2v = load_wordVectors(f'{SubCategoryCode}.bin')
word_data = []  # character sequence
for line in f:
    #line = line.replace(' ','')
    line = list(line)
    line.append(';')  # sentence delimiter, consumed by LSTMNER.build_data
    for word in line:
        if word != '\n':
            word_data.append(word)
sign_data = ['O'] * len(word_data)  # label sequence, default O
table = pd.read_excel(f'Temporaryfolder/{SubCategoryCode}_train_data.xlsx')
o = 0
for param_name, param_value in zip(table['参数名称'], table['参数值']):
    print(o, end='\r')
    o += 1
    param_str_len = len(param_value)
    if param_str_len < 2:
        continue
    param_value = param_value.upper()
    # mark every occurrence of the parameter value with B/I tags
    sign_list = index_of_str(word_data, param_value)
    for n in sign_list:
        sign_data[n] = f'{param_name}-B'
        for j in range(param_str_len - 1):
            sign_data[n + j + 1] = f'{param_name}-I'
for word, sign in zip(word_data, sign_data):
    g.write(f'{word}\t{sign}\n')
f.close()
g.close()
# Put the generated "<SubCategoryCode>_vec_300.txt" into model/ as the word vectors, and "<SubCategoryCode>_train.txt" into data/ as the training set.
\ No newline at end of file