<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
<option name="projectConfiguration" value="Nosetests" />
<option name="PROJECT_TEST_RUNNER" value="Nosetests" />
</component>
</module>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6.3 (~/anaconda3/bin/python)" project-jdk-type="Python SDK" />
</project>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/illness_entity_recognize.iml" filepath="$PROJECT_DIR$/.idea/illness_entity_recognize.iml" />
</modules>
</component>
</project>
# MedicalNamedEntityRecognition
Medical named entity recognition implemented with a character-embedding-based bidirectional LSTM + CRF model. A CCKS 2017 Chinese electronic medical record (EMR) named entity recognition project: it implements a four-layer bidirectional LSTM network with a CRF layer on top of character embeddings. The project provides raw training data samples (general items, discharge status, medical history, history highlights, diagnosis and treatment course) together with converted versions, training scripts, and pre-trained models, and can be used for sequence labeling research and benchmarking.
# Project Introduction
Structuring electronic medical records (EMRs) is the foundation for computers to understand and make use of them. With structured records, the relationships (and their probabilities) among symptoms, diseases, drugs, examinations and tests can be computed to build a medical knowledge graph and further streamline physicians' work.
The CCKS 2018 EMR named entity recognition evaluation task asks participants, given a set of plain-text EMR documents, to identify and extract the clinically relevant entities and classify them into predefined categories. For this task the organizers provided 600 annotated EMR documents, covering five entity types: anatomical site, independent symptom, symptom description, operation, and drug.
Domain named entity recognition is a classic sequence labeling problem in natural language processing; this project is an attempt to tackle it with deep learning.
# Experimental Data
1. Target tag set
O (non-entity), TREATMENT (treatment method), BODY (body part), SIGNS (symptoms and signs), CHECK (medical examination), DISEASE (disease).
2. Tagging scheme
BIO three-way tagging is used; the tag-to-id mapping is shown below, followed by a small worked example.
self.class_dict ={
'O':0,
'TREATMENT-I': 1,
'TREATMENT-B': 2,
'BODY-B': 3,
'BODY-I': 4,
'SIGNS-I': 5,
'SIGNS-B': 6,
'CHECK-B': 7,
'CHECK-I': 8,
'DISEASE-I': 9,
'DISEASE-B': 10
}
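For illustration, a tiny worked example (the phrase is invented) of mapping tagged characters to these ids via class_dict:
chars = ['右', '髋', '部', '疼', '痛']
tags = ['BODY-B', 'BODY-I', 'BODY-I', 'SIGNS-B', 'SIGNS-I']
ids = [class_dict[t] for t in tags]  # -> [3, 4, 4, 6, 5]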
3. Data conversion
The evaluation organizers provided four directories (general items, discharge status, history highlights, diagnosis and treatment course); each contains txtoriginal files (raw text) and txt annotation files, with contents like:
一般项目-1.txtoriginal.txt
女性,88岁,农民,双滦区应营子村人,主因右髋部摔伤后疼痛肿胀,活动受限5小时于2016-10-29;11:12入院。
一般项目-1.txt:
右髋部 21 23 身体部位
疼痛 27 28 症状和体征
肿胀 29 30 症状和体征
Conversion script:
def transfer(self):
f = open(self.train_filepath, 'w+')
count = 0
for root,dirs,files in os.walk(self.origin_path):
for file in files:
filepath = os.path.join(root, file)
if 'original' not in filepath:
continue
label_filepath = filepath.replace('.txtoriginal','')
print(filepath, '\t\t', label_filepath)
content = open(filepath).read().strip()
res_dict = {}
for line in open(label_filepath):
res = line.strip().split(' ')
start = int(res[1])
end = int(res[2])
label = res[3]
label_id = self.label_dict.get(label)
for i in range(start, end+1):
if i == start:
label_cate = label_id + '-B'
else:
label_cate = label_id + '-I'
res_dict[i] = label_cate
for indx, char in enumerate(content):
char_label = res_dict.get(indx, 'O')
print(char, char_label)
f.write(char + '\t' + char_label + '\n')
f.close()
return
Model output format (one character per line with its tag):
, O
男 O
, O
双 O
塔 O
山 O
人 O
, O
主 O
因 O
咳 SIGNS-B
嗽 SIGNS-I
、 O
少 SIGNS-B
痰 SIGNS-I
1 O
个 O
月 O
, O
加 O
重 O
3 O
天 O
, O
抽 SIGNS-B
搐 SIGNS-I
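The character-level tags above can be grouped back into entity spans. A minimal post-processing sketch (not part of the original scripts):
def collect_entities(pairs):
    '''Group (char, tag) pairs like the sample above into (entity, type) spans.'''
    entities, buf, cur = [], '', None
    for char, tag in pairs:
        if tag.endswith('-B'):  # a new entity begins
            if buf:
                entities.append((buf, cur))
            buf, cur = char, tag[:-2]
        elif tag.endswith('-I') and buf:  # continue the current entity
            buf += char
        else:  # 'O' (or a stray I-tag) closes any open entity
            if buf:
                entities.append((buf, cur))
            buf, cur = '', None
    if buf:
        entities.append((buf, cur))
    return entities
# collect_entities([('咳', 'SIGNS-B'), ('嗽', 'SIGNS-I'), ('、', 'O')]) -> [('咳嗽', 'SIGNS')]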
# Model Architecture
The model feeds pre-trained character vectors into an embedding layer, encodes them with two bidirectional LSTM layers, then applies a dense layer and finally a CRF layer for sequence labeling.
'''Build the BiLSTM-CRF model with pre-trained embeddings'''
def tokenvec_bilstm2_crf_model(self):
model = Sequential()
embedding_layer = Embedding(self.VOCAB_SIZE + 1,
self.EMBEDDING_DIM,
weights=[self.embedding_matrix],
input_length=self.TIME_STAMPS,
trainable=False,
mask_zero=True)
model.add(embedding_layer)
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.5))
model.add(TimeDistributed(Dense(self.NUM_CLASSES)))
crf_layer = CRF(self.NUM_CLASSES, sparse_target=True)
model.add(crf_layer)
model.compile('adam', loss=crf_layer.loss_function, metrics=[crf_layer.accuracy])
model.summary()
return model
# Results
1. Training:
| Model | Train samples | Test samples | Train accuracy | Test accuracy | Notes |
| :--- | :---: | :---: | :--- |:--- |:--- |
| Medical NER | 6268 | 1571 | 0.9649 | 0.8451 | 5 epochs |
2. Testing:
Run python lstm_predict.py to test the trained entity recognition model; sample interactions:
enter an sent:他最近头痛,流鼻涕,估计是发烧了
[('他', 'O'), ('最', 'O'), ('近', 'O'), ('头', 'SIGNS-B'), ('痛', 'SIGNS-I'), (',', 'O'), ('流', 'O'), ('鼻', 'O'), ('涕', 'O'), (',', 'O'), ('估', 'O'), ('计', 'O'), ('是', 'O'), ('发', 'SIGNS-B'), ('烧', 'SIGNS-I'), ('了', 'SIGNS-I')]
enter an sent:口腔溃疡可能需要多吃维生素
[('口', 'BODY-B'), ('腔', 'BODY-I'), ('溃', 'O'), ('疡', 'O'), ('可', 'O'), ('能', 'O'), ('需', 'O'), ('要', 'O'), ('多', 'O'), ('吃', 'O'), ('维', 'CHECK-B'), ('生', 'CHECK-B'), ('素', 'TREATMENT-I')]
enter an sent:他骨折了,可能需要拍片
[('他', 'O'), ('骨', 'SIGNS-B'), ('折', 'SIGNS-I'), ('了', 'O'), (',', 'O'), ('可', 'O'), ('能', 'O'), ('需', 'O'), ('要', 'O'), ('拍', 'O'), ('片', 'CHECK-I')]
# Summary
1. For the Chinese EMR named entity recognition task, this project implements a BiLSTM + CRF model.
2. Using character embeddings as the raw features, the model reaches 0.9649 accuracy on the training set and 0.8451 on the test set.
3. More features could be added to the training; other approaches will be explored in later iterations.
# contact
For questions or collaboration on natural language processing, knowledge graphs, event evolution graphs, social computing, or language resource construction, contact me:
Email: lhy_in_blcu@126.com
csdn: https://blog.csdn.net/lhy2014
My NLP projects: https://liuhuanyong.github.io/
Liu Huanyong, Institute of Software, Chinese Academy of Sciences
import pickle
import pymssql
class sql_find():
def __init__ (self, database='ZI_DataBase', localhost=True):
if localhost:
self.conn = pymssql.connect(host='localhost', user='zgc',password='1234',database=database,autocommit=True)
else:
self.conn = pymssql.connect(host='123.56.115.207', user='zgcprice3311',password='admin@2018@)!*',database=database,autocommit=True)
self.cursor = self.conn.cursor()
def dict_create(categorycode):
categorycode = str(categorycode).zfill(4)
mssql_find = sql_find(localhost=False)
class_dict = {'O':0}
m = 0
mssql_find.cursor.execute(f"select SubTitle from VW_Relation_Property where SubCategoryCode='{categorycode}' and (Isimportant = '1' or Ispeijian = '1')")
subtitle_list = mssql_find.cursor.fetchall()
for param in subtitle_list:
m += 1
class_dict[f'B-{param[0]}'] = m
m += 1
class_dict[f'I-{param[0]}'] = m
pickle.dump(class_dict,open(f'data/param_dict/{categorycode}_param_dict.txt','wb'))
return class_dict
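# Minimal usage sketch (assumes the SQL Server above is reachable; the category code is illustrative):
# class_dict = dict_create('0101')
# -> {'O': 0, 'B-<param name>': 1, 'I-<param name>': 2, ...},
#    also pickled to data/param_dict/0101_param_dict.txt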
# -*- coding: utf-8 -*-
import pymssql
import pandas as pd
import re
from lxml import etree
'''
class product():
def __init__(self, product_name, product_SKU, product_class_num = 'na'):
self.product_name = str(product_name)
self.product_SKU = str(product_SKU)
self.product_class = str(product_class_num)
def get_parameter(self, **kwargs):
'''
class sql_find():
def __init__ (self, database='ZI_DataBase', localhost=True):
if localhost:
self.conn = pymssql.connect(host='localhost', user='zgc',password='1234',database=database,autocommit=True)
else:
self.conn = pymssql.connect(host='123.56.115.207', user='zgcprice3311',password='admin@2018@)!*',database=database,autocommit=True)
self.cursor = self.conn.cursor()
class mysql_find():
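    # NOTE: pymssql speaks the SQL Server (TDS) protocol; a true MySQL host would need a driver such as pymysql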
def __init__ (self, database='ZI_DataBase', localhost=True):
if localhost:
self.conn = pymssql.connect(host='localhost', user='zgc',password='1234',database=database,autocommit=True)
else:
self.conn = pymssql.connect(host='59.110.219.171', user='root',password='qwertyuiop1',database=database,autocommit=True)
self.cursor = self.conn.cursor()
'''
def execute(self, sql_sentence):
self.cursor.execute(sql_sentence)
return self.cursor
'''
def BN(brand):
brand = str(brand)
try:
country = brand.split('[')[1].split(']')[-2]
brand = brand.replace(country,'')
except IndexError:
pass
res = re.findall(r'[0-9\u4E00-\u9FA5]', brand)
new_res = ''.join(res)
if new_res.isdigit():
new_res = ''
#print(len(new_res))
if len(new_res) == 0:
res1 = re.findall(r'[a-zA-Z0-9]', brand)
new_res = ''.join(res1)
new_res = new_res.upper()
return new_res
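# Intended behavior, traced from the logic above (examples are illustrative):
# BN('联想[中国]') -> '联想'      (bracketed country removed, CJK part kept)
# BN('ThinkPad')   -> 'THINKPAD' (no CJK/digit match -> ASCII fallback, uppercased)
# BN('3M')         -> '3M'       (pure digits alone are rejected, so ASCII fallback applies)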
class Index(object):
def __init__(self, number=50, decimal=2):
"""
:param decimal: 你保留的保留小数位
:param number: # 号的 个数
"""
self.decimal = decimal
self.number = number
self.a = 100/number # 在百分比 为几时增加一个 # 号
def __call__(self, now, total):
        # 1. compute the current percentage
        percentage = self.percentage_number(now, total)
        # 2. number of '#' marks for that percentage
        well_num = int(percentage / self.a)
        # print("well_num: ", well_num, percentage)
        # 3. build the textual progress bar
        progress_bar_num = self.progress_bar(well_num)
        # 4. '\r' prefix so the bar redraws in place on the same line
        result = "\r%s %s" % (progress_bar_num, percentage)
return result
def percentage_number(self, now, total):
"""
计算百分比
:param now: 现在的数
:param total: 总数
:return: 百分
"""
return round(now / total * 100, self.decimal)
def progress_bar(self, num):
"""
显示进度条位置
:param num: 拼接的 “#” 号的
:return: 返回的结果当前的进度条
"""
# 1. "#" 号个数
well_num = "#" * num
# 2. 空格的个数
space_num = " " * (self.number - num)
return '[%s%s]' % (well_num, space_num)
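# Minimal usage sketch (loop bounds are made up):
# bar = Index(number=50)
# for now in range(1, 101):
#     print(bar(now, 100), end='')  # __call__ returns a '\r'-prefixed bar plus the percentage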
class tool():
def __init__(self):
self.peijian_table = pd.read_excel('是否需要配件.xlsx')
self.brand_table = pd.read_excel('品牌对应表.xlsx')
    def judge_brand(self, brand, brandcode_original):
        if brandcode_original == '没有对应指数品牌':
            BRANDID = '没有对应指数品牌'
            for ID, Chinese_brand, English_brand in zip(self.brand_table['ID'], self.brand_table['中文品牌'], self.brand_table['英文品牌']):
                if brand == Chinese_brand:
                    BRANDID = str(ID).zfill(5)
                    return BRANDID
                elif BN(brand) == English_brand:
                    BRANDID = str(ID).zfill(5)
                    return BRANDID
            return BRANDID  # no match found: fall back to the placeholder (previously fell through and returned None)
        else:
            BRANDID = str(brandcode_original).zfill(5)[-5:]
            return BRANDID
def judge_peijian(self, data_table):
ispeijian_lyst = []
isunique_lyst = []
for class_code in data_table['指数子类编码']:
mark = '0'
mark2 = '0'
class_code = str(class_code).zfill(4)
if class_code != '没有匹配的指数子类编码':
for categorycode, ispeijian, isunique in zip(self.peijian_table['categorycode'], self.peijian_table['ispeijian'], self.peijian_table['isunique']):
if class_code == str(categorycode).zfill(4):
if str(ispeijian) != '0':
mark = '1'
if str(isunique) != '0':
mark2 = '1'
break
ispeijian_lyst.append(mark)
isunique_lyst.append(mark2)
else:
ispeijian_lyst.append(mark)
isunique_lyst.append(mark2)
#print(len(ispeijian_lyst), len(data_table['指数子类编码']))
data_table['有无配件'] = ispeijian_lyst
data_table['型号_only'] = isunique_lyst
return data_table
def judge_unit(string):
unit_list = {'MM','CM', 'DM', 'ML', 'W', 'KW'}
if not string[0].isdigit():
return True
m = 0
for char in string:
if char.isdigit() or char == '.':
m += 1
continue
elif char.isalpha():
if string[m:].upper() in unit_list:
return False
else:
return True
return True
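# Intended behavior, traced from the logic above (examples are illustrative):
# judge_unit('150MM') -> False  (number followed by a known unit, so not a model number)
# judge_unit('X230')  -> True   (does not start with a digit)
# judge_unit('123')   -> True   (digits only, no unit suffix)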
def type_extract_JD(name, params, brand):
#params = eval(params)
try:
brand_remove = re.findall(r"[A-Za-z0-9]+", brand)[0].upper()
except IndexError:
brand_remove = '没有英文品牌!'
param_xinghao = 'NA'
if '产品型号' in params:
param_xinghao = params['产品型号']
if '型号' in params:
param_xinghao = params['型号']
elif r'\t型号\t' in params:
param_xinghao = params[r'\t型号\t']
name_xinghao_lyst = list(filter(lambda x: len(x) >= 2, re.findall(r"[A-Za-z0-9-+/.*]+", name)))
for i in range(len(name_xinghao_lyst)):
name_xinghao_lyst[i] = name_xinghao_lyst[i].upper()
try:
name_xinghao_lyst.remove(brand_remove)
except ValueError:
pass
if len(name_xinghao_lyst) == 0:
#type_lyst.append(param_xinghao.upper())
return param_xinghao.upper()
else:
if param_xinghao in name_xinghao_lyst:
#type_lyst.append(param_xinghao.upper())
return param_xinghao.upper()
else:
xinghao_data = max(name_xinghao_lyst, key=len)
for xinghao in name_xinghao_lyst:
if len(xinghao) > 2 and '*' not in xinghao and judge_unit(xinghao):
xinghao_data = xinghao
break
            if not judge_unit(xinghao_data):
                xinghao_data = 'NA'  # was `==`, a comparison whose result was discarded
            #type_lyst.append(xinghao_data.upper())
            return xinghao_data.upper()
def type_extract(name, params):
#params = eval(params)
param_xinghao = 'NA'
if '型号' in params:
param_xinghao = params['型号']
elif r'\t型号\t' in params:
param_xinghao = params[r'\t型号\t']
name_xinghao_lyst = list(filter(lambda x: len(x) >= 2, re.findall(r"[A-Za-z0-9-+/.*]+", name)))
if len(name_xinghao_lyst) == 0:
#type_lyst.append(param_xinghao.upper())
return param_xinghao.upper()
else:
if param_xinghao in name_xinghao_lyst:
#type_lyst.append(param_xinghao.upper())
return param_xinghao.upper()
else:
xinghao_data = max(name_xinghao_lyst, key=len)
for xinghao in name_xinghao_lyst:
if len(xinghao) > 2 and '*' not in xinghao and judge_unit(xinghao):
xinghao_data = xinghao
break
            if not judge_unit(xinghao_data):
                xinghao_data = 'NA'  # was `==`, a comparison whose result was discarded
#type_lyst.append(xinghao_data.upper())
return xinghao_data.upper()
def param_load(product_id, xml_string):
"""
传入sku,和xml原始代码
:param product_id:sku
:param xml_string:xml数据
:return:csv
"""
xml_str = etree.HTML(xml_string)
#title = xml_str.xpath("//th[@class='tdTitle']")
secend = xml_str.xpath("//td[@class='tdTitle']")
zhi = xml_str.xpath("//tr//td[position()>1]")
data_dict = {}
for j, k in zip(secend, zhi):
#item = i.xpath("./text()")[0]
sec = j.xpath("./text()")[0]
value = k.xpath("./text()")[0]
data_dict[sec] = value
return data_dict
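# Minimal usage sketch (hypothetical JD-style parameter table markup):
# xml = "<table><tr><td class='tdTitle'>型号</td><td>X230</td></tr></table>"
# param_load('0000000000000', xml)  # -> {'型号': 'X230'}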
if __name__ == '__main__':
sqlserver = sql_find('ZI_BAK', True)
sqlserver.cursor.execute("select * from ZI_Price_Quote where productcode = '0506003750007'")
print(sqlserver.cursor.fetchall())
#!/usr/bin/env python3
# coding: utf-8
# File: lstm_predict.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-5-23
import numpy as np
from keras import backend as K
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential,load_model
from keras.layers import Embedding, Bidirectional, LSTM, Dense, TimeDistributed, Dropout
from keras_contrib.layers.crf import CRF
import matplotlib.pyplot as plt
import os
import pickle
from dict_creator import dict_create
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # valid levels are 0-3
class LSTMNER:
def __init__(self, categorycode):
cur = '/'.join(os.path.abspath(__file__).split('/')[:-1])
self.train_path = os.path.join(cur, f'data/{categorycode}_biaozhu.txt')
self.vocab_path = os.path.join(cur, f'model/{categorycode}_vocab.txt')
self.embedding_file = os.path.join(cur, f'model/{categorycode}_vec_300.bin')
self.model_path = os.path.join(cur, f'model/{categorycode}_bilstm2_crf_model.h5')
self.word_dict = self.load_worddict()
if os.path.exists(f'data/param_dict/{categorycode}_param_dict.txt'):
self.class_dict = pickle.load(open(f'data/param_dict/{categorycode}_param_dict.txt','rb'))
else:
self.class_dict = dict_create(categorycode)
self.label_dict = {j:i for i,j in self.class_dict.items()}
self.EMBEDDING_DIM = 300
self.EPOCHS = 10
self.BATCH_SIZE = 128
self.NUM_CLASSES = len(self.class_dict)
        self.VOCAB_SIZE = len(self.word_dict)  # must match training (lstm_train uses len(word_dict)), or load_weights fails
self.TIME_STAMPS = 150
self.embedding_matrix = self.build_embedding_matrix()
self.model = self.tokenvec_bilstm2_crf_model()
self.model.load_weights(self.model_path)
'''Load the vocabulary'''
def load_worddict(self):
vocabs = [line.decode().strip() for line in open(self.vocab_path,'rb')]
word_dict = {wd: index for index, wd in enumerate(vocabs)}
return word_dict
'''Build the model input from raw text'''
def build_input(self, text):
x = []
for char in text:
if char not in self.word_dict:
char = 'UNK'
x.append(self.word_dict.get(char))
x = pad_sequences([x], self.TIME_STAMPS)
return x
    def predict(self, text):
        x = self.build_input(text)  # renamed from `str`, which shadowed the builtin
        raw = self.model.predict(x)[0][-self.TIME_STAMPS:]
result = [np.argmax(row) for row in raw]
chars = [i for i in text]
tags = [self.label_dict[i] for i in result][len(result)-len(text):]
res = list(zip(chars, tags))
#print(res)
return res
'''Load pre-trained character vectors'''
def load_pretrained_embedding(self):
embeddings_dict = {}
with open(self.embedding_file, 'rb') as f:
for line in f:
line = line.decode()
values = line.strip().split(' ')
if len(values) < 300:
continue
word = values[0]
coefs = np.asarray(values[1:], dtype='float32')
embeddings_dict[word] = coefs
print('Found %s word vectors.' % len(embeddings_dict))
return embeddings_dict
'''Build the embedding matrix'''
def build_embedding_matrix(self):
embedding_dict = self.load_pretrained_embedding()
embedding_matrix = np.zeros((self.VOCAB_SIZE + 1, self.EMBEDDING_DIM))
for word, i in self.word_dict.items():
embedding_vector = embedding_dict.get(word)
if embedding_vector is not None:
embedding_matrix[i] = embedding_vector
return embedding_matrix
'''Build the BiLSTM-CRF model with pre-trained embeddings'''
def tokenvec_bilstm2_crf_model(self):
model = Sequential()
embedding_layer = Embedding(self.VOCAB_SIZE + 1,
self.EMBEDDING_DIM,
weights=[self.embedding_matrix],
input_length=self.TIME_STAMPS,
trainable=False,
mask_zero=True)
model.add(embedding_layer)
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.5))
model.add(TimeDistributed(Dense(self.NUM_CLASSES)))
crf_layer = CRF(self.NUM_CLASSES, sparse_target=True)
model.add(crf_layer)
model.compile('adam', loss=crf_layer.loss_function, metrics=[crf_layer.accuracy])
model.summary()
return model
def clean(self):
K.clear_session()
del self.model
del self.class_dict
del self.embedding_matrix
del self.word_dict
    def param_extract(self, sentence):
        sentence = sentence.strip()  # was `senetnce`, a typo that silently created a second variable
        param_extract = self.predict(sentence)
        param_dict = {}
        for i in range(len(param_extract)):
            if param_extract[i][1] == 'O':
                continue
            elif param_extract[i][1].split('-')[0] == 'I':
                continue
            else:
                #print(param_extract[i][1])
                param_key = param_extract[i][1].split('-')[1]
                param_start = i
                param_end = len(sentence)
                for j in range(i+1, len(param_extract)):
                    #print(param_extract[j][1])
                    if param_extract[j][1] == 'O':  # was '0' (zero), which never matched a tag
                        param_end = j
                        break
                    elif param_extract[j][1].split('-')[0] != 'I':
                        param_end = j
                        break
                param_dict[param_key] = sentence[param_start:param_end].strip()
        return param_dict
if __name__ == '__main__':
ner_0101 = LSTMNER('0101')
while 1:
        a = input('Enter a product name: ')
#print(ner.param_extract('联想 ThinkPad E580 ThinkPad E580(02CD)15.6英寸轻薄窄边框笔记本电脑(i5-8250U 8G 256G PCIeSSD+1T 2G独显 FHD)黑色(计价单位:台)'))
print(ner_0101.param_extract(a))
#!/usr/bin/env python3
# coding: utf-8
# File: lstm_train.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-12-24
import numpy as np
from keras import backend as K
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dense, TimeDistributed, Dropout
from keras_contrib.layers.crf import CRF
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import os
import pickle
from dict_creator import dict_create
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
class LSTMNER:
def __init__(self, categorycode):
cur = '/'.join(os.path.abspath(__file__).split('/')[:-1])
self.train_path = os.path.join(cur, f'data/{categorycode}_biaozhu.txt')
self.vocab_path = os.path.join(cur, f'model/{categorycode}_vocab.txt')
self.embedding_file = os.path.join(cur, f'model/{categorycode}_vec_300.bin')
self.model_path = os.path.join(cur, f'model/{categorycode}_bilstm2_crf_model.h5')
self.datas, self.word_dict = self.build_data()
if os.path.exists(f'data/param_dict/{categorycode}_param_dict.txt'):
self.class_dict = pickle.load(open(f'data/param_dict/{categorycode}_param_dict.txt','rb'))
else:
self.class_dict = dict_create(categorycode)
self.EMBEDDING_DIM = 300
self.EPOCHS = 5
self.BATCH_SIZE = 128
self.NUM_CLASSES = len(self.class_dict)
self.VOCAB_SIZE = len(self.word_dict)
self.TIME_STAMPS = 150
self.embedding_matrix = self.build_embedding_matrix()
'''Build the dataset'''
def build_data(self):
datas = []
sample_x = []
sample_y = []
vocabs = {'UNK'}
for line in open(self.train_path,'r',encoding='utf-8'):
line = line.rstrip().split(' ')
if not line:
continue
char = line[0]
if not char:
continue
cate = line[-1].replace('M','I').replace('E','I')
sample_x.append(char)
sample_y.append(cate)
vocabs.add(char)
if char in ['。','?','!','!','?',';']:
datas.append([sample_x, sample_y])
sample_x = []
sample_y = []
word_dict = {wd:index for index, wd in enumerate(list(vocabs))}
self.write_file(list(vocabs), self.vocab_path)
return datas, word_dict
'''Convert the data into the format keras expects'''
def modify_data(self):
x_train = [[self.word_dict[char] for char in data[0]] for data in self.datas]
y_train = [[self.class_dict[label] for label in data[1]] for data in self.datas]
x_train = pad_sequences(x_train, self.TIME_STAMPS)
y = pad_sequences(y_train, self.TIME_STAMPS)
y_train = np.expand_dims(y, 2)
return x_train, y_train
'''Save the vocabulary file'''
def write_file(self, wordlist, filepath):
with open(filepath, 'w+',encoding='utf-8') as f:
f.write('\n'.join(wordlist))
'''Load pre-trained character vectors'''
def load_pretrained_embedding(self):
embeddings_dict = {}
with open(self.embedding_file, 'r',encoding='utf-8') as f:
for line in f:
values = line.strip().split(' ')
if len(values) < 300:
continue
word = values[0]
coefs = np.asarray(values[1:], dtype='float32')
embeddings_dict[word] = coefs
print('Found %s word vectors.' % len(embeddings_dict))
return embeddings_dict
'''Build the embedding matrix'''
def build_embedding_matrix(self):
embedding_dict = self.load_pretrained_embedding()
embedding_matrix = np.zeros((self.VOCAB_SIZE + 1, self.EMBEDDING_DIM))
for word, i in self.word_dict.items():
embedding_vector = embedding_dict.get(word)
if embedding_vector is not None:
embedding_matrix[i] = embedding_vector
return embedding_matrix
'''Build the BiLSTM-CRF model with pre-trained embeddings'''
def tokenvec_bilstm2_crf_model(self):
model = Sequential()
embedding_layer = Embedding(self.VOCAB_SIZE + 1,
self.EMBEDDING_DIM,
weights=[self.embedding_matrix],
input_length=self.TIME_STAMPS,
trainable=False,
mask_zero=True)
model.add(embedding_layer)
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.5))
model.add(TimeDistributed(Dense(self.NUM_CLASSES)))
crf_layer = CRF(self.NUM_CLASSES, sparse_target=True)
model.add(crf_layer)
model.compile('adam', loss=crf_layer.loss_function, metrics=[crf_layer.accuracy])
model.summary()
return model
'''Train the model'''
def train_model(self):
x_train, y_train = self.modify_data()
model = self.tokenvec_bilstm2_crf_model()
#model.load_weights(self.model_path)
        # StratifiedKFold over dummy (all-identical) labels degenerates to a plain shuffled KFold:
        # each round trains on (n_splits-1)/n_splits of the data and validates on the held-out fold.
        kfold = StratifiedKFold(n_splits=self.EPOCHS, shuffle=True)
        kfold_index = len(x_train)*['']
epoch_num = 0
for train,test in kfold.split(kfold_index, kfold_index):
epoch_num += 1
            print(f'Cross-validation training round {epoch_num}.')
model.fit(x_train[train], y_train[train], validation_data=(x_train[test],y_train[test]), batch_size=self.BATCH_SIZE, epochs=5)
model.save(self.model_path)
        print('Model training finished!')
#self.draw_train(history)
#model.save(self.model_path)
return model
'''Plot training curves'''
def draw_train(self, history):
# Plot training & validation accuracy values
plt.plot(history.history['acc'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train'], loc='upper left')
plt.show()
# Plot training & validation loss values
plt.plot(history.history['loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train'], loc='upper left')
plt.show()
# 7836/7836 [==============================] - 205s 26ms/step - loss: 17.1782 - acc: 0.9624
'''
6268/6268 [==============================] - 145s 23ms/step - loss: 18.5272 - acc: 0.7196 - val_loss: 15.7497 - val_acc: 0.8109
6268/6268 [==============================] - 142s 23ms/step - loss: 17.8446 - acc: 0.9099 - val_loss: 15.5915 - val_acc: 0.8378
6268/6268 [==============================] - 136s 22ms/step - loss: 17.7280 - acc: 0.9485 - val_loss: 15.5570 - val_acc: 0.8364
6268/6268 [==============================] - 133s 21ms/step - loss: 17.6918 - acc: 0.9593 - val_loss: 15.5187 - val_acc: 0.8451
6268/6268 [==============================] - 144s 23ms/step - loss: 17.6723 - acc: 0.9649 - val_loss: 15.4944 - val_acc: 0.8451
'''
if __name__ == '__main__':
ner = LSTMNER('0101')
ner.train_model()
W
x
I
2
E
Z
T
B
V
*
8
r
k

Q
"
%
7
R
F
C
P
i
)
6
a
;
S
U
0
K
9
?
n
线
e
H
+
J
3
D
绿
-
c
N
A
5
 
b
'
X
t
l
m
:
f
g
z
耀
G
.
v
p
h
s
d
y
L
/
便
w
4
UNK
M
u
O
1
o
Y
(
#!/usr/bin/env python3
# coding: utf-8
# File: transfer_data.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-12-24
import os
from collections import Counter
class TransferData:
def __init__(self):
cur = '/'.join(os.path.abspath(__file__).split('/')[:-1])
self.label_dict = {
'检查和检验': 'CHECK',
'症状和体征': 'SIGNS',
'疾病和诊断': 'DISEASE',
'治疗': 'TREATMENT',
'身体部位': 'BODY'}
self.cate_dict ={
'O':0,
'TREATMENT-I': 1,
'TREATMENT-B': 2,
'BODY-B': 3,
'BODY-I': 4,
'SIGNS-I': 5,
'SIGNS-B': 6,
'CHECK-B': 7,
'CHECK-I': 8,
'DISEASE-I': 9,
'DISEASE-B': 10
}
self.origin_path = os.path.join(cur, 'data_origin')
self.train_filepath = os.path.join(cur, 'train.txt')
return
def transfer(self):
f = open(self.train_filepath, 'w+')
count = 0
for root,dirs,files in os.walk(self.origin_path):
for file in files:
filepath = os.path.join(root, file)
if 'original' not in filepath:
continue
label_filepath = filepath.replace('.txtoriginal','')
print(filepath, '\t\t', label_filepath)
content = open(filepath).read().strip()
res_dict = {}
for line in open(label_filepath):
res = line.strip().split(' ')
start = int(res[1])
end = int(res[2])
label = res[3]
label_id = self.label_dict.get(label)
for i in range(start, end+1):
if i == start:
label_cate = label_id + '-B'
else:
label_cate = label_id + '-I'
res_dict[i] = label_cate
for indx, char in enumerate(content):
char_label = res_dict.get(indx, 'O')
print(char, char_label)
f.write(char + '\t' + char_label + '\n')
f.close()
return
if __name__ == '__main__':
handler = TransferData()
train_datas = handler.transfer()
# -*- coding:utf-8 -*-
import multiprocessing
from gensim.models import word2vec
def w2v_train(segment_dir = './data/segment/oil.txt', word2vec_path = './models/w2v/oil.model'):
sentences = word2vec.PathLineSentences(segment_dir)
model2 = train_wordVectors(sentences, embedding_size=300, window=5, min_count=1)
save_wordVectors(model2, word2vec_path)
def load_wordVectors(word2vec_path):
w2vModel = word2vec.Word2Vec.load(word2vec_path)
return w2vModel
def train_wordVectors(sentences, embedding_size = 300, window = 5, min_count = 5):
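    # gensim 3.x keyword names (size/iter); gensim 4+ renamed them to vector_size/epochs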
w2vModel = word2vec.Word2Vec(sentences, size=embedding_size, window=window, min_count=min_count,workers=multiprocessing.cpu_count(),iter=10,hs=1)
return w2vModel
def save_wordVectors(w2vModel,word2vec_path):
w2vModel.save(word2vec_path)
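# Minimal usage sketch (paths are hypothetical):
# w2v_train('./data/segment/oil.txt', './models/w2v/oil.model')
# model = load_wordVectors('./models/w2v/oil.model')
# vec = model['油']  # 300-dim vector, gensim 3.x item access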
# -*- coding: utf-8 -*-
import pymssql
import pandas as pd
from lxml import etree
import requests
import json
from function import *
import pickle
import decimal
import time
from w2v import *
def index_of_str(seq, sub_seq):
seq = ''.join(seq)
index=[]
n1=len(seq)
n2=len(sub_seq)
for i in range(n1-n2+1):
if seq[i:i+n2]==sub_seq:
index.append(i)
return index
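# e.g. index_of_str(['A', 'B', 'A', 'B'], 'AB') -> [0, 2]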
SubCategoryCode = '0101'
sql_ZIdatabase = sql_find('ZI_DataBase', False)
sql_ZIdatabase.cursor.execute(f"select SubTitle from VW_Relation_Property where ispeijian = '1' and SubCategoryCode = '{SubCategoryCode}'")
subtitle_lyst = [str(f"'{x[0]}'") for x in sql_ZIdatabase.cursor.fetchall()]
subtitle_lyst = ','.join(subtitle_lyst)
sql_ZIdatabase.cursor.execute(f"select ProductName,参数名称,参数值 from vw_productValue where SubCategoryCode = '{SubCategoryCode}' and 参数名称 in ({subtitle_lyst}) and ProductName not like '%wrong'")
data = sql_ZIdatabase.cursor.fetchall()
data = pd.DataFrame(data,columns=[tuple[0] for tuple in sql_ZIdatabase.cursor.description])
################################################################
f = open('Temporaryfolder/name_data.txt','w',encoding='utf-8')
for name in set(data['ProductName'].tolist()):
f.write(f'{name}\n')
f.close()
###############################################################
data = data.drop(['ProductName'], axis=1)
data = data.drop_duplicates()
data.to_excel(f'Temporaryfolder/{SubCategoryCode}_train_data.xlsx')
################################################################
f = open('Temporaryfolder/name_data.txt','r',encoding='utf-8')
g = open('Temporaryfolder/name_data_w2v.txt','w',encoding='utf-8')
for line in f:
line = line.replace(' ','')
line = ' '.join(list(line))
g.write(line)
f.close()
g.close()
################################################################
f = open('Temporaryfolder/name_data.txt','r',encoding='utf-8')
g = open(f'model/{SubCategoryCode}_vec_300.txt','w',encoding='utf-8')
w2v_train('Temporaryfolder/name_data_w2v.txt', f'{SubCategoryCode}.bin')
model_w2v = load_wordVectors(f'{SubCategoryCode}.bin')
word_data = []
for line in f:
line = line.replace(' ','')
line = list(line)
for word in line:
word_data.append(word)
print(len(word_data))
word_data = set(word_data)
word_data.discard('\u3000')  # discard() avoids a KeyError when the character is absent
word_data.discard('\n')
print(len(word_data))
for word in word_data:
g.write(f"{word} {' '.join([str(x) for x in model_w2v[word].tolist()])}\n")
g.close()
f.close()
################################################################
f = open('Temporaryfolder/name_data.txt','r',encoding='utf-8')
g = open(f'data/{SubCategoryCode}_train.txt','w',encoding='utf-8')
model_w2v = load_wordVectors(f'{SubCategoryCode}.bin')  # reuse the model trained above instead of a hard-coded '0101.bin'
word_data = []  # character sequence
m = 0
for line in f:
m += 1
#line = line.replace(' ','')
line = list(line)
line.append(';')
for word in line:
if word != '\n':
word_data.append(word)
sign_data = []  # tag sequence
for i in range(len(word_data)):
sign_data.append('O')
table = pd.read_excel(f'Temporaryfolder/{SubCategoryCode}_train_data.xlsx')  # path where the table was saved above
o = 0
for param_name, param_value in zip(table['参数名称'],table['参数值']):
print(o,end = '\r')
o += 1
    param_value = str(param_value).upper()  # guard against non-string cells from Excel
    param_str_len = len(param_value)
    if param_str_len < 2:
        continue
sign_list = index_of_str(word_data, param_value)
for n in sign_list:
sign_data[n] = f'{param_name}-B'
for j in range(param_str_len-1):
sign_data[n+j+1] = f'{param_name}-I'
for word,sign in zip(word_data,sign_data):
g.write(f'{word}\t{sign}\n')
f.close()
g.close()
# Put the generated "<SubCategoryCode>_vec_300.txt" into model/ as the word vectors and "<SubCategoryCode>_train.txt" into data/ as the training set (note: LSTMNER expects the vector file under the name <SubCategoryCode>_vec_300.bin).