<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
<option name="projectConfiguration" value="Nosetests" />
<option name="PROJECT_TEST_RUNNER" value="Nosetests" />
</component>
</module>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6.3 (~/anaconda3/bin/python)" project-jdk-type="Python SDK" />
</project>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/illness_entity_recognize.iml" filepath="$PROJECT_DIR$/.idea/illness_entity_recognize.iml" />
</modules>
</component>
</project>
# MedicalNamedEntityRecognition
Medical named entity recognition implemented with a character-embedding-based bidirectional LSTM + CRF model. A CCKS 2017 Chinese electronic medical record (EMR) named entity recognition project: it implements a four-layer bidirectional LSTM network with a CRF layer on top of character embeddings. The project provides raw training data samples (general items, discharge status, medical history, history highlights, diagnosis and treatment course) together with converted versions, training scripts, and pre-trained models, and can be used for sequence labeling research and benchmarking.
# Project Introduction
Structuring electronic medical records (EMRs) is the foundation for computers to understand and make use of them. With structured records, the relationships (and their probabilities) among symptoms, diseases, drugs, examinations and tests can be computed to build a medical knowledge graph and further streamline physicians' work.
The CCKS 2018 EMR named entity recognition evaluation task asks participants, given a set of plain-text EMR documents, to identify and extract the clinically relevant entities and classify them into predefined categories. For this task the organizers provided 600 annotated EMR documents, covering five entity types: anatomical site, independent symptom, symptom description, operation, and drug.
Domain named entity recognition is a classic sequence labeling problem in natural language processing; this project is an attempt to tackle it with deep learning.
# Experimental Data
1. Target tag set
O (non-entity), TREATMENT (treatment method), BODY (body part), SIGNS (symptoms and signs), CHECK (medical examination), DISEASE (disease).
2. Tagging scheme
BIO three-way tagging is used; the tag-to-id mapping is shown below, followed by a small worked example.
self.class_dict ={
'O':0,
'TREATMENT-I': 1,
'TREATMENT-B': 2,
'BODY-B': 3,
'BODY-I': 4,
'SIGNS-I': 5,
'SIGNS-B': 6,
'CHECK-B': 7,
'CHECK-I': 8,
'DISEASE-I': 9,
'DISEASE-B': 10
}
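For illustration, a tiny worked example (the phrase is invented) of mapping tagged characters to these ids via class_dict:
chars = ['右', '髋', '部', '疼', '痛']
tags = ['BODY-B', 'BODY-I', 'BODY-I', 'SIGNS-B', 'SIGNS-I']
ids = [class_dict[t] for t in tags]  # -> [3, 4, 4, 6, 5]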
3. Data conversion
The evaluation organizers provided four directories (general items, discharge status, history highlights, diagnosis and treatment course); each contains txtoriginal files (raw text) and txt annotation files, with contents like:
一般项目-1.txtoriginal.txt
女性,88岁,农民,双滦区应营子村人,主因右髋部摔伤后疼痛肿胀,活动受限5小时于2016-10-29;11:12入院。
一般项目-1.txt:
右髋部 21 23 身体部位
疼痛 27 28 症状和体征
肿胀 29 30 症状和体征
Conversion script:
def transfer(self):
f = open(self.train_filepath, 'w+')
count = 0
for root,dirs,files in os.walk(self.origin_path):
for file in files:
filepath = os.path.join(root, file)
if 'original' not in filepath:
continue
label_filepath = filepath.replace('.txtoriginal','')
print(filepath, '\t\t', label_filepath)
content = open(filepath).read().strip()
res_dict = {}
for line in open(label_filepath):
res = line.strip().split(' ')
start = int(res[1])
end = int(res[2])
label = res[3]
label_id = self.label_dict.get(label)
for i in range(start, end+1):
if i == start:
label_cate = label_id + '-B'
else:
label_cate = label_id + '-I'
res_dict[i] = label_cate
for indx, char in enumerate(content):
char_label = res_dict.get(indx, 'O')
print(char, char_label)
f.write(char + '\t' + char_label + '\n')
f.close()
return
Model output format (one character per line with its tag):
, O
男 O
, O
双 O
塔 O
山 O
人 O
, O
主 O
因 O
咳 SIGNS-B
嗽 SIGNS-I
、 O
少 SIGNS-B
痰 SIGNS-I
1 O
个 O
月 O
, O
加 O
重 O
3 O
天 O
, O
抽 SIGNS-B
搐 SIGNS-I
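The character-level tags above can be grouped back into entity spans. A minimal post-processing sketch (not part of the original scripts):
def collect_entities(pairs):
    '''Group (char, tag) pairs like the sample above into (entity, type) spans.'''
    entities, buf, cur = [], '', None
    for char, tag in pairs:
        if tag.endswith('-B'):  # a new entity begins
            if buf:
                entities.append((buf, cur))
            buf, cur = char, tag[:-2]
        elif tag.endswith('-I') and buf:  # continue the current entity
            buf += char
        else:  # 'O' (or a stray I-tag) closes any open entity
            if buf:
                entities.append((buf, cur))
            buf, cur = '', None
    if buf:
        entities.append((buf, cur))
    return entities
# collect_entities([('咳', 'SIGNS-B'), ('嗽', 'SIGNS-I'), ('、', 'O')]) -> [('咳嗽', 'SIGNS')]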
# Model Architecture
The model feeds pre-trained character vectors into an embedding layer, encodes them with two bidirectional LSTM layers, then applies a dense layer and finally a CRF layer for sequence labeling.
'''Build the BiLSTM-CRF model with pre-trained embeddings'''
def tokenvec_bilstm2_crf_model(self):
model = Sequential()
embedding_layer = Embedding(self.VOCAB_SIZE + 1,
self.EMBEDDING_DIM,
weights=[self.embedding_matrix],
input_length=self.TIME_STAMPS,
trainable=False,
mask_zero=True)
model.add(embedding_layer)
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.5))
model.add(TimeDistributed(Dense(self.NUM_CLASSES)))
crf_layer = CRF(self.NUM_CLASSES, sparse_target=True)
model.add(crf_layer)
model.compile('adam', loss=crf_layer.loss_function, metrics=[crf_layer.accuracy])
model.summary()
return model
# Results
1. Training:
| Model | Train samples | Test samples | Train accuracy | Test accuracy | Notes |
| :--- | :---: | :---: | :--- |:--- |:--- |
| Medical NER | 6268 | 1571 | 0.9649 | 0.8451 | 5 epochs |
2. Testing:
Run python lstm_predict.py to test the trained entity recognition model; sample interactions:
enter an sent:他最近头痛,流鼻涕,估计是发烧了
[('他', 'O'), ('最', 'O'), ('近', 'O'), ('头', 'SIGNS-B'), ('痛', 'SIGNS-I'), (',', 'O'), ('流', 'O'), ('鼻', 'O'), ('涕', 'O'), (',', 'O'), ('估', 'O'), ('计', 'O'), ('是', 'O'), ('发', 'SIGNS-B'), ('烧', 'SIGNS-I'), ('了', 'SIGNS-I')]
enter an sent:口腔溃疡可能需要多吃维生素
[('口', 'BODY-B'), ('腔', 'BODY-I'), ('溃', 'O'), ('疡', 'O'), ('可', 'O'), ('能', 'O'), ('需', 'O'), ('要', 'O'), ('多', 'O'), ('吃', 'O'), ('维', 'CHECK-B'), ('生', 'CHECK-B'), ('素', 'TREATMENT-I')]
enter an sent:他骨折了,可能需要拍片
[('他', 'O'), ('骨', 'SIGNS-B'), ('折', 'SIGNS-I'), ('了', 'O'), (',', 'O'), ('可', 'O'), ('能', 'O'), ('需', 'O'), ('要', 'O'), ('拍', 'O'), ('片', 'CHECK-I')]
# Summary
1. For the Chinese EMR named entity recognition task, this project implements a BiLSTM + CRF model.
2. Using character embeddings as the raw features, the model reaches 0.9649 accuracy on the training set and 0.8451 on the test set.
3. More features could be added to the training; other approaches will be explored in later iterations.
# contact
For questions or collaboration on natural language processing, knowledge graphs, event evolution graphs, social computing, or language resource construction, contact me:
Email: lhy_in_blcu@126.com
csdn: https://blog.csdn.net/lhy2014
My NLP projects: https://liuhuanyong.github.io/
Liu Huanyong, Institute of Software, Chinese Academy of Sciences
import pickle
import pymssql
class sql_find():
def __init__ (self, database='ZI_DataBase', localhost=True):
if localhost:
self.conn = pymssql.connect(host='localhost', user='zgc',password='1234',database=database,autocommit=True)
else:
self.conn = pymssql.connect(host='123.56.115.207', user='zgcprice3311',password='admin@2018@)!*',database=database,autocommit=True)
self.cursor = self.conn.cursor()
def dict_create(categorycode):
categorycode = str(categorycode).zfill(4)
mssql_find = sql_find(localhost=False)
class_dict = {'O':0}
m = 0
mssql_find.cursor.execute(f"select SubTitle from VW_Relation_Property where SubCategoryCode='{categorycode}' and (Isimportant = '1' or Ispeijian = '1')")
subtitle_list = mssql_find.cursor.fetchall()
for param in subtitle_list:
m += 1
class_dict[f'B-{param[0]}'] = m
m += 1
class_dict[f'I-{param[0]}'] = m
pickle.dump(class_dict,open(f'data/param_dict/{categorycode}_param_dict.txt','wb'))
return class_dict
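# Minimal usage sketch (assumes the SQL Server above is reachable; the category code is illustrative):
# class_dict = dict_create('0101')
# -> {'O': 0, 'B-<param name>': 1, 'I-<param name>': 2, ...},
#    also pickled to data/param_dict/0101_param_dict.txt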
# -*- coding: utf-8 -*-
import pymssql
import pandas as pd
import re
from lxml import etree
'''
class product():
def __init__(self, product_name, product_SKU, product_class_num = 'na'):
self.product_name = str(product_name)
self.product_SKU = str(product_SKU)
self.product_class = str(product_class_num)
def get_parameter(self, **kwargs):
'''
class sql_find():
def __init__ (self, database='ZI_DataBase', localhost=True):
if localhost:
self.conn = pymssql.connect(host='localhost', user='zgc',password='1234',database=database,autocommit=True)
else:
self.conn = pymssql.connect(host='123.56.115.207', user='zgcprice3311',password='admin@2018@)!*',database=database,autocommit=True)
self.cursor = self.conn.cursor()
class mysql_find():
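    # NOTE: pymssql speaks the SQL Server (TDS) protocol; a true MySQL host would need a driver such as pymysql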
def __init__ (self, database='ZI_DataBase', localhost=True):
if localhost:
self.conn = pymssql.connect(host='localhost', user='zgc',password='1234',database=database,autocommit=True)
else:
self.conn = pymssql.connect(host='59.110.219.171', user='root',password='qwertyuiop1',database=database,autocommit=True)
self.cursor = self.conn.cursor()
'''
def execute(self, sql_sentence):
self.cursor.execute(sql_sentence)
return self.cursor
'''
def BN(brand):
brand = str(brand)
try:
country = brand.split('[')[1].split(']')[-2]
brand = brand.replace(country,'')
except IndexError:
pass
res = re.findall(r'[0-9\u4E00-\u9FA5]', brand)
new_res = ''.join(res)
if new_res.isdigit():
new_res = ''
#print(len(new_res))
if len(new_res) == 0:
res1 = re.findall(r'[a-zA-Z0-9]', brand)
new_res = ''.join(res1)
new_res = new_res.upper()
return new_res
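# Intended behavior, traced from the logic above (examples are illustrative):
# BN('联想[中国]') -> '联想'      (bracketed country removed, CJK part kept)
# BN('ThinkPad')   -> 'THINKPAD' (no CJK/digit match -> ASCII fallback, uppercased)
# BN('3M')         -> '3M'       (pure digits alone are rejected, so ASCII fallback applies)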
class Index(object):
def __init__(self, number=50, decimal=2):
"""
:param decimal: 你保留的保留小数位
:param number: # 号的 个数
"""
self.decimal = decimal
self.number = number
self.a = 100/number # 在百分比 为几时增加一个 # 号
def __call__(self, now, total):
        # 1. compute the current percentage
        percentage = self.percentage_number(now, total)
        # 2. number of '#' marks for that percentage
        well_num = int(percentage / self.a)
        # print("well_num: ", well_num, percentage)
        # 3. build the textual progress bar
        progress_bar_num = self.progress_bar(well_num)
        # 4. '\r' prefix so the bar redraws in place on the same line
        result = "\r%s %s" % (progress_bar_num, percentage)
return result
def percentage_number(self, now, total):
"""
计算百分比
:param now: 现在的数
:param total: 总数
:return: 百分
"""
return round(now / total * 100, self.decimal)
def progress_bar(self, num):
"""
显示进度条位置
:param num: 拼接的 “#” 号的
:return: 返回的结果当前的进度条
"""
# 1. "#" 号个数
well_num = "#" * num
# 2. 空格的个数
space_num = " " * (self.number - num)
return '[%s%s]' % (well_num, space_num)
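# Minimal usage sketch (loop bounds are made up):
# bar = Index(number=50)
# for now in range(1, 101):
#     print(bar(now, 100), end='')  # __call__ returns a '\r'-prefixed bar plus the percentage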
class tool():
def __init__(self):
self.peijian_table = pd.read_excel('是否需要配件.xlsx')
self.brand_table = pd.read_excel('品牌对应表.xlsx')
    def judge_brand(self, brand, brandcode_original):
        if brandcode_original == '没有对应指数品牌':
            BRANDID = '没有对应指数品牌'
            for ID, Chinese_brand, English_brand in zip(self.brand_table['ID'], self.brand_table['中文品牌'], self.brand_table['英文品牌']):
                if brand == Chinese_brand:
                    BRANDID = str(ID).zfill(5)
                    return BRANDID
                elif BN(brand) == English_brand:
                    BRANDID = str(ID).zfill(5)
                    return BRANDID
            return BRANDID  # no match found: fall back to the placeholder (previously fell through and returned None)
        else:
            BRANDID = str(brandcode_original).zfill(5)[-5:]
            return BRANDID
def judge_peijian(self, data_table):
ispeijian_lyst = []
isunique_lyst = []
for class_code in data_table['指数子类编码']:
mark = '0'
mark2 = '0'
class_code = str(class_code).zfill(4)
if class_code != '没有匹配的指数子类编码':
for categorycode, ispeijian, isunique in zip(self.peijian_table['categorycode'], self.peijian_table['ispeijian'], self.peijian_table['isunique']):
if class_code == str(categorycode).zfill(4):
if str(ispeijian) != '0':
mark = '1'
if str(isunique) != '0':
mark2 = '1'
break
ispeijian_lyst.append(mark)
isunique_lyst.append(mark2)
else:
ispeijian_lyst.append(mark)
isunique_lyst.append(mark2)
#print(len(ispeijian_lyst), len(data_table['指数子类编码']))
data_table['有无配件'] = ispeijian_lyst
data_table['型号_only'] = isunique_lyst
return data_table
def judge_unit(string):
unit_list = {'MM','CM', 'DM', 'ML', 'W', 'KW'}
if not string[0].isdigit():
return True
m = 0
for char in string:
if char.isdigit() or char == '.':
m += 1
continue
elif char.isalpha():
if string[m:].upper() in unit_list:
return False
else:
return True
return True
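# Intended behavior, traced from the logic above (examples are illustrative):
# judge_unit('150MM') -> False  (number followed by a known unit, so not a model number)
# judge_unit('X230')  -> True   (does not start with a digit)
# judge_unit('123')   -> True   (digits only, no unit suffix)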
def type_extract_JD(name, params, brand):
#params = eval(params)
try:
brand_remove = re.findall(r"[A-Za-z0-9]+", brand)[0].upper()
except IndexError:
brand_remove = '没有英文品牌!'
param_xinghao = 'NA'
if '产品型号' in params:
param_xinghao = params['产品型号']
if '型号' in params:
param_xinghao = params['型号']
elif r'\t型号\t' in params:
param_xinghao = params[r'\t型号\t']
name_xinghao_lyst = list(filter(lambda x: len(x) >= 2, re.findall(r"[A-Za-z0-9-+/.*]+", name)))
for i in range(len(name_xinghao_lyst)):
name_xinghao_lyst[i] = name_xinghao_lyst[i].upper()
try:
name_xinghao_lyst.remove(brand_remove)
except ValueError:
pass
if len(name_xinghao_lyst) == 0:
#type_lyst.append(param_xinghao.upper())
return param_xinghao.upper()
else:
if param_xinghao in name_xinghao_lyst:
#type_lyst.append(param_xinghao.upper())
return param_xinghao.upper()
else:
xinghao_data = max(name_xinghao_lyst, key=len)
for xinghao in name_xinghao_lyst:
if len(xinghao) > 2 and '*' not in xinghao and judge_unit(xinghao):
xinghao_data = xinghao
break
            if not judge_unit(xinghao_data):
                xinghao_data = 'NA'  # was `==`, a comparison whose result was discarded
            #type_lyst.append(xinghao_data.upper())
            return xinghao_data.upper()
def type_extract(name, params):
#params = eval(params)
param_xinghao = 'NA'
if '型号' in params:
param_xinghao = params['型号']
elif r'\t型号\t' in params:
param_xinghao = params[r'\t型号\t']
name_xinghao_lyst = list(filter(lambda x: len(x) >= 2, re.findall(r"[A-Za-z0-9-+/.*]+", name)))
if len(name_xinghao_lyst) == 0:
#type_lyst.append(param_xinghao.upper())
return param_xinghao.upper()
else:
if param_xinghao in name_xinghao_lyst:
#type_lyst.append(param_xinghao.upper())
return param_xinghao.upper()
else:
xinghao_data = max(name_xinghao_lyst, key=len)
for xinghao in name_xinghao_lyst:
if len(xinghao) > 2 and '*' not in xinghao and judge_unit(xinghao):
xinghao_data = xinghao
break
            if not judge_unit(xinghao_data):
                xinghao_data = 'NA'  # was `==`, a comparison whose result was discarded
#type_lyst.append(xinghao_data.upper())
return xinghao_data.upper()
def param_load(product_id, xml_string):
"""
传入sku,和xml原始代码
:param product_id:sku
:param xml_string:xml数据
:return:csv
"""
xml_str = etree.HTML(xml_string)
#title = xml_str.xpath("//th[@class='tdTitle']")
secend = xml_str.xpath("//td[@class='tdTitle']")
zhi = xml_str.xpath("//tr//td[position()>1]")
data_dict = {}
for j, k in zip(secend, zhi):
#item = i.xpath("./text()")[0]
sec = j.xpath("./text()")[0]
value = k.xpath("./text()")[0]
data_dict[sec] = value
return data_dict
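# Minimal usage sketch (hypothetical JD-style parameter table markup):
# xml = "<table><tr><td class='tdTitle'>型号</td><td>X230</td></tr></table>"
# param_load('0000000000000', xml)  # -> {'型号': 'X230'}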
if __name__ == '__main__':
sqlserver = sql_find('ZI_BAK', True)
sqlserver.cursor.execute("select * from ZI_Price_Quote where productcode = '0506003750007'")
print(sqlserver.cursor.fetchall())
#!/usr/bin/env python3
# coding: utf-8
# File: lstm_predict.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-5-23
import numpy as np
from keras import backend as K
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential,load_model
from keras.layers import Embedding, Bidirectional, LSTM, Dense, TimeDistributed, Dropout
from keras_contrib.layers.crf import CRF
import matplotlib.pyplot as plt
import os
import pickle
from dict_creator import dict_create
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # valid levels are 0-3
class LSTMNER:
def __init__(self, categorycode):
cur = '/'.join(os.path.abspath(__file__).split('/')[:-1])
self.train_path = os.path.join(cur, f'data/{categorycode}_biaozhu.txt')
self.vocab_path = os.path.join(cur, f'model/{categorycode}_vocab.txt')
self.embedding_file = os.path.join(cur, f'model/{categorycode}_vec_300.bin')
self.model_path = os.path.join(cur, f'model/{categorycode}_bilstm2_crf_model.h5')
self.word_dict = self.load_worddict()
if os.path.exists(f'data/param_dict/{categorycode}_param_dict.txt'):
self.class_dict = pickle.load(open(f'data/param_dict/{categorycode}_param_dict.txt','rb'))
else:
self.class_dict = dict_create(categorycode)
self.label_dict = {j:i for i,j in self.class_dict.items()}
self.EMBEDDING_DIM = 300
self.EPOCHS = 10
self.BATCH_SIZE = 128
self.NUM_CLASSES = len(self.class_dict)
        self.VOCAB_SIZE = len(self.word_dict)  # must match training (lstm_train uses len(word_dict)), or load_weights fails
self.TIME_STAMPS = 150
self.embedding_matrix = self.build_embedding_matrix()
self.model = self.tokenvec_bilstm2_crf_model()
self.model.load_weights(self.model_path)
'''Load the vocabulary'''
def load_worddict(self):
vocabs = [line.decode().strip() for line in open(self.vocab_path,'rb')]
word_dict = {wd: index for index, wd in enumerate(vocabs)}
return word_dict
'''Build the model input from raw text'''
def build_input(self, text):
x = []
for char in text:
if char not in self.word_dict:
char = 'UNK'
x.append(self.word_dict.get(char))
x = pad_sequences([x], self.TIME_STAMPS)
return x
    def predict(self, text):
        x = self.build_input(text)  # renamed from `str`, which shadowed the builtin
        raw = self.model.predict(x)[0][-self.TIME_STAMPS:]
result = [np.argmax(row) for row in raw]
chars = [i for i in text]
tags = [self.label_dict[i] for i in result][len(result)-len(text):]
res = list(zip(chars, tags))
#print(res)
return res
'''Load pre-trained character vectors'''
def load_pretrained_embedding(self):
embeddings_dict = {}
with open(self.embedding_file, 'rb') as f:
for line in f:
line = line.decode()
values = line.strip().split(' ')
if len(values) < 300:
continue
word = values[0]
coefs = np.asarray(values[1:], dtype='float32')
embeddings_dict[word] = coefs
print('Found %s word vectors.' % len(embeddings_dict))
return embeddings_dict
'''Build the embedding matrix'''
def build_embedding_matrix(self):
embedding_dict = self.load_pretrained_embedding()
embedding_matrix = np.zeros((self.VOCAB_SIZE + 1, self.EMBEDDING_DIM))
for word, i in self.word_dict.items():
embedding_vector = embedding_dict.get(word)
if embedding_vector is not None:
embedding_matrix[i] = embedding_vector
return embedding_matrix
'''Build the BiLSTM-CRF model with pre-trained embeddings'''
def tokenvec_bilstm2_crf_model(self):
model = Sequential()
embedding_layer = Embedding(self.VOCAB_SIZE + 1,
self.EMBEDDING_DIM,
weights=[self.embedding_matrix],
input_length=self.TIME_STAMPS,
trainable=False,
mask_zero=True)
model.add(embedding_layer)
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.5))
model.add(TimeDistributed(Dense(self.NUM_CLASSES)))
crf_layer = CRF(self.NUM_CLASSES, sparse_target=True)
model.add(crf_layer)
model.compile('adam', loss=crf_layer.loss_function, metrics=[crf_layer.accuracy])
model.summary()
return model
def clean(self):
K.clear_session()
del self.model
del self.class_dict
del self.embedding_matrix
del self.word_dict
    def param_extract(self, sentence):
        sentence = sentence.strip()  # was `senetnce`, a typo that silently created a second variable
        param_extract = self.predict(sentence)
        param_dict = {}
        for i in range(len(param_extract)):
            if param_extract[i][1] == 'O':
                continue
            elif param_extract[i][1].split('-')[0] == 'I':
                continue
            else:
                #print(param_extract[i][1])
                param_key = param_extract[i][1].split('-')[1]
                param_start = i
                param_end = len(sentence)
                for j in range(i+1, len(param_extract)):
                    #print(param_extract[j][1])
                    if param_extract[j][1] == 'O':  # was '0' (zero), which never matched a tag
                        param_end = j
                        break
                    elif param_extract[j][1].split('-')[0] != 'I':
                        param_end = j
                        break
                param_dict[param_key] = sentence[param_start:param_end].strip()
        return param_dict
if __name__ == '__main__':
ner_0101 = LSTMNER('0101')
while 1:
        a = input('Enter a product name: ')
#print(ner.param_extract('联想 ThinkPad E580 ThinkPad E580(02CD)15.6英寸轻薄窄边框笔记本电脑(i5-8250U 8G 256G PCIeSSD+1T 2G独显 FHD)黑色(计价单位:台)'))
print(ner_0101.param_extract(a))
#!/usr/bin/env python3
# coding: utf-8
# File: lstm_train.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-12-24
import numpy as np
from keras import backend as K
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dense, TimeDistributed, Dropout
from keras_contrib.layers.crf import CRF
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import os
import pickle
from dict_creator import dict_create
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
class LSTMNER:
def __init__(self, categorycode):
cur = '/'.join(os.path.abspath(__file__).split('/')[:-1])
self.train_path = os.path.join(cur, f'data/{categorycode}_biaozhu.txt')
self.vocab_path = os.path.join(cur, f'model/{categorycode}_vocab.txt')
self.embedding_file = os.path.join(cur, f'model/{categorycode}_vec_300.bin')
self.model_path = os.path.join(cur, f'model/{categorycode}_bilstm2_crf_model.h5')
self.datas, self.word_dict = self.build_data()
if os.path.exists(f'data/param_dict/{categorycode}_param_dict.txt'):
self.class_dict = pickle.load(open(f'data/param_dict/{categorycode}_param_dict.txt','rb'))
else:
self.class_dict = dict_create(categorycode)
self.EMBEDDING_DIM = 300
self.EPOCHS = 5
self.BATCH_SIZE = 128
self.NUM_CLASSES = len(self.class_dict)
self.VOCAB_SIZE = len(self.word_dict)
self.TIME_STAMPS = 150
self.embedding_matrix = self.build_embedding_matrix()
'''Build the dataset'''
def build_data(self):
datas = []
sample_x = []
sample_y = []
vocabs = {'UNK'}
for line in open(self.train_path,'r',encoding='utf-8'):
line = line.rstrip().split(' ')
if not line:
continue
char = line[0]
if not char:
continue
cate = line[-1].replace('M','I').replace('E','I')
sample_x.append(char)
sample_y.append(cate)
vocabs.add(char)
if char in ['。','?','!','!','?',';']:
datas.append([sample_x, sample_y])
sample_x = []
sample_y = []
word_dict = {wd:index for index, wd in enumerate(list(vocabs))}
self.write_file(list(vocabs), self.vocab_path)
return datas, word_dict
'''Convert the data into the format keras expects'''
def modify_data(self):
x_train = [[self.word_dict[char] for char in data[0]] for data in self.datas]
y_train = [[self.class_dict[label] for label in data[1]] for data in self.datas]
x_train = pad_sequences(x_train, self.TIME_STAMPS)
y = pad_sequences(y_train, self.TIME_STAMPS)
y_train = np.expand_dims(y, 2)
return x_train, y_train
'''Save the vocabulary file'''
def write_file(self, wordlist, filepath):
with open(filepath, 'w+',encoding='utf-8') as f:
f.write('\n'.join(wordlist))
'''Load pre-trained character vectors'''
def load_pretrained_embedding(self):
embeddings_dict = {}
with open(self.embedding_file, 'r',encoding='utf-8') as f:
for line in f:
values = line.strip().split(' ')
if len(values) < 300:
continue
word = values[0]
coefs = np.asarray(values[1:], dtype='float32')
embeddings_dict[word] = coefs
print('Found %s word vectors.' % len(embeddings_dict))
return embeddings_dict
'''Build the embedding matrix'''
def build_embedding_matrix(self):
embedding_dict = self.load_pretrained_embedding()
embedding_matrix = np.zeros((self.VOCAB_SIZE + 1, self.EMBEDDING_DIM))
for word, i in self.word_dict.items():
embedding_vector = embedding_dict.get(word)
if embedding_vector is not None:
embedding_matrix[i] = embedding_vector
return embedding_matrix
'''Build the BiLSTM-CRF model with pre-trained embeddings'''
def tokenvec_bilstm2_crf_model(self):
model = Sequential()
embedding_layer = Embedding(self.VOCAB_SIZE + 1,
self.EMBEDDING_DIM,
weights=[self.embedding_matrix],
input_length=self.TIME_STAMPS,
trainable=False,
mask_zero=True)
model.add(embedding_layer)
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.5))
model.add(TimeDistributed(Dense(self.NUM_CLASSES)))
crf_layer = CRF(self.NUM_CLASSES, sparse_target=True)
model.add(crf_layer)
model.compile('adam', loss=crf_layer.loss_function, metrics=[crf_layer.accuracy])
model.summary()
return model
'''Train the model'''
def train_model(self):
x_train, y_train = self.modify_data()
model = self.tokenvec_bilstm2_crf_model()
#model.load_weights(self.model_path)
        # StratifiedKFold over dummy (all-identical) labels degenerates to a plain shuffled KFold:
        # each round trains on (n_splits-1)/n_splits of the data and validates on the held-out fold.
        kfold = StratifiedKFold(n_splits=self.EPOCHS, shuffle=True)
        kfold_index = len(x_train)*['']
epoch_num = 0
for train,test in kfold.split(kfold_index, kfold_index):
epoch_num += 1
            print(f'Cross-validation training round {epoch_num}.')
model.fit(x_train[train], y_train[train], validation_data=(x_train[test],y_train[test]), batch_size=self.BATCH_SIZE, epochs=5)
model.save(self.model_path)
        print('Model training finished!')
#self.draw_train(history)
#model.save(self.model_path)
return model
'''Plot training curves'''
def draw_train(self, history):
# Plot training & validation accuracy values
plt.plot(history.history['acc'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train'], loc='upper left')
plt.show()
# Plot training & validation loss values
plt.plot(history.history['loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train'], loc='upper left')
plt.show()
# 7836/7836 [==============================] - 205s 26ms/step - loss: 17.1782 - acc: 0.9624
'''
6268/6268 [==============================] - 145s 23ms/step - loss: 18.5272 - acc: 0.7196 - val_loss: 15.7497 - val_acc: 0.8109
6268/6268 [==============================] - 142s 23ms/step - loss: 17.8446 - acc: 0.9099 - val_loss: 15.5915 - val_acc: 0.8378
6268/6268 [==============================] - 136s 22ms/step - loss: 17.7280 - acc: 0.9485 - val_loss: 15.5570 - val_acc: 0.8364
6268/6268 [==============================] - 133s 21ms/step - loss: 17.6918 - acc: 0.9593 - val_loss: 15.5187 - val_acc: 0.8451
6268/6268 [==============================] - 144s 23ms/step - loss: 17.6723 - acc: 0.9649 - val_loss: 15.4944 - val_acc: 0.8451
'''
if __name__ == '__main__':
ner = LSTMNER('0101')
ner.train_model()
W
x
I
2
E
Z
T
B
V
*
8
r
k

Q
"
%
7
R
F
C
P
i
)
6
a
;
S
U
0
K
9
?
n
线
e
H
+
J
3
D
绿
-
c
N
A
5
 
b
'
X
t
l
m
:
f
g
z
耀
G
.
v
p
h
s
d
y
L
/
便
w
4
UNK
M
u
O
1
o
Y
(
#!/usr/bin/env python3
# coding: utf-8
# File: transfer_data.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-12-24
import os
from collections import Counter
class TransferData:
def __init__(self):
cur = '/'.join(os.path.abspath(__file__).split('/')[:-1])
self.label_dict = {
'检查和检验': 'CHECK',
'症状和体征': 'SIGNS',
'疾病和诊断': 'DISEASE',
'治疗': 'TREATMENT',
'身体部位': 'BODY'}
self.cate_dict ={
'O':0,
'TREATMENT-I': 1,
'TREATMENT-B': 2,
'BODY-B': 3,
'BODY-I': 4,
'SIGNS-I': 5,
'SIGNS-B': 6,
'CHECK-B': 7,
'CHECK-I': 8,
'DISEASE-I': 9,
'DISEASE-B': 10
}
self.origin_path = os.path.join(cur, 'data_origin')
self.train_filepath = os.path.join(cur, 'train.txt')
return
def transfer(self):
f = open(self.train_filepath, 'w+')
count = 0
for root,dirs,files in os.walk(self.origin_path):
for file in files:
filepath = os.path.join(root, file)
if 'original' not in filepath:
continue
label_filepath = filepath.replace('.txtoriginal','')
print(filepath, '\t\t', label_filepath)
content = open(filepath).read().strip()
res_dict = {}
for line in open(label_filepath):
res = line.strip().split(' ')
start = int(res[1])
end = int(res[2])
label = res[3]
label_id = self.label_dict.get(label)
for i in range(start, end+1):
if i == start:
label_cate = label_id + '-B'
else:
label_cate = label_id + '-I'
res_dict[i] = label_cate
for indx, char in enumerate(content):
char_label = res_dict.get(indx, 'O')
print(char, char_label)
f.write(char + '\t' + char_label + '\n')
f.close()
return
if __name__ == '__main__':
handler = TransferData()
train_datas = handler.transfer()
# -*- coding:utf-8 -*-
import multiprocessing
from gensim.models import word2vec
def w2v_train(segment_dir = './data/segment/oil.txt', word2vec_path = './models/w2v/oil.model'):
sentences = word2vec.PathLineSentences(segment_dir)
model2 = train_wordVectors(sentences, embedding_size=300, window=5, min_count=1)
save_wordVectors(model2, word2vec_path)
def load_wordVectors(word2vec_path):
w2vModel = word2vec.Word2Vec.load(word2vec_path)
return w2vModel
def train_wordVectors(sentences, embedding_size = 300, window = 5, min_count = 5):
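    # gensim 3.x keyword names (size/iter); gensim 4+ renamed them to vector_size/epochs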
w2vModel = word2vec.Word2Vec(sentences, size=embedding_size, window=window, min_count=min_count,workers=multiprocessing.cpu_count(),iter=10,hs=1)
return w2vModel
def save_wordVectors(w2vModel,word2vec_path):
w2vModel.save(word2vec_path)
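# Minimal usage sketch (paths are hypothetical):
# w2v_train('./data/segment/oil.txt', './models/w2v/oil.model')
# model = load_wordVectors('./models/w2v/oil.model')
# vec = model['油']  # 300-dim vector, gensim 3.x item access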
# -*- coding: utf-8 -*-
import pymssql
import pandas as pd
from lxml import etree
import requests
import json
from function import *
import pickle
import decimal
import time
from w2v import *
def index_of_str(seq, sub_seq):
seq = ''.join(seq)
index=[]
n1=len(seq)
n2=len(sub_seq)
for i in range(n1-n2+1):
if seq[i:i+n2]==sub_seq:
index.append(i)
return index
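# e.g. index_of_str(['A', 'B', 'A', 'B'], 'AB') -> [0, 2]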
SubCategoryCode = '0101'
sql_ZIdatabase = sql_find('ZI_DataBase', False)
sql_ZIdatabase.cursor.execute(f"select SubTitle from VW_Relation_Property where ispeijian = '1' and SubCategoryCode = '{SubCategoryCode}'")
subtitle_lyst = [str(f"'{x[0]}'") for x in sql_ZIdatabase.cursor.fetchall()]
subtitle_lyst = ','.join(subtitle_lyst)
sql_ZIdatabase.cursor.execute(f"select ProductName,参数名称,参数值 from vw_productValue where SubCategoryCode = '{SubCategoryCode}' and 参数名称 in ({subtitle_lyst}) and ProductName not like '%wrong'")
data = sql_ZIdatabase.cursor.fetchall()
data = pd.DataFrame(data,columns=[tuple[0] for tuple in sql_ZIdatabase.cursor.description])
################################################################
f = open('Temporaryfolder/name_data.txt','w',encoding='utf-8')
for name in set(data['ProductName'].tolist()):
f.write(f'{name}\n')
f.close()
###############################################################
data = data.drop(['ProductName'], axis=1)
data = data.drop_duplicates()
data.to_excel(f'Temporaryfolder/{SubCategoryCode}_train_data.xlsx')
################################################################
f = open('Temporaryfolder/name_data.txt','r',encoding='utf-8')
g = open('Temporaryfolder/name_data_w2v.txt','w',encoding='utf-8')
for line in f:
line = line.replace(' ','')
line = ' '.join(list(line))
g.write(line)
f.close()
g.close()
################################################################
f = open('Temporaryfolder/name_data.txt','r',encoding='utf-8')
g = open(f'model/{SubCategoryCode}_vec_300.txt','w',encoding='utf-8')
w2v_train('Temporaryfolder/name_data_w2v.txt', f'{SubCategoryCode}.bin')
model_w2v = load_wordVectors(f'{SubCategoryCode}.bin')
word_data = []
for line in f:
line = line.replace(' ','')
line = list(line)
for word in line:
word_data.append(word)
print(len(word_data))
word_data = set(word_data)
word_data.discard('\u3000')  # discard() avoids a KeyError when the character is absent
word_data.discard('\n')
print(len(word_data))
for word in word_data:
g.write(f"{word} {' '.join([str(x) for x in model_w2v[word].tolist()])}\n")
g.close()
f.close()
################################################################
f = open('Temporaryfolder/name_data.txt','r',encoding='utf-8')
g = open(f'data/{SubCategoryCode}_train.txt','w',encoding='utf-8')
model_w2v = load_wordVectors(f'{SubCategoryCode}.bin')  # reuse the model trained above instead of a hard-coded '0101.bin'
word_data = []  # character sequence
m = 0
for line in f:
m += 1
#line = line.replace(' ','')
line = list(line)
line.append(';')
for word in line:
if word != '\n':
word_data.append(word)
sign_data = []  # tag sequence
for i in range(len(word_data)):
sign_data.append('O')
table = pd.read_excel(f'Temporaryfolder/{SubCategoryCode}_train_data.xlsx')  # path where the table was saved above
o = 0
for param_name, param_value in zip(table['参数名称'],table['参数值']):
print(o,end = '\r')
o += 1
    param_value = str(param_value).upper()  # guard against non-string cells from Excel
    param_str_len = len(param_value)
    if param_str_len < 2:
        continue
sign_list = index_of_str(word_data, param_value)
for n in sign_list:
sign_data[n] = f'{param_name}-B'
for j in range(param_str_len-1):
sign_data[n+j+1] = f'{param_name}-I'
for word,sign in zip(word_data,sign_data):
g.write(f'{word}\t{sign}\n')
f.close()
g.close()
# Put the generated "<SubCategoryCode>_vec_300.txt" into model/ as the word vectors and "<SubCategoryCode>_train.txt" into data/ as the training set (note: LSTMNER expects the vector file under the name <SubCategoryCode>_vec_300.bin).