Commit fc402f95 authored by sanlu

for excel

parent 7e18ef67
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
<option name="projectConfiguration" value="Nosetests" />
<option name="PROJECT_TEST_RUNNER" value="Nosetests" />
</component>
</module>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6.3 (~/anaconda3/bin/python)" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/illness_entity_recognize.iml" filepath="$PROJECT_DIR$/.idea/illness_entity_recognize.iml" />
</modules>
</component>
</project>
\ No newline at end of file
# MedicalNamedEntityRecognition
Medical named entity recognition implemented with a bidirectional LSTM + CRF model over character embeddings. This project targets the CCKS2017 Chinese electronic medical record NER task and implements a four-layer character-embedding-based bidirectional LSTM network with a CRF layer. It provides the original training data samples (general items, discharge status, medical history, history features, diagnosis and treatment course) together with a converted version, training scripts, and a pre-trained model, and can be used for sequence labeling research, experimentation, and benchmarking.
# Project Introduction
Structuring electronic medical records is the foundation for computers to understand and apply them. With structured records, the relations and probabilities among symptoms, diseases, drugs, tests, and other knowledge points can be computed, supporting a medical knowledge graph that further streamlines physicians' work.
The CCKS2018 electronic medical record NER evaluation task asks participants to identify and extract clinically relevant entities from a set of plain-text records and classify them into predefined categories. For this task the organizers provided 600 annotated records covering five entity types: anatomical site, independent symptom, symptom description, operation, and drug.
Domain named entity recognition is a classic sequence labeling problem in natural language processing; this project is an attempt to solve it with deep learning.
# Experimental Data
1. Target label set
O: non-entity; TREATMENT: treatment; BODY: body part; SIGNS: signs and symptoms; CHECK: medical examination; DISEASE: disease entity
2. Labeling scheme
BIO tagging is used (B = entity begin, I = inside, O = outside), giving the following class dictionary (a worked example follows the dictionary):
self.class_dict = {
    'O': 0,
    'TREATMENT-I': 1,
    'TREATMENT-B': 2,
    'BODY-B': 3,
    'BODY-I': 4,
    'SIGNS-I': 5,
    'SIGNS-B': 6,
    'CHECK-B': 7,
    'CHECK-I': 8,
    'DISEASE-I': 9,
    'DISEASE-B': 10
}
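For example, under this scheme the annotated fragment from the sample below (右髋部 annotated as a body part, 疼痛 as a sign) is labeled character by character:

右 BODY-B
髋 BODY-I
部 BODY-I
疼 SIGNS-B
痛 SIGNS-I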
3. Data conversion
The organizers provided four directories (general items, discharge status, history features, diagnosis and treatment course), each containing txtoriginal files and txt annotation files formatted as follows:
一般项目-1.txtoriginal.txt
女性,88岁,农民,双滦区应营子村人,主因右髋部摔伤后疼痛肿胀,活动受限5小时于2016-10-29;11:12入院。
一般项目-1.txt:
右髋部 21 23 身体部位
疼痛 27 28 症状和体征
肿胀 29 30 症状和体征
Conversion script function:
def transfer(self):
    f = open(self.train_filepath, 'w+')
    for root, dirs, files in os.walk(self.origin_path):
        for file in files:
            filepath = os.path.join(root, file)
            if 'original' not in filepath:
                continue
            label_filepath = filepath.replace('.txtoriginal', '')
            print(filepath, '\t\t', label_filepath)
            content = open(filepath).read().strip()
            res_dict = {}
            # map each character position covered by an annotation to its B/I tag
            for line in open(label_filepath):
                res = line.strip().split(' ')
                start = int(res[1])
                end = int(res[2])
                label = res[3]
                label_id = self.label_dict.get(label)
                for i in range(start, end + 1):
                    if i == start:
                        label_cate = label_id + '-B'
                    else:
                        label_cate = label_id + '-I'
                    res_dict[i] = label_cate
            # emit one "char<TAB>label" line per character, O for non-entities
            for indx, char in enumerate(content):
                char_label = res_dict.get(indx, 'O')
                print(char, char_label)
                f.write(char + '\t' + char_label + '\n')
    f.close()
    return
Sample of the converted output:
, O
男 O
, O
双 O
塔 O
山 O
人 O
, O
主 O
因 O
咳 SIGNS-B
嗽 SIGNS-I
、 O
少 SIGNS-B
痰 SIGNS-I
1 O
个 O
月 O
, O
加 O
重 O
3 O
天 O
, O
抽 SIGNS-B
搐 SIGNS-I
# Model Architecture
The model feeds pre-trained character embeddings into an embedding layer, encodes the sequence with two bidirectional LSTM layers, adds a dense layer, and finally sends the result to a CRF layer for sequence labeling.
'''build the model: pre-trained embeddings, two BiLSTM layers, dense, CRF'''
def tokenvec_bilstm2_crf_model(self):
    model = Sequential()
    embedding_layer = Embedding(self.VOCAB_SIZE + 1,
                                self.EMBEDDING_DIM,
                                weights=[self.embedding_matrix],
                                input_length=self.TIME_STAMPS,
                                trainable=False,
                                mask_zero=True)
    model.add(embedding_layer)
    model.add(Bidirectional(LSTM(128, return_sequences=True)))
    model.add(Dropout(0.5))
    model.add(Bidirectional(LSTM(64, return_sequences=True)))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(self.NUM_CLASSES)))
    crf_layer = CRF(self.NUM_CLASSES, sparse_target=True)
    model.add(crf_layer)
    model.compile('adam', loss=crf_layer.loss_function, metrics=[crf_layer.accuracy])
    model.summary()
    return model
# Results
1. Training:

| Model | Train size | Test size | Train accuracy | Test accuracy | Notes |
| :--- | :---: | :---: | :--- | :--- | :--- |
| Medical NER | 6268 | 1571 | 0.9649 | 0.8451 | 5 epochs |

2. Testing:
Run python lstm_predict.py to test the trained entity recognition model. Sample results:
enter an sent:他最近头痛,流鼻涕,估计是发烧了
[('他', 'O'), ('最', 'O'), ('近', 'O'), ('头', 'SIGNS-B'), ('痛', 'SIGNS-I'), (',', 'O'), ('流', 'O'), ('鼻', 'O'), ('涕', 'O'), (',', 'O'), ('估', 'O'), ('计', 'O'), ('是', 'O'), ('发', 'SIGNS-B'), ('烧', 'SIGNS-I'), ('了', 'SIGNS-I')]
enter an sent:口腔溃疡可能需要多吃维生素
[('口', 'BODY-B'), ('腔', 'BODY-I'), ('溃', 'O'), ('疡', 'O'), ('可', 'O'), ('能', 'O'), ('需', 'O'), ('要', 'O'), ('多', 'O'), ('吃', 'O'), ('维', 'CHECK-B'), ('生', 'CHECK-B'), ('素', 'TREATMENT-I')]
enter an sent:他骨折了,可能需要拍片
[('他', 'O'), ('骨', 'SIGNS-B'), ('折', 'SIGNS-I'), ('了', 'O'), (',', 'O'), ('可', 'O'), ('能', 'O'), ('需', 'O'), ('要', 'O'), ('拍', 'O'), ('片', 'CHECK-I')]
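lstm_predict.py is not reproduced in full in this commit, so the following is only a minimal sketch of the decoding step it performs, assuming the model, word_dict, class_dict, and TIME_STAMPS produced by the training script below (the method name predict matches the call visible in the diff further down):

def predict(self, sentence):
    # map characters to vocabulary ids, falling back to the UNK entry
    ids = [self.word_dict.get(char, self.word_dict['UNK']) for char in sentence]
    # left-pad to the fixed sequence length used at training time
    x = pad_sequences([ids], self.TIME_STAMPS)
    # keep the tag scores for the real (non-padded) positions only
    raw = self.model.predict(x)[0][-len(sentence):]
    id2label = {idx: label for label, idx in self.class_dict.items()}
    tags = [id2label[int(np.argmax(row))] for row in raw]
    return list(zip(sentence, tags))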
# Summary
1. For the Chinese electronic medical record NER task, this project implements a named entity recognition model based on BiLSTM + CRF.
2. Using character embeddings as the raw features, the model reaches 0.9649 accuracy on the training set and 0.8451 on the test set.
3. More features could be added to the training; other approaches will be explored step by step in later work.
# Contact
For questions or collaboration on natural language processing, knowledge graphs, event graphs, social computing, or language resource construction, contact me:
Email: lhy_in_blcu@126.com
CSDN: https://blog.csdn.net/lhy2014
My NLP projects: https://liuhuanyong.github.io/
Liu Huanyong, Institute of Software, Chinese Academy of Sciences
File added
@@ -122,7 +122,7 @@ class LSTMNER:
         del self.word_dict
 
     def param_extract(self, sentence):
-        senetnce = sentence.strip()
+        senetnce = sentence.strip().upper()
         param_extract = self.predict(sentence)
         param_dict = {}
         for i in range(len(param_extract)):
@@ -143,7 +143,8 @@ class LSTMNER:
                 elif param_extract[j][1].split('-')[0] != 'I':
                     param_end = j
                     break
-            param_dict[param_key] = sentence[param_start:param_end].strip()
+            if not param_key in param_dict.keys():
+                param_dict[param_key] = sentence[param_start:param_end].strip()
         return param_dict
 
 if __name__ == '__main__':
#!/usr/bin/env python3
# coding: utf-8
# File: lstm_train.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-12-24
import numpy as np
from keras import backend as K
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM, Dense, TimeDistributed, Dropout
from keras_contrib.layers.crf import CRF
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import os
import pickle
from dict_creator import dict_create
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
class LSTMNER:
    def __init__(self, categorycode):
        cur = '/'.join(os.path.abspath(__file__).split('/')[:-1])
        self.train_path = os.path.join(cur, f'data/{categorycode}_biaozhu.txt')
        self.vocab_path = os.path.join(cur, f'model/{categorycode}_vocab.txt')
        self.embedding_file = os.path.join(cur, f'model/{categorycode}_vec_300.bin')
        self.model_path = os.path.join(cur, f'model/{categorycode}_bilstm2_crf_model.h5')
        self.datas, self.word_dict = self.build_data()
        if os.path.exists(f'data/param_dict/{categorycode}_param_dict.txt'):
            self.class_dict = pickle.load(open(f'data/param_dict/{categorycode}_param_dict.txt', 'rb'))
        else:
            self.class_dict = dict_create(categorycode)
        self.EMBEDDING_DIM = 300
        self.EPOCHS = 5
        self.BATCH_SIZE = 128
        self.NUM_CLASSES = len(self.class_dict)
        self.VOCAB_SIZE = len(self.word_dict)
        self.TIME_STAMPS = 150
        self.embedding_matrix = self.build_embedding_matrix()
    '''build the dataset: one (char sequence, label sequence) pair per sentence'''
    def build_data(self):
        datas = []
        sample_x = []
        sample_y = []
        vocabs = {'UNK'}
        for line in open(self.train_path, 'r', encoding='utf-8'):
            line = line.rstrip().split(' ')
            if not line:
                continue
            char = line[0]
            if not char:
                continue
            # normalize BMES-style tags to BIO
            cate = line[-1].replace('M', 'I').replace('E', 'I')
            sample_x.append(char)
            sample_y.append(cate)
            vocabs.add(char)
            # sentence-ending punctuation closes the current sample
            if char in ['。', '?', '!', '!', '?', ';']:
                datas.append([sample_x, sample_y])
                sample_x = []
                sample_y = []
        word_dict = {wd: index for index, wd in enumerate(list(vocabs))}
        self.write_file(list(vocabs), self.vocab_path)
        return datas, word_dict
    '''convert the data into the format keras expects'''
    def modify_data(self):
        x_train = [[self.word_dict[char] for char in data[0]] for data in self.datas]
        y_train = [[self.class_dict[label] for label in data[1]] for data in self.datas]
        x_train = pad_sequences(x_train, self.TIME_STAMPS)
        y = pad_sequences(y_train, self.TIME_STAMPS)
        # CRF with sparse_target=True expects labels of shape (batch, timesteps, 1)
        y_train = np.expand_dims(y, 2)
        return x_train, y_train
    '''save the vocabulary file'''
    def write_file(self, wordlist, filepath):
        with open(filepath, 'w+', encoding='utf-8') as f:
            f.write('\n'.join(wordlist))
    '''load the pre-trained character vectors'''
    def load_pretrained_embedding(self):
        embeddings_dict = {}
        with open(self.embedding_file, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.strip().split(' ')
                if len(values) < 300:
                    continue
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_dict[word] = coefs
        print('Found %s word vectors.' % len(embeddings_dict))
        return embeddings_dict
    '''build the embedding matrix'''
    def build_embedding_matrix(self):
        embedding_dict = self.load_pretrained_embedding()
        embedding_matrix = np.zeros((self.VOCAB_SIZE + 1, self.EMBEDDING_DIM))
        for word, i in self.word_dict.items():
            embedding_vector = embedding_dict.get(word)
            # characters without a pre-trained vector keep the all-zero row
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
        return embedding_matrix
    '''build the model: pre-trained embeddings, two BiLSTM layers, dense, CRF'''
    def tokenvec_bilstm2_crf_model(self):
        model = Sequential()
        embedding_layer = Embedding(self.VOCAB_SIZE + 1,
                                    self.EMBEDDING_DIM,
                                    weights=[self.embedding_matrix],
                                    input_length=self.TIME_STAMPS,
                                    trainable=False,
                                    mask_zero=True)
        model.add(embedding_layer)
        model.add(Bidirectional(LSTM(128, return_sequences=True)))
        model.add(Dropout(0.5))
        model.add(Bidirectional(LSTM(64, return_sequences=True)))
        model.add(Dropout(0.5))
        model.add(TimeDistributed(Dense(self.NUM_CLASSES)))
        crf_layer = CRF(self.NUM_CLASSES, sparse_target=True)
        model.add(crf_layer)
        model.compile('adam', loss=crf_layer.loss_function, metrics=[crf_layer.accuracy])
        model.summary()
        return model
    '''train the model'''
    def train_model(self):
        x_train, y_train = self.modify_data()
        model = self.tokenvec_bilstm2_crf_model()
        #model.load_weights(self.model_path)
        # StratifiedKFold is used here only to rotate a validation fold:
        # all labels are identical, so each split is one outer training round
        kfold = StratifiedKFold(n_splits=self.EPOCHS, shuffle=True)
        kfold_index = len(x_train) * ['']
        epoch_num = 0
        for train, test in kfold.split(kfold_index, kfold_index):
            epoch_num += 1
            print(f'cross-validation training round {epoch_num}')
            model.fit(x_train[train], y_train[train], validation_data=(x_train[test], y_train[test]), batch_size=self.BATCH_SIZE, epochs=5)
            model.save(self.model_path)
        print('model training finished!')
        #self.draw_train(history)
        #model.save(self.model_path)
        return model
    '''plot the training curves'''
    def draw_train(self, history):
        # Plot training accuracy values
        plt.plot(history.history['acc'])
        plt.title('Model accuracy')
        plt.ylabel('Accuracy')
        plt.xlabel('Epoch')
        plt.legend(['Train'], loc='upper left')
        plt.show()
        # Plot training loss values
        plt.plot(history.history['loss'])
        plt.title('Model loss')
        plt.ylabel('Loss')
        plt.xlabel('Epoch')
        plt.legend(['Train'], loc='upper left')
        plt.show()
# 7836/7836 [==============================] - 205s 26ms/step - loss: 17.1782 - acc: 0.9624
'''
6268/6268 [==============================] - 145s 23ms/step - loss: 18.5272 - acc: 0.7196 - val_loss: 15.7497 - val_acc: 0.8109
6268/6268 [==============================] - 142s 23ms/step - loss: 17.8446 - acc: 0.9099 - val_loss: 15.5915 - val_acc: 0.8378
6268/6268 [==============================] - 136s 22ms/step - loss: 17.7280 - acc: 0.9485 - val_loss: 15.5570 - val_acc: 0.8364
6268/6268 [==============================] - 133s 21ms/step - loss: 17.6918 - acc: 0.9593 - val_loss: 15.5187 - val_acc: 0.8451
6268/6268 [==============================] - 144s 23ms/step - loss: 17.6723 - acc: 0.9649 - val_loss: 15.4944 - val_acc: 0.8451
'''
if __name__ == '__main__':
    ner = LSTMNER('0101')
    ner.train_model()
\ No newline at end of file
import pandas as pd
from lstm_predict import LSTMNER
from function import *
import time
import os

# load the data
data_table = pd.read_excel('combine.xlsx', converters={'SKU': str, '指数子类编码': str, '指数品牌编码': str})
channel = 'LXWL'
now_time = time.strftime("%Y-%m-%d", time.localtime())
sql_ZIdatabase = sql_find('ZI_DataBase', False)
if os.path.isfile(f'{channel}_参数对应表_{now_time}.xlsx'):
    merge_data = pd.read_excel(f'{channel}_参数对应表_{now_time}.xlsx', converters={'ZI_SubCategoryCode': str})
else:
    sql_ZIdatabase.cursor.execute(f"select * from Product_Relation_Attribute_SubTitle where Source = '{channel}'")
    match_data = sql_ZIdatabase.cursor.fetchall()
    match_data = pd.DataFrame(match_data, columns=[col[0] for col in sql_ZIdatabase.cursor.description])
    sql_ZIdatabase.cursor.execute('select * from vw_relation_property')
    param_data_table = sql_ZIdatabase.cursor.fetchall()
    param_data_table = pd.DataFrame(param_data_table, columns=[col[0] for col in sql_ZIdatabase.cursor.description])
    merge_data = pd.merge(param_data_table, match_data[['ZI_SubTitle', 'Other_SubTitle', 'ZI_SubCategoryCode']], left_on=['SubCategoryCode', 'SubTitle'], right_on=['ZI_SubCategoryCode', 'ZI_SubTitle'])
    merge_data = merge_data.drop_duplicates().reset_index()
    merge_data = merge_data[(merge_data['ZI_SubTitle'].isnull() == False) | (merge_data['ISimportant'] == 1) | (merge_data['ispeijian'] == 1)]
    merge_data.to_excel(f'{channel}_参数对应表_{now_time}.xlsx')
category_list = set(data_table['指数子类编码'].tolist())
for categorycode in category_list:
    model = LSTMNER(categorycode)
    sql_ZIdatabase.cursor.execute(f"select SubTitle from VW_Relation_Property where SubCategoryCode='{categorycode}' and (Isimportant = '1' or Ispeijian = '1')")
    param_standard_lyst = [x[0] for x in sql_ZIdatabase.cursor.fetchall()]
    param_standard_dict = {}
    for param_standard in param_standard_lyst:
        param_standard_dict[param_standard] = []
    param_table = merge_data[(merge_data['ZI_SubCategoryCode'] == categorycode) & ((merge_data['ISimportant'] == 1) | (merge_data['ispeijian'] == 1))]
    param_dict = {}
    ZI_subtitle_list = []
    for other_subtitle, ZI_subtitle in zip(param_table['Other_SubTitle'], param_table['ZI_SubTitle']):
        param_dict[other_subtitle] = ZI_subtitle  # mapping from non-standard to standard parameter names
        ZI_subtitle_list.append(ZI_subtitle)
    data_categorycode = data_table[data_table['指数子类编码'] == categorycode]
    for name, params in zip(data_categorycode['产品名称'], data_categorycode['参数项']):
        product_param_dict = {}
        params = eval(params)  # the cell stores a Python dict literal
        # iterate over a copy of the keys: the dict is modified inside the loop
        for param_key in list(params.keys()):
            if '\t' in param_key or ' ' in param_key:
                params[param_key.replace('\t', '').replace(' ', '')] = params.pop(param_key)  # strip special characters from parameter names
        for param_key_2 in params.keys():
            if param_key_2 in param_dict.keys():
                product_param_dict[param_dict[param_key_2]] = params[param_key_2]
            elif param_key_2 in ZI_subtitle_list:
                product_param_dict[param_key_2] = params[param_key_2]
        # parameters extracted from the product name take precedence
        name_param_dict = model.param_extract(name)
        for name_param_key in name_param_dict.keys():
            product_param_dict[name_param_key] = name_param_dict[name_param_key]
        for key in param_standard_dict:
            param_standard_dict[key].append(product_param_dict.get(key, ''))
    model.clean()
    for key in param_standard_dict:
        data_categorycode[key] = param_standard_dict[key]
    #data_categorycode['整合参数项'] = product_param_dict_lyst
    data_categorycode.to_excel(f'out/{categorycode}_param_extract.xlsx')
from lstm_predict import LSTMNER
model = LSTMNER('0101')
model.param_extract('联想 ThinkPad E580 ThinkPad E580(02CD)15.6英寸轻薄窄边框笔记本电脑(i5-8250U 8G 256G PCIeSSD+1T 2G独显 FHD)黑色(计价单位:台)')
\ No newline at end of file
#!/usr/bin/env python3
# coding: utf-8
# File: transfer_data.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-12-24
import os
from collections import Counter
class TransferData:
    def __init__(self):
        cur = '/'.join(os.path.abspath(__file__).split('/')[:-1])
        self.label_dict = {
            '检查和检验': 'CHECK',
            '症状和体征': 'SIGNS',
            '疾病和诊断': 'DISEASE',
            '治疗': 'TREATMENT',
            '身体部位': 'BODY'}
        self.cate_dict = {
            'O': 0,
            'TREATMENT-I': 1,
            'TREATMENT-B': 2,
            'BODY-B': 3,
            'BODY-I': 4,
            'SIGNS-I': 5,
            'SIGNS-B': 6,
            'CHECK-B': 7,
            'CHECK-I': 8,
            'DISEASE-I': 9,
            'DISEASE-B': 10
        }
        self.origin_path = os.path.join(cur, 'data_origin')
        self.train_filepath = os.path.join(cur, 'train.txt')
        return
    def transfer(self):
        f = open(self.train_filepath, 'w+')
        for root, dirs, files in os.walk(self.origin_path):
            for file in files:
                filepath = os.path.join(root, file)
                if 'original' not in filepath:
                    continue
                label_filepath = filepath.replace('.txtoriginal', '')
                print(filepath, '\t\t', label_filepath)
                content = open(filepath).read().strip()
                res_dict = {}
                # map each character position covered by an annotation to its B/I tag
                for line in open(label_filepath):
                    res = line.strip().split(' ')
                    start = int(res[1])
                    end = int(res[2])
                    label = res[3]
                    label_id = self.label_dict.get(label)
                    for i in range(start, end + 1):
                        if i == start:
                            label_cate = label_id + '-B'
                        else:
                            label_cate = label_id + '-I'
                        res_dict[i] = label_cate
                # emit one "char<TAB>label" line per character, O for non-entities
                for indx, char in enumerate(content):
                    char_label = res_dict.get(indx, 'O')
                    print(char, char_label)
                    f.write(char + '\t' + char_label + '\n')
        f.close()
        return
if __name__ == '__main__':
    handler = TransferData()
    train_datas = handler.transfer()
\ No newline at end of file
# -*- coding: utf-8 -*-
import pymssql
import pandas as pd
from lxml import etree
import requests
import json
from function import *
import pickle
import decimal
import time
from w2v import *
def index_of_str(seq, sub_seq):
    '''return the start index of every occurrence of sub_seq in seq'''
    seq = ''.join(seq)
    index = []
    n1 = len(seq)
    n2 = len(sub_seq)
    for i in range(n1 - n2 + 1):
        if seq[i:i + n2] == sub_seq:
            index.append(i)
    return index
SubCategoryCode = '0101'
sql_ZIdatabase = sql_find('ZI_DataBase', False)
sql_ZIdatabase.cursor.execute(f"select SubTitle from VW_Relation_Property where ispeijian = '1' and SubCategoryCode = '{SubCategoryCode}'")
subtitle_lyst = [str(f"'{x[0]}'") for x in sql_ZIdatabase.cursor.fetchall()]
subtitle_lyst = ','.join(subtitle_lyst)
sql_ZIdatabase.cursor.execute(f"select ProductName,参数名称,参数值 from vw_productValue where SubCategoryCode = '{SubCategoryCode}' and 参数名称 in ({subtitle_lyst}) and ProductName not like '%wrong'")
data = sql_ZIdatabase.cursor.fetchall()
data = pd.DataFrame(data, columns=[col[0] for col in sql_ZIdatabase.cursor.description])
################################################################
f = open('Temporaryfolder/name_data.txt', 'w', encoding='utf-8')
for name in set(data['ProductName'].tolist()):
    f.write(f'{name}\n')
f.close()
###############################################################
data = data.drop(['ProductName'], axis=1)
data = data.drop_duplicates()
data.to_excel(f'Temporaryfolder/{SubCategoryCode}_train_data.xlsx')
################################################################
# split product names into space-separated characters for word2vec training
f = open('Temporaryfolder/name_data.txt', 'r', encoding='utf-8')
g = open('Temporaryfolder/name_data_w2v.txt', 'w', encoding='utf-8')
for line in f:
    line = line.replace(' ', '')
    line = ' '.join(list(line))
    g.write(line)
f.close()
g.close()
################################################################
f = open('Temporaryfolder/name_data.txt', 'r', encoding='utf-8')
g = open(f'model/{SubCategoryCode}_vec_300.txt', 'w', encoding='utf-8')
# train char-level word2vec vectors and dump one "char vec..." line per character
w2v_train('Temporaryfolder/name_data_w2v.txt', f'{SubCategoryCode}.bin')
model_w2v = load_wordVectors(f'{SubCategoryCode}.bin')
word_data = []
for line in f:
    line = line.replace(' ', '')
    line = list(line)
    for word in line:
        word_data.append(word)
print(len(word_data))
word_data = set(word_data)
word_data.discard('\u3000')  # discard() does not raise if the char is absent
word_data.discard('\n')
print(len(word_data))
for word in word_data:
    g.write(f"{word} {' '.join([str(x) for x in model_w2v[word].tolist()])}\n")
g.close()
f.close()
################################################################
f = open('Temporaryfolder/name_data.txt', 'r', encoding='utf-8')
g = open(f'data/{SubCategoryCode}_train.txt', 'w', encoding='utf-8')
model_w2v = load_wordVectors(f'{SubCategoryCode}.bin')
word_data = []  # character sequence
for line in f:
    #line = line.replace(' ','')
    line = list(line)
    line.append(';')  # sentence delimiter, consumed by LSTMNER.build_data
    for word in line:
        if word != '\n':
            word_data.append(word)
sign_data = ['O'] * len(word_data)  # label sequence, default O
table = pd.read_excel(f'Temporaryfolder/{SubCategoryCode}_train_data.xlsx')
o = 0
for param_name, param_value in zip(table['参数名称'], table['参数值']):
    print(o, end='\r')
    o += 1
    param_str_len = len(param_value)
    if param_str_len < 2:
        continue
    param_value = param_value.upper()
    # mark every occurrence of the parameter value with B/I tags
    sign_list = index_of_str(word_data, param_value)
    for n in sign_list:
        sign_data[n] = f'{param_name}-B'
        for j in range(param_str_len - 1):
            sign_data[n + j + 1] = f'{param_name}-I'
for word, sign in zip(word_data, sign_data):
    g.write(f'{word}\t{sign}\n')
f.close()
g.close()
# Put the generated "<SubCategoryCode>_vec_300.txt" into model/ as the word vectors, and "<SubCategoryCode>_train.txt" into data/ as the training set.
\ No newline at end of file