Commit 012513ce authored by Zhouxingyu

Model upgraded, accuracy 89%; added a calling method.

parent e6679741
@@ -169,6 +169,20 @@ Time usage: 0:00:05
 The output is written to the directory folder under the name `predicted_data.txt`.
+### Calling the model from code
+Invoke it as follows:
+```python
+from run_cnn import name2subcategory
+name_list = ['安赛瑞 消防斧 应急斧头破拆斧工具 消防器材斧子消防站配置 消防检查斧子 25847']
+a = name2subcategory()
+category = a.namelyst_predict(name_list)
+```
+Pass in a list containing one or more product names; a list with the predicted subcategory for each name is returned.
 ## RNN (recurrent neural network)
 ### Similar to the CNN above. This code has not been updated yet; following the CNN code, you can adapt it yourself into a format for training your own model.
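The returned list lines up index-for-index with the input list, so the result of the README snippet above can be consumed like this (a minimal sketch; the pairing shown is illustrative, not a real model output):

```python
# Minimal sketch: pair each input name with its predicted subcategory.
# `name_list` and `category` are the variables from the snippet above.
for name, sub in zip(name_list, category):
    print(f'{sub}\t{name}')
```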
@@ -5,21 +5,23 @@ import tensorflow as tf

 class TCNNConfig(object):
     """CNN configuration parameters"""

+    def __init__(self, num_classes):
+        self.num_classes = num_classes
+
-    embedding_dim = 64       # word-embedding dimension
+    embedding_dim = 128      # word-embedding dimension
     seq_length = 300         # sequence length
-    num_classes = 1078       # number of classes
+    #num_classes = 668       # number of classes (now passed in through __init__)
-    num_filters = 256        # number of convolution filters
+    num_filters = 1024       # number of convolution filters
-    kernel_size = 5          # convolution kernel size
+    kernel_size = 3          # convolution kernel size
     vocab_size = 5000        # vocabulary size

-    hidden_dim = 128         # neurons in the fully connected layer
+    hidden_dim = 256         # neurons in the fully connected layer

     dropout_keep_prob = 0.5  # dropout keep probability
     learning_rate = 1e-3     # learning rate

     batch_size = 64          # training batch size
-    num_epochs = 10          # total number of epochs
+    num_epochs = 20          # total number of epochs

     print_per_batch = 100    # print results every N batches
     save_per_batch = 10      # write to TensorBoard every N batches
@@ -46,19 +48,19 @@ class TextCNN(object):
             embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x)

         with tf.name_scope("cnn"):
-            # CNN layer
-            conv = tf.layers.conv1d(embedding_inputs, self.config.num_filters, self.config.kernel_size, name='conv')
+            # CNN layer, kernel size 3
+            conv_1 = tf.layers.conv1d(embedding_inputs, self.config.num_filters, self.config.kernel_size, name='conv_1')
             # global max pooling layer
-            gmp = tf.reduce_max(conv, reduction_indices=[1], name='gmp')
+            gmp_1 = tf.reduce_max(conv_1, reduction_indices=[1], name='gmp_1')

         with tf.name_scope("score"):
             # fully connected layer, followed by dropout and ReLU activation
-            fc = tf.layers.dense(gmp, self.config.hidden_dim, name='fc1')
-            fc = tf.contrib.layers.dropout(fc, self.keep_prob)
-            fc = tf.nn.relu(fc)
+            fc_1 = tf.layers.dense(gmp_1, self.config.hidden_dim, name='fc_1')
+            fc_1 = tf.contrib.layers.dropout(fc_1, self.keep_prob)
+            fc_1 = tf.nn.relu(fc_1)

             # classifier
-            self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2')
+            self.logits = tf.layers.dense(fc_1, self.config.num_classes, name='fc_2')
             self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1)  # predicted class

         with tf.name_scope("optimize"):
...
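For intuition about the tensor shapes flowing through this hunk, here is a minimal standalone TF 1.x sketch using the new hyperparameters (an illustration, not the commit's code):

```python
import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 300, 128])           # [batch, seq_length, embedding_dim]
conv_1 = tf.layers.conv1d(x, filters=1024, kernel_size=3)  # 'valid' padding -> [batch, 298, 1024]
gmp_1 = tf.reduce_max(conv_1, axis=1)                      # global max pool over time -> [batch, 1024]
print(conv_1.shape, gmp_1.shape)
```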
# coding: utf-8

import tensorflow as tf


class TCNNConfig(object):
    """RNN configuration parameters"""

    def __init__(self, num_classes):
        self.num_classes = num_classes

    # model parameters
    embedding_dim = 64       # word-embedding dimension
    seq_length = 600         # sequence length
    #num_classes = 8         # number of classes (now passed in through __init__)
    vocab_size = 5000        # vocabulary size

    num_layers = 2           # number of hidden layers
    hidden_dim = 128         # neurons per hidden layer
    rnn = 'gru'              # 'lstm' or 'gru'

    dropout_keep_prob = 0.8  # dropout keep probability
    learning_rate = 1e-3     # learning rate

    batch_size = 128         # training batch size
    num_epochs = 10          # total number of epochs

    print_per_batch = 100    # print results every N batches
    save_per_batch = 10      # write to TensorBoard every N batches


class TextCNN(object):
    """Text classification: RNN model"""

    def __init__(self, config):
        self.config = config

        # the three inputs to feed
        self.input_x = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x')
        self.input_y = tf.placeholder(tf.float32, [None, self.config.num_classes], name='input_y')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')

        self.rnn()

    def rnn(self):
        """RNN model"""

        def lstm_cell():  # LSTM cell
            return tf.contrib.rnn.BasicLSTMCell(self.config.hidden_dim, state_is_tuple=True)

        def gru_cell():  # GRU cell
            return tf.contrib.rnn.GRUCell(self.config.hidden_dim)

        def dropout():  # attach a dropout layer to each RNN cell
            if self.config.rnn == 'lstm':
                cell = lstm_cell()
            else:
                cell = gru_cell()
            return tf.contrib.rnn.DropoutWrapper(cell, output_keep_prob=self.keep_prob)

        # word-embedding lookup
        with tf.device('/cpu:0'):
            embedding = tf.get_variable('embedding', [self.config.vocab_size, self.config.embedding_dim])
            embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x)

        with tf.name_scope("rnn"):
            # multi-layer RNN
            cells = [dropout() for _ in range(self.config.num_layers)]
            rnn_cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)

            _outputs, _ = tf.nn.dynamic_rnn(cell=rnn_cell, inputs=embedding_inputs, dtype=tf.float32)
            last = _outputs[:, -1, :]  # take the output of the last time step as the result

        with tf.name_scope("score"):
            # fully connected layer, followed by dropout and ReLU activation
            fc = tf.layers.dense(last, self.config.hidden_dim, name='fc1')
            fc = tf.contrib.layers.dropout(fc, self.keep_prob)
            fc = tf.nn.relu(fc)

            # classifier
            self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2')
            self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1)  # predicted class

        with tf.name_scope("optimize"):
            # loss function: cross entropy
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_y)
            self.loss = tf.reduce_mean(cross_entropy)
            # optimizer
            self.optim = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(self.loss)

        with tf.name_scope("accuracy"):
            # accuracy
            correct_pred = tf.equal(tf.argmax(self.input_y, 1), self.y_pred_cls)
            self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
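For orientation, a minimal construction sketch for this file (the class count 10 is a placeholder assumption, not a value from the commit):

```python
# Build the RNN graph defined above; 10 is a hypothetical class count.
config = TCNNConfig(num_classes=10)
model = TextCNN(config)
# Train against model.optim / model.loss; evaluate with model.acc.
```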
...@@ -4,7 +4,7 @@ import sys ...@@ -4,7 +4,7 @@ import sys
from collections import Counter from collections import Counter
import numpy as np import numpy as np
import tensorflow.keras as kr from tensorflow import keras as kr
import pandas as pd import pandas as pd
if sys.version_info[0] > 2: if sys.version_info[0] > 2:
...@@ -83,8 +83,9 @@ def read_vocab(vocab_dir): ...@@ -83,8 +83,9 @@ def read_vocab(vocab_dir):
def read_category(): def read_category():
"""读取分类目录,固定""" """读取分类目录,固定"""
table = pd.read_excel('VW_ProductALLState.xlsx') table = pd.read_excel('Product_Api_Data.xlsx')
categories = list(set(table['SubCategoryName'].tolist())) categories = list(set(table['SubCategoryName'].tolist()))
categories.sort(key = table['SubCategoryName'].tolist().index)
#categories = ['0', '1'] #categories = ['0', '1']
categories = [native_content(x) for x in categories] categories = [native_content(x) for x in categories]
......
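The added `sort(key=...)` line matters: `set()` returns the unique subcategories in arbitrary order, and sorting by `.index` restores first-occurrence order, so the category-to-id mapping stays stable across runs. A standalone sketch of the pattern:

```python
# Deduplicate a list while preserving first-occurrence order.
lst = ['b', 'a', 'b', 'c', 'a']
categories = list(set(lst))     # unique values, arbitrary order
categories.sort(key=lst.index)  # -> ['b', 'a', 'c']
print(categories)
```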
 import pandas as pd
 from function import Index

-table = pd.read_excel('VW_ProductALLState.xlsx')
-#category_set = list(set(table['SubCategoryName'].tolist()))
+table = pd.read_excel('Product_Api_Data.xlsx')
+#category_set = list(set(table['categoryCodeName'].tolist()))
 #len(list(category_set))
 f = open('D:/Users/86183/Desktop/text-classification-cnn-rnn/data/name2category/name2category.val.txt','w',encoding='utf-8')
 g = open('D:/Users/86183/Desktop/text-classification-cnn-rnn/data/name2category/name2category.train.txt','w',encoding='utf-8')
@@ -12,6 +12,7 @@ len_table = len(table)
 index = Index()
 for category,name in zip(table['SubCategoryName'],table['ProductName']):
     name = str(name).replace('\t',' ')
+    category = str(category)
     if '错误子类' not in category:  # skip rows labeled 错误子类 ("wrong subcategory")
         if m%13 == 1:
             f.write(f'{category}\t{name}\n')
...
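Assuming `m` is a running row counter, the `m%13 == 1` test routes roughly one row in thirteen (about 7.7%) to the validation file and the rest to training. A quick standalone check of that assumption:

```python
# Sketch: the modulo-13 rule yields ~1/13 validation, ~12/13 training.
rows = list(range(1000))
val = [r for m, r in enumerate(rows) if m % 13 == 1]
train = [r for m, r in enumerate(rows) if m % 13 != 1]
print(len(val), len(train))  # 77 923
```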
-迷你植物 格林王 GREENKING 英国原装进口 IPA印度麦啤 精酿啤酒印度淡色艾尔啤酒 500ML*6瓶 整箱装
+迷你植物 格林王 GREENKING 英国原装进口 IPA印度麦啤 精酿啤酒印度淡色艾尔啤酒 500ML*6瓶
@@ -15,6 +15,7 @@ from sklearn import metrics
 from cnn_model import TCNNConfig, TextCNN
 from data.cnews_loader import read_vocab, read_category, batch_iter, process_file, build_vocab
 import pandas as pd
+from function import Index

 base_dir = 'data/name2category'
 train_dir = os.path.join(base_dir, 'name2category.train.txt')
@@ -182,12 +183,59 @@ def test():
     time_dif = get_time_dif(start_time)
     print("Time usage:", time_dif)

+class name2subcategory():
+    def __init__(self):
+        print('Configuring CNN model...')
+        self.table = pd.read_excel('Product_Api_Data.xlsx')
+        category_set = list(set(self.table['SubCategoryName'].tolist()))
+        self.config = TCNNConfig(len(category_set))
+        if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist yet
+            build_vocab(train_dir, vocab_dir, self.config.vocab_size)
+        # read_category() already returns the categories in first-occurrence order
+        self.categories, cat_to_id = read_category()
+        words, self.word_to_id = read_vocab(vocab_dir)
+        self.config.vocab_size = len(words)
+        self.model = TextCNN(self.config)
+
+    def namelyst_predict(self, contents):
+        from tensorflow import keras as kr
+        print("Loading predicted data...")
+        data_id = []
+        for i in range(len(contents)):
+            data_id.append([self.word_to_id[x] for x in contents[i] if x in self.word_to_id])
+        x_pred = kr.preprocessing.sequence.pad_sequences(data_id, self.config.seq_length)
+
+        session = tf.Session()
+        session.run(tf.global_variables_initializer())
+        saver = tf.train.Saver()
+        saver.restore(sess=session, save_path=save_path)  # restore the saved model
+
+        batch_size = 128
+        data_len = len(x_pred)
+        num_batch = int((data_len - 1) / batch_size) + 1
+        y_pred_cls = np.zeros(shape=len(x_pred), dtype=np.int32)  # holds the predicted class ids
+        for i in range(num_batch):  # process batch by batch
+            start_id = i * batch_size
+            end_id = min((i + 1) * batch_size, data_len)
+            feed_dict = {
+                self.model.input_x: x_pred[start_id:end_id],
+                self.model.keep_prob: 1.0
+            }
+            y_pred_cls[start_id:end_id] = session.run(self.model.y_pred_cls, feed_dict=feed_dict)
+
+        # map class ids back to subcategory names
+        y_pred_list = []
+        for m in range(len(y_pred_cls)):
+            y_pred_list.append(self.categories[y_pred_cls[m]])
+        return y_pred_list
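Note that `namelyst_predict` opens a fresh `tf.Session` and restores the checkpoint on every call. For repeated predictions, one option is to restore once and reuse the session; a sketch under that assumption (not part of this commit, and relying on `save_path` as defined in run_cnn.py):

```python
# Hypothetical caching variant: restore the checkpoint once, reuse for many predictions.
clf = name2subcategory()  # builds the graph
session = tf.Session()
tf.train.Saver().restore(sess=session, save_path=save_path)
# Reuse `session` across calls, feeding padded batches through
# session.run(clf.model.y_pred_cls, feed_dict=...) exactly as namelyst_predict does.
```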
 def predict():
-    import tensorflow.keras as kr
-    table = pd.read_excel('VW_ProductALLState.xlsx')
+    from tensorflow import keras as kr
+    table = pd.read_excel('Product_Api_Data.xlsx')
     categories = list(set(table['SubCategoryName'].tolist()))
+    categories.sort(key=table['SubCategoryName'].tolist().index)
     print("Loading predicted data...")
     f = open(pred_dir, 'r', encoding='utf-8', errors='ignore')
     contents = []
@@ -235,10 +283,14 @@ def predict():

 if __name__ == '__main__':
     if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test', 'predict']:
-        raise ValueError("""usage: python run_cnn.py [train / test]""")
+        raise ValueError("""usage: python run_cnn.py [train / test / predict]""")

     print('Configuring CNN model...')
-    config = TCNNConfig()
+    table = pd.read_excel('Product_Api_Data.xlsx')
+    SubCategoryName_list = table['SubCategoryName'].tolist()
+    category_set = list(set(SubCategoryName_list))
+    category_set.sort(key=SubCategoryName_list.index)
+    config = TCNNConfig(len(category_set))  # the class count comes from the spreadsheet
     if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist yet
         build_vocab(train_dir, vocab_dir, config.vocab_size)
     categories, cat_to_id = read_category()
...
# Example: predict subcategories for a list of product names.
from run_cnn import name2subcategory

name_list = ['安赛瑞 消防斧 应急斧头破拆斧工具 消防器材斧子消防站配置 消防检查斧子 25847']
a = name2subcategory()
category = a.namelyst_predict(name_list)