Commit 012513ce authored by Zhouxingyu's avatar Zhouxingyu

模型升级,准确度89%,添加调用方法。

parent e6679741
......@@ -169,6 +169,20 @@ Time usage: 0:00:05
输出在目录文件夹,名称为`predicted_data.txt`
### 功能调用
调用方法为:
```python
from run_cnn import name2subcategory
name_list = ['安赛瑞 消防斧 应急斧头破拆斧工具 消防器材斧子消防站配置 消防检查斧子 25847']
a = name2subcategory()
category = a.namelyst_predict(name_list)
```
输入一个包含多个产品名称的列表,返回与输入顺序一一对应的子类名称列表。
## RNN循环神经网络
### 和上面类似,代码尚未修改,可以根据CNN代码自行修改为训练自己模型的格式。
\ No newline at end of file
......@@ -5,21 +5,23 @@ import tensorflow as tf
class TCNNConfig(object):
"""CNN配置参数"""
def __init__(self, num_classes):
self.num_classes = num_classes
embedding_dim = 64 # 词向量维度
embedding_dim = 128 # 词向量维度
seq_length = 300 # 序列长度
num_classes = 1078 # 类别数
num_filters = 256 # 卷积核数目
kernel_size = 5 # 卷积核尺寸
#num_classes = 668 # 类别数
num_filters = 1024 # 卷积核数目
kernel_size = 3 # 卷积核尺寸
vocab_size = 5000 # 词汇表大小
hidden_dim = 128 # 全连接层神经元
hidden_dim = 256 # 全连接层神经元
dropout_keep_prob = 0.5 # dropout保留比例
learning_rate = 1e-3 # 学习率
batch_size = 64 # 每批训练大小
num_epochs = 10 # 总迭代轮次
num_epochs = 20 # 总迭代轮次
print_per_batch = 100 # 每多少轮输出一次结果
save_per_batch = 10 # 每多少轮存入tensorboard
......@@ -46,19 +48,19 @@ class TextCNN(object):
embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x)
with tf.name_scope("cnn"):
# CNN layer
conv = tf.layers.conv1d(embedding_inputs, self.config.num_filters, self.config.kernel_size, name='conv')
# CNN layer 3*3
conv_1 = tf.layers.conv1d(embedding_inputs, self.config.num_filters, self.config.kernel_size, name='conv_1')
# global max pooling layer
gmp = tf.reduce_max(conv, reduction_indices=[1], name='gmp')
gmp_1 = tf.reduce_max(conv_1, reduction_indices=[1], name='gmp_1')
with tf.name_scope("score"):
# 全连接层,后面接dropout以及relu激活
fc = tf.layers.dense(gmp, self.config.hidden_dim, name='fc1')
fc = tf.contrib.layers.dropout(fc, self.keep_prob)
fc = tf.nn.relu(fc)
fc_1 = tf.layers.dense(gmp_1, self.config.hidden_dim, name='fc_1')
fc_1 = tf.contrib.layers.dropout(fc_1, self.keep_prob)
fc_1 = tf.nn.relu(fc_1)
# 分类器
self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2')
self.logits = tf.layers.dense(fc_1, self.config.num_classes, name='fc_2')
self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1) # 预测类别
with tf.name_scope("optimize"):
......
# coding: utf-8
import tensorflow as tf
class TCNNConfig(object):
    """Configuration parameters for the RNN model.

    Everything except ``num_classes`` is a class-level default;
    ``num_classes`` is supplied per instance through the constructor.
    """

    # ---- model parameters ----
    embedding_dim = 64          # word-embedding dimension
    seq_length = 600            # sequence length
    vocab_size = 5000           # vocabulary size
    num_layers = 2              # number of hidden (recurrent) layers
    hidden_dim = 128            # units per hidden layer
    rnn = 'gru'                 # cell type: 'lstm' or 'gru'

    # ---- training parameters ----
    dropout_keep_prob = 0.8     # dropout keep probability
    learning_rate = 1e-3        # learning rate
    batch_size = 128            # samples per training batch
    num_epochs = 10             # total number of epochs

    # ---- reporting ----
    print_per_batch = 100       # interval (see name: batches) between printed results
    save_per_batch = 10         # interval (see name: batches) between tensorboard writes

    def __init__(self, num_classes):
        """Store the number of target classes for this configuration.

        Args:
            num_classes: number of output classes of the classifier.
        """
        self.num_classes = num_classes
class TextCNN(object):
    """Text classification, RNN model.

    Builds the whole graph at construction time and exposes the
    placeholders (``input_x``, ``input_y``, ``keep_prob``) together with
    ``logits``, ``y_pred_cls``, ``loss``, ``optim`` and ``acc``.
    """

    def __init__(self, config):
        """Create the input placeholders and build the network.

        Args:
            config: configuration object providing ``seq_length``,
                ``num_classes``, ``vocab_size``, ``embedding_dim``,
                ``num_layers``, ``hidden_dim``, ``rnn`` and
                ``learning_rate``.
        """
        self.config = config

        # The three tensors that are fed at run time.
        x_shape = [None, self.config.seq_length]
        y_shape = [None, self.config.num_classes]
        self.input_x = tf.placeholder(tf.int32, x_shape, name='input_x')
        self.input_y = tf.placeholder(tf.float32, y_shape, name='input_y')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')

        self.rnn()

    def rnn(self):
        """Build the RNN model: embedding, stacked cells, scoring, loss, accuracy."""
        cfg = self.config

        def make_wrapped_cell():
            # One recurrent cell (LSTM or GRU) followed by a dropout wrapper.
            if cfg.rnn == 'lstm':
                core = tf.contrib.rnn.BasicLSTMCell(cfg.hidden_dim, state_is_tuple=True)
            else:
                core = tf.contrib.rnn.GRUCell(cfg.hidden_dim)
            return tf.contrib.rnn.DropoutWrapper(core, output_keep_prob=self.keep_prob)

        # Embedding lookup, pinned to the CPU.
        with tf.device('/cpu:0'):
            embedding_shape = [cfg.vocab_size, cfg.embedding_dim]
            embedding = tf.get_variable('embedding', embedding_shape)
            embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x)

        with tf.name_scope("rnn"):
            # Multi-layer RNN.
            layers = []
            for _ in range(cfg.num_layers):
                layers.append(make_wrapped_cell())
            stacked = tf.contrib.rnn.MultiRNNCell(layers, state_is_tuple=True)

            outputs, _ = tf.nn.dynamic_rnn(
                cell=stacked, inputs=embedding_inputs, dtype=tf.float32)
            # Use the output of the last time step as the sequence summary.
            last = outputs[:, -1, :]

        with tf.name_scope("score"):
            # Fully connected layer followed by dropout and ReLU.
            hidden = tf.layers.dense(last, cfg.hidden_dim, name='fc1')
            hidden = tf.contrib.layers.dropout(hidden, self.keep_prob)
            hidden = tf.nn.relu(hidden)

            # Classifier.
            self.logits = tf.layers.dense(hidden, cfg.num_classes, name='fc2')
            probabilities = tf.nn.softmax(self.logits)
            self.y_pred_cls = tf.argmax(probabilities, 1)  # predicted class ids

        with tf.name_scope("optimize"):
            # Loss: mean softmax cross-entropy.
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.logits, labels=self.input_y)
            self.loss = tf.reduce_mean(cross_entropy)
            # Optimizer.
            optimizer = tf.train.AdamOptimizer(learning_rate=cfg.learning_rate)
            self.optim = optimizer.minimize(self.loss)

        with tf.name_scope("accuracy"):
            # Fraction of samples whose predicted class matches the label.
            true_cls = tf.argmax(self.input_y, 1)
            correct_pred = tf.equal(true_cls, self.y_pred_cls)
            self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
\ No newline at end of file
......@@ -4,7 +4,7 @@ import sys
from collections import Counter
import numpy as np
import tensorflow.keras as kr
from tensorflow import keras as kr
import pandas as pd
if sys.version_info[0] > 2:
......@@ -83,8 +83,9 @@ def read_vocab(vocab_dir):
def read_category():
"""读取分类目录,固定"""
table = pd.read_excel('VW_ProductALLState.xlsx')
table = pd.read_excel('Product_Api_Data.xlsx')
categories = list(set(table['SubCategoryName'].tolist()))
categories.sort(key = table['SubCategoryName'].tolist().index)
#categories = ['0', '1']
categories = [native_content(x) for x in categories]
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
import pandas as pd
from function import Index
table = pd.read_excel('VW_ProductALLState.xlsx')
#category_set = list(set(table['SubCategoryName'].tolist()))
table = pd.read_excel('Product_Api_Data.xlsx')
#category_set = list(set(table['categoryCodeName'].tolist()))
#len(list(category_set))
f = open('D:/Users/86183/Desktop/text-classification-cnn-rnn/data/name2category/name2category.val.txt','w',encoding='utf-8')
g = open('D:/Users/86183/Desktop/text-classification-cnn-rnn/data/name2category/name2category.train.txt','w',encoding='utf-8')
......@@ -12,6 +12,7 @@ len_table = len(table)
index = Index()
for category,name in zip(table['SubCategoryName'],table['ProductName']):
name = str(name).replace('\t',' ')
category = str(category)
if '错误子类' not in category:
if m%13 == 1:
f.write(f'{category}\t{name}\n')
......
迷你植物 格林王 GREENKING 英国原装进口 IPA印度麦啤 精酿啤酒印度淡色艾尔啤酒 500ML*6瓶 整箱装
\ No newline at end of file
迷你植物 格林王 GREENKING 英国原装进口 IPA印度麦啤 精酿啤酒印度淡色艾尔啤酒 500ML*6瓶
\ No newline at end of file
......@@ -15,6 +15,7 @@ from sklearn import metrics
from cnn_model import TCNNConfig, TextCNN
from data.cnews_loader import read_vocab, read_category, batch_iter, process_file, build_vocab
import pandas as pd
from function import Index
base_dir = 'data/name2category'
train_dir = os.path.join(base_dir, 'name2category.train.txt')
......@@ -182,12 +183,59 @@ def test():
time_dif = get_time_dif(start_time)
print("Time usage:", time_dif)
class name2subcategory():
    """Predict product sub-categories from product names with the saved CNN.

    Construction loads the vocabulary and the category table and builds the
    ``TextCNN`` graph; ``namelyst_predict`` restores the checkpoint stored at
    ``save_path`` and classifies a list of product names.

    Attributes:
        word_to_id: symbol -> id mapping returned by ``read_vocab``.
        table: DataFrame read from ``Product_Api_Data.xlsx``.
        categories: distinct ``SubCategoryName`` values in order of first
            appearance in ``table``; position ``i`` is the name used for
            predicted class id ``i``.
        config: ``TCNNConfig`` sized to the number of categories, with
            ``vocab_size`` set to the length of the loaded vocabulary.
        model: ``TextCNN`` instance built from ``config``.
    """

    def __init__(self):
        """Load vocabulary and categories, then build the model graph."""
        print('Configuring CNN model...')
        if not os.path.exists(vocab_dir):  # rebuild the vocabulary if missing
            # The module-level ``config`` is only created when run_cnn.py is
            # executed as a script; when this class is imported it does not
            # exist, so take the default vocabulary size from the class.
            build_vocab(train_dir, vocab_dir, TCNNConfig.vocab_size)
        words, self.word_to_id = read_vocab(vocab_dir)

        self.table = pd.read_excel('Product_Api_Data.xlsx')

        # Distinct sub-category names ordered by first appearance.  Built in
        # one pass instead of list(set(...)).sort(key=list.index).
        first_seen = {}
        for position, sub_category in enumerate(self.table['SubCategoryName'].tolist()):
            first_seen.setdefault(sub_category, position)
        self.categories = sorted(first_seen, key=first_seen.get)

        self.config = TCNNConfig(len(self.categories))
        self.config.vocab_size = len(words)
        self.model = TextCNN(self.config)

    def namelyst_predict(self, contents):
        """Return the predicted sub-category name for each product name.

        Args:
            contents: list of product-name strings.

        Returns:
            A list with one sub-category name per entry of ``contents``, in
            the same order.  An empty input yields an empty list.
        """
        from tensorflow import keras as kr

        print("Loading predicted data...")
        if len(contents) == 0:
            # Nothing to classify; skip restoring the checkpoint.
            return []

        # Encode every name as the ids of its in-vocabulary symbols; symbols
        # missing from the vocabulary are dropped.
        data_id = [
            [self.word_to_id[symbol] for symbol in text if symbol in self.word_to_id]
            for text in contents
        ]
        x_pred = kr.preprocessing.sequence.pad_sequences(data_id, self.config.seq_length)

        batch_size = 128
        data_len = len(x_pred)
        y_pred_cls = np.zeros(shape=data_len, dtype=np.int32)  # predicted class ids

        # The context manager closes the session (and releases its
        # resources) even if restoring or running the graph fails.
        with tf.Session() as session:
            session.run(tf.global_variables_initializer())
            saver = tf.train.Saver()
            saver.restore(sess=session, save_path=save_path)  # load the saved model

            for start_id in range(0, data_len, batch_size):  # batch by batch
                end_id = min(start_id + batch_size, data_len)
                feed_dict = {
                    self.model.input_x: x_pred[start_id:end_id],
                    self.model.keep_prob: 1.0  # no dropout at inference time
                }
                y_pred_cls[start_id:end_id] = session.run(
                    self.model.y_pred_cls, feed_dict=feed_dict)

        # Map class ids back to sub-category names.
        return [self.categories[class_id] for class_id in y_pred_cls]
def predict():
import tensorflow.keras as kr
from tensorflow import keras as kr
table = pd.read_excel('VW_ProductALLState.xlsx')
table = pd.read_excel('Product_Api_Data.xlsx')
categories = list(set(table['SubCategoryName'].tolist()))
categories.sort(key = table['SubCategoryName'].tolist().index)
print("Loading predicted data...")
f = open(pred_dir, 'r', encoding='utf-8', errors='ignore')
contents = []
......@@ -235,10 +283,14 @@ def predict():
if __name__ == '__main__':
if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test', 'predict']:
raise ValueError("""usage: python run_cnn.py [train / test]""")
raise ValueError("""usage: python run_cnn.py [train / test / predict]""")
print('Configuring CNN model...')
config = TCNNConfig()
table = pd.read_excel('Product_Api_Data.xlsx')
SubCategoryName_list = table['SubCategoryName'].tolist()
category_set = list(set(SubCategoryName_list))
category_set.sort(key = SubCategoryName_list.index)
config = TCNNConfig(len(category_set))
if not os.path.exists(vocab_dir): # 如果不存在词汇表,重建
build_vocab(train_dir, vocab_dir, config.vocab_size)
categories, cat_to_id = read_category()
......
# Usage example: predict the sub-category of product names via run_cnn.
from run_cnn import name2subcategory
# Product names to classify (a single sample name here).
name_list = ['安赛瑞 消防斧 应急斧头破拆斧工具 消防器材斧子消防站配置 消防检查斧子 25847']
# Build the predictor object.
a = name2subcategory()
# Run the prediction on the list of names and keep the result.
category = a.namelyst_predict(name_list)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment