Commit f6030967 authored by Zhouxingyu's avatar Zhouxingyu

首次上传

parents
Pipeline #70 failed with stages
{
"python.pythonPath": "D:\\Python36\\python.exe"
}
\ No newline at end of file
This diff is collapsed.
import torch.nn as nn
import gc
import torch as t
import numpy as np
from gensim.models import word2vec
import multiprocessing
class classifier(nn.Module):
    """Map a 128-dimensional input vector to 5 output scores.

    The network is a plain stack of three ``nn.Linear`` layers
    (128 -> 128 -> 64 -> 5) with no activation between them.  The layers are
    stored under ``word_class`` so the parameter names in a saved
    ``state_dict`` are ``word_class.0`` .. ``word_class.2``.

    ``init_weights`` is accepted by the constructor but never read.
    """

    def __init__(self, init_weights=True):
        super().__init__()
        layers = [
            nn.Linear(128, 128),
            nn.Linear(128, 64),
            nn.Linear(64, 5),
        ]
        self.word_class = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw output of the linear stack for ``x``."""
        return self.word_class(x)
# Disabled setup: the triple-quoted string below is a bare expression
# statement, so none of the lines inside it execute.  A live version of this
# setup appears later in the file, after the helper definitions.
'''
net = classifier()
optimizer = t.optim.Adam(net.parameters(), lr=0.001)
loss_function = nn.MSELoss()
acc = lambda y1,y2: np.sqrt(np.sum(y1**2+y2**2)/len(y1))
'''
def sen_normal(str):
    """Normalise the parentheses of one line of text.

    Every opening or closing parenthesis (ASCII or full-width) is replaced by
    a single space, and every space that lies between an opening parenthesis
    and its paired closing parenthesis is removed, so the bracketed text ends
    up as one space-delimited token.

    Parentheses are paired by order of appearance (first opening with first
    closing, second with second, ...), not by nesting depth.

    Args:
        str: the text to normalise.  The name shadows the builtin and is kept
            only so existing callers are unaffected.

    Returns:
        The normalised text.  All other characters, including ``$``, are
        passed through unchanged.
    """
    chars = list(str)
    opening, closing = [], []
    for idx, ch in enumerate(chars):
        # Escapes are used for the full-width forms so they survive any
        # width normalisation applied to this source file.
        if ch in ('(', '\uff08'):
            chars[idx] = ' '
            opening.append(idx)
        elif ch in (')', '\uff09'):
            chars[idx] = ' '
            closing.append(idx)

    # Record positions to drop instead of overwriting them with a sentinel
    # character, so no character that is really in the input gets deleted.
    drop = set()
    for start, end in zip(opening, closing):
        # Every position strictly between the two parentheses is examined,
        # including the one immediately before the closing parenthesis.
        for idx in range(start + 1, end):
            if chars[idx] == ' ':
                drop.add(idx)

    return ''.join(ch for idx, ch in enumerate(chars) if idx not in drop)
def save_model(epoch, dir):
    """Write a checkpoint of the module-level ``net`` and ``optimizer``.

    The checkpoint is a dict with the keys ``'net'``, ``'optimizer'`` and
    ``'epoch'``; it is serialised to ``dir`` with ``torch.save``.
    """
    checkpoint = dict(
        net=net.state_dict(),
        optimizer=optimizer.state_dict(),
        epoch=epoch,
    )
    t.save(checkpoint, dir)
def load_wordVectors(word2vec_path):
    """Load and return the gensim Word2Vec model stored at ``word2vec_path``."""
    return word2vec.Word2Vec.load(word2vec_path)
def train_wordVectors(sentences, embedding_size = 128, window = 5, min_count = 5):
    """Train and return a gensim Word2Vec model on ``sentences``.

    ``embedding_size``, ``window`` and ``min_count`` are forwarded as the
    ``size``, ``window`` and ``min_count`` keyword arguments.  Training uses
    one worker per CPU reported by ``multiprocessing.cpu_count()``,
    ``iter=10`` and ``hs=1``.
    """
    options = dict(
        size=embedding_size,
        window=window,
        min_count=min_count,
        workers=multiprocessing.cpu_count(),
        iter=10,
        hs=1,
    )
    return word2vec.Word2Vec(sentences, **options)
def save_wordVectors(w2vModel,word2vec_path):
    """Persist ``w2vModel`` to ``word2vec_path`` using the model's own ``save``."""
    target = word2vec_path
    w2vModel.save(target)
def train(x, y):
    """Run one optimisation step of the module-level ``net`` on a batch.

    Args:
        x: input batch passed to ``net``.
        y: target batch compared against the output by ``loss_function``.

    Returns:
        ``(loss, prediction)``: the loss of this batch and the output that
        ``net`` produced before the parameters were updated.
    """
    output = net(x)                        # forward pass
    batch_loss = loss_function(output, y)  # argument order: (prediction, target)
    optimizer.zero_grad()                  # discard gradients of the previous step
    batch_loss.backward()                  # back-propagate the error
    optimizer.step()                       # apply the update to net.parameters()
    return batch_loss, output
#g = open('data_Processed.txt', 'a+', encoding='utf-8', errors='ignore')
##############################
def countNum(string):
    """Count the characters of ``string`` in three categories.

    Returns:
        A three-element list ``[cjk, alpha, other]`` where

        * index 0 counts characters in the range U+4E00..U+9FA5,
        * index 1 counts the remaining characters for which ``isalpha()`` is
          true,
        * index 2 counts everything else (digits, punctuation, whitespace...).
    """
    cjk = alpha = other = 0
    for ch in string:
        if '\u4e00' <= ch <= '\u9fa5':
            cjk += 1
        elif ch.isalpha():
            alpha += 1
        else:
            other += 1
    return [cjk, alpha, other]
def judge_1(string):
    """Pick class 1, 2 or 3 for ``string`` from its ``countNum`` tallies.

    Returns:
        3 if the string has any character that is neither in U+4E00..U+9FA5
        nor alphabetic; otherwise 1 if it consists only of alphabetic
        characters outside that range; otherwise 2 (this includes the empty
        string).
    """
    cjk, alpha, other = countNum(string)
    if other != 0:
        return 3
    if cjk == 0 and alpha != 0:
        return 1
    return 2
def judge_2(string):
    """Pick class 2 or 3 for ``string`` from its ``countNum`` tallies.

    Returns:
        2 when the third ``countNum`` tally (characters that are neither in
        U+4E00..U+9FA5 nor alphabetic) is zero, otherwise 3.
    """
    other = countNum(string)[2]
    return 2 if other == 0 else 3
def class_select(length, position, word):
    """Choose a class index for the token at ``position`` in a line.

    Args:
        length: number of tokens in the line.
        position: index of the token within the line.
        word: the token text; only consulted when ``judge_1`` or ``judge_2``
            has to decide.

    Returns:
        * ``0`` for a one-token line and for the first token of any line,
        * ``4`` for the last token (and for any non-first token of a
          two-token line),
        * ``judge_1(word)`` for the second token (for any interior token of a
          three-token line),
        * ``judge_2(word)`` for the third token (for any remaining token of a
          four-token line),
        * ``3`` for every other token of a longer line,
        * ``None`` when ``length`` matches none of the handled cases.
    """
    if length == 1:
        return 0
    if length == 2:
        return 0 if position == 0 else 4
    if length == 3 or length == 4 or length > 4:
        if position == 0:
            return 0
        if position == length - 1:
            return 4
        if length == 3 or position == 1:
            return judge_1(word)
        if length == 4 or position == 2:
            return judge_2(word)
        return 3
    return None
def array_generate(length, position, word):
    """Return the one-hot list of length 5 for ``class_select(...)``.

    The element at the index returned by
    ``class_select(length, position, word)`` is 1, all others are 0.
    """
    label = class_select(length, position, word)
    one_hot = [0, 0, 0, 0, 0]
    one_hot[label] = 1
    return one_hot
######################
# Module-level training state shared by train(), save_model() and
# NN_classifier().
net = classifier()
optimizer = t.optim.Adam(net.parameters(), lr=0.001)
loss_function = nn.MSELoss()


def acc(y1, y2):
    """Return ``sqrt(sum(y1**2 + y2**2) / len(y1))`` for two arrays.

    NOTE(review): the expression adds the squares of both arguments rather
    than squaring their difference, so it is not an RMS error -- confirm
    that this is the intended metric.
    """
    return np.sqrt(np.sum(y1**2+y2**2)/len(y1))
def NN_classifier(word2vec_path='models/w2v/oil.model', NN_path='models/NN/oil.pth', iters=100):
    """Train the module-level ``net`` on word vectors and save a checkpoint.

    Lines are read from the module-level file object ``f``.  Each line is
    passed through ``sen_normal``, cut down to the text after the last U+FEFF
    and before the first newline, and split on single spaces.  Every
    non-empty token that the word2vec model knows becomes one sample: its
    vector is the input and ``array_generate(length, position, token)`` is
    the target, where ``length`` and ``position`` still count empty tokens.

    Args:
        word2vec_path: path of the saved gensim Word2Vec model to load.
        NN_path: path the final checkpoint is written to via ``save_model``.
        iters: number of full passes over the data loader.

    Side effects:
        Moves ``net`` to the GPU when CUDA is available (otherwise it stays on
        the CPU), prints one progress line per batch and writes ``NN_path``.
        The ``epoch`` value printed and stored counts batches processed.
    """
    model_w2v = load_wordVectors(word2vec_path)
    # Look words up through ``.wv``: indexing the Word2Vec object itself is
    # deprecated in gensim 3.x and removed in gensim 4.
    vectors = model_w2v.wv

    x_data, y_data = [], []
    for line in f:
        line = sen_normal(line)
        line = line.split('\ufeff')[-1].split('\n')[0]
        tokens = line.split(' ')
        length = len(tokens)
        for i, token in enumerate(tokens):
            if token == '':
                continue
            try:
                vector = vectors[token]
            except KeyError:
                # Token is not in the word2vec vocabulary: skip it.
                continue
            x_data.append(vector)
            y_data.append(array_generate(length, i, token))

    # Stack into one float32 array first; building a tensor straight from a
    # list of ndarrays is much slower.
    x = t.tensor(np.asarray(x_data, dtype=np.float32))
    y = t.tensor(y_data).float()
    dataset = t.utils.data.TensorDataset(x, y)
    dataloader = t.utils.data.DataLoader(dataset, batch_size=100, shuffle=True, num_workers=0, drop_last=True)

    # Fall back to the CPU instead of crashing on hosts without CUDA.
    device = t.device('cuda' if t.cuda.is_available() else 'cpu')
    net.to(device)

    epoch = 0
    for _ in range(iters):
        for inputs, label in dataloader:
            loss, prediction = train(inputs.to(device), label.to(device))
            epoch += 1
            aucc = acc(prediction.data.cpu().numpy(), label.data.numpy())
            print("loss={} aucc={} epoch={}".format(loss.data.cpu().numpy(), aucc, epoch))
    save_model(epoch, NN_path)
def w2v_train(segment_dir = './data/segment', word2vec_path = './models/w2v/oil.model'):
    """Train word2vec on the files in ``segment_dir`` and save the model.

    The corpus is read with ``word2vec.PathLineSentences`` and trained through
    ``train_wordVectors`` with ``embedding_size=128``, ``window=3`` and
    ``min_count=1``.  The model is saved to ``word2vec_path`` and returned.
    """
    corpus = word2vec.PathLineSentences(segment_dir)
    trained = train_wordVectors(corpus, embedding_size=128, window=3, min_count=1)
    save_wordVectors(trained, word2vec_path)
    return trained
# Normalise the raw corpus line by line and append it to the segment file
# that word2vec is trained on.  ``with`` guarantees both handles are closed
# even if a line fails to process.
# NOTE(review): mode 'a+' appends, so running the script again adds the
# corpus to the segment file a second time -- confirm this is intended.
with open('oil.txt', 'r', encoding='utf-8', errors='ignore') as f, \
        open('./data/segment/oil.txt', 'a+', encoding='utf-8', errors='ignore') as g:
    for line in f:
        g.write(sen_normal(line))

w2v_train()

# NN_classifier() iterates over the module-level name ``f``, so the segment
# file has to be open under that name while it runs; it is closed afterwards.
with open('./data/segment/oil.txt', 'r', encoding='utf-8', errors='ignore') as f:
    NN_classifier()
\ No newline at end of file
This diff is collapsed.
torch==1.2.0+cu92
gensim==3.8.1
numpy==1.17.0
\ No newline at end of file
import torch.nn as nn
import gc
import torch as t
import numpy as np
from gensim.models import word2vec
class classifier(nn.Module):
    """Map a 128-dimensional input vector to 5 output scores.

    The network is a plain stack of three ``nn.Linear`` layers
    (128 -> 128 -> 64 -> 5) with no activation between them.  The layers are
    stored under ``word_class`` so the parameter names in a saved
    ``state_dict`` are ``word_class.0`` .. ``word_class.2``.

    ``init_weights`` is accepted by the constructor but never read.
    """

    def __init__(self, init_weights=True):
        super().__init__()
        layers = [
            nn.Linear(128, 128),
            nn.Linear(128, 64),
            nn.Linear(64, 5),
        ]
        self.word_class = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw output of the linear stack for ``x``."""
        return self.word_class(x)
# Module-level model state shared by train(), save_model(), load_model() and
# get_location().
net = classifier()
optimizer = t.optim.Adam(net.parameters(), lr=0.01)
loss_function = nn.MSELoss()


def acc(y1, y2):
    """Return ``sqrt(sum(y1**2 + y2**2) / len(y1))`` for two arrays.

    NOTE(review): the expression adds the squares of both arguments rather
    than squaring their difference, so it is not an RMS error -- confirm
    that this is the intended metric.
    """
    return np.sqrt(np.sum(y1**2+y2**2)/len(y1))
def sen_normal(str):
    """Normalise the parentheses of one line of text.

    Every opening or closing parenthesis (ASCII or full-width) is replaced by
    a single space, and every space that lies between an opening parenthesis
    and its paired closing parenthesis is removed, so the bracketed text ends
    up as one space-delimited token.

    Parentheses are paired by order of appearance (first opening with first
    closing, second with second, ...), not by nesting depth.

    Args:
        str: the text to normalise.  The name shadows the builtin and is kept
            only so existing callers are unaffected.

    Returns:
        The normalised text.  All other characters, including ``$``, are
        passed through unchanged.
    """
    chars = list(str)
    opening, closing = [], []
    for idx, ch in enumerate(chars):
        # Escapes are used for the full-width forms so they survive any
        # width normalisation applied to this source file.
        if ch in ('(', '\uff08'):
            chars[idx] = ' '
            opening.append(idx)
        elif ch in (')', '\uff09'):
            chars[idx] = ' '
            closing.append(idx)

    # Record positions to drop instead of overwriting them with a sentinel
    # character, so no character that is really in the input gets deleted.
    drop = set()
    for start, end in zip(opening, closing):
        # Every position strictly between the two parentheses is examined,
        # including the one immediately before the closing parenthesis.
        for idx in range(start + 1, end):
            if chars[idx] == ' ':
                drop.add(idx)

    return ''.join(ch for idx, ch in enumerate(chars) if idx not in drop)
def train(x, y):
    """Run one optimisation step of the module-level ``net`` on a batch.

    Args:
        x: input batch passed to ``net``.
        y: target batch compared against the output by ``loss_function``.

    Returns:
        ``(loss, aucc)``: the loss of this batch and the value of ``acc`` for
        the output that ``net`` produced before the parameters were updated.

    NOTE(review): ``.numpy()`` is called on the tensors directly, so ``x`` and
    ``y`` are presumably CPU tensors -- confirm against callers.
    """
    output = net(x)                        # forward pass
    batch_loss = loss_function(output, y)  # argument order: (prediction, target)
    optimizer.zero_grad()                  # discard gradients of the previous step
    batch_loss.backward()                  # back-propagate the error
    optimizer.step()                       # apply the update to net.parameters()
    score = acc(output.data.numpy(), y.data.numpy())
    return batch_loss, score
def save_model(epoch, dir):
    """Write a checkpoint of the module-level ``net`` and ``optimizer``.

    The checkpoint is a dict with the keys ``'net'``, ``'optimizer'`` and
    ``'epoch'``; it is serialised to ``dir`` with ``torch.save``.
    """
    checkpoint = dict(
        net=net.state_dict(),
        optimizer=optimizer.state_dict(),
        epoch=epoch,
    )
    t.save(checkpoint, dir)
def load_model(dir):
    """Restore the module-level ``net`` and ``optimizer`` from a checkpoint.

    Args:
        dir: path of a checkpoint holding the keys ``'net'``, ``'optimizer'``
            and ``'epoch'`` (the layout ``save_model`` writes).

    Returns:
        The stored epoch plus one, i.e. the epoch to resume from.
    """
    # Map every stored tensor to the CPU while deserialising.  Without this,
    # a checkpoint written from a CUDA model cannot be loaded on a machine
    # that has no CUDA device.  ``load_state_dict`` then copies the values
    # into the existing parameters wherever those live.
    checkpoint = t.load(dir, map_location='cpu')
    net.load_state_dict(checkpoint['net'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    return checkpoint['epoch'] + 1
def load_wordVectors(word2vec_path):
    """Load and return the gensim Word2Vec model stored at ``word2vec_path``."""
    return word2vec.Word2Vec.load(word2vec_path)
# Script entry: load the word2vec model and restore the classifier checkpoint.
# NOTE(review): get_location() and the sample calls at the end of the file read
# ``model_w2v``, which is only bound when this guard is true -- confirm how
# much of the file was meant to sit under the guard.
if __name__ =='__main__':
    word2vec_path = './models/w2v/oil.model'
    model_w2v = load_wordVectors(word2vec_path)
    load_model('./models/NN/oil.pth')
def get_location(word):
    """Return the index of the largest score ``net`` assigns to ``word``.

    The word's vector is looked up in the module-level ``model_w2v`` and fed
    through the module-level ``net``.  When several scores tie for the
    maximum, the lowest index is returned.

    Raises:
        KeyError: propagated from the vector lookup when ``word`` has no
            vector; ``name_sort`` relies on this to fall back to
            ``class_select``.
    """
    # Look the word up through ``.wv``: indexing the Word2Vec object itself
    # is deprecated in gensim 3.x and removed in gensim 4.
    vector = t.tensor(model_w2v.wv[word])
    # Inference only, so skip building an autograd graph.
    with t.no_grad():
        scores = net(vector).tolist()
    return max(range(len(scores)), key=scores.__getitem__)
def list_na(n):
    """Return a list holding ``n`` separate empty lists."""
    return [[] for _ in range(n)]
##############################
def countNum(string):
    """Count the characters of ``string`` in three categories.

    Returns:
        A three-element list ``[cjk, alpha, other]`` where

        * index 0 counts characters in the range U+4E00..U+9FA5,
        * index 1 counts the remaining characters for which ``isalpha()`` is
          true,
        * index 2 counts everything else (digits, punctuation, whitespace...).
    """
    cjk = alpha = other = 0
    for ch in string:
        if '\u4e00' <= ch <= '\u9fa5':
            cjk += 1
        elif ch.isalpha():
            alpha += 1
        else:
            other += 1
    return [cjk, alpha, other]
def judge_1(string):
    """Pick class 1, 2 or 3 for ``string`` from its ``countNum`` tallies.

    Returns:
        3 if the string has any character that is neither in U+4E00..U+9FA5
        nor alphabetic; otherwise 1 if it consists only of alphabetic
        characters outside that range; otherwise 2 (this includes the empty
        string).
    """
    cjk, alpha, other = countNum(string)
    if other != 0:
        return 3
    if cjk == 0 and alpha != 0:
        return 1
    return 2
def judge_2(string):
    """Pick class 2 or 3 for ``string`` from its ``countNum`` tallies.

    Returns:
        2 when the third ``countNum`` tally (characters that are neither in
        U+4E00..U+9FA5 nor alphabetic) is zero, otherwise 3.
    """
    other = countNum(string)[2]
    return 2 if other == 0 else 3
def class_select(length, position, word):
    """Choose a class index for the token at ``position`` in a line.

    Args:
        length: number of tokens in the line.
        position: index of the token within the line.
        word: the token text; only consulted when ``judge_1`` or ``judge_2``
            has to decide.

    Returns:
        * ``0`` for a one-token line and for the first token of any line,
        * ``4`` for the last token (and for any non-first token of a
          two-token line),
        * ``judge_1(word)`` for the second token (for any interior token of a
          three-token line),
        * ``judge_2(word)`` for the third token (for any remaining token of a
          four-token line),
        * ``3`` for every other token of a longer line,
        * ``None`` when ``length`` matches none of the handled cases.
    """
    if length == 1:
        return 0
    if length == 2:
        return 0 if position == 0 else 4
    if length == 3 or length == 4 or length > 4:
        if position == 0:
            return 0
        if position == length - 1:
            return 4
        if length == 3 or position == 1:
            return judge_1(word)
        if length == 4 or position == 2:
            return judge_2(word)
        return 3
    return None
def array_generate(length, position, word):
    """Return the one-hot list of length 5 for ``class_select(...)``.

    The element at the index returned by
    ``class_select(length, position, word)`` is 1, all others are 0.
    """
    label = class_select(length, position, word)
    one_hot = [0, 0, 0, 0, 0]
    one_hot[label] = 1
    return one_hot
######################
def name_sort(sentence):
    """Reorder the space-separated tokens of ``sentence`` by predicted class.

    The sentence is normalised with ``sen_normal`` and split on single
    spaces.  Each token is assigned one of five classes: ``get_location`` is
    tried first, and when it raises ``KeyError`` the rule-based
    ``class_select`` decides from the token's position instead.  Tokens are
    then emitted class by class (0 first, 4 last), keeping their original
    relative order inside a class, and joined with single spaces.

    NOTE(review): empty tokens produced by consecutive or trailing spaces are
    classified and emitted like any other token.
    """
    tokens = sen_normal(sentence).split(' ')
    total = len(tokens)
    buckets = list_na(5)
    for position, token in enumerate(tokens):
        try:
            bucket = get_location(token)
        except KeyError:
            bucket = class_select(total, position, token)
        buckets[bucket].append(token)
    return ' '.join(token for level in buckets for token in level)
# Sample run: print the raw network output for one fixed key looked up in
# model_w2v.
print(net(t.tensor(model_w2v['捆绑酱油或香油,赠品随机发放,赠完为止,'])))
# Alternative sample product name: 金龙鱼 食用油两件套(阳光葵花籽油3.618L+玉米油3.618L)
# Print the reordered form of a sample product name.
print(name_sort('非转基因 压榨一级黄金产地玉米胚芽油 900ML 福临门 '))
#print(get_location('HP'))
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment