首次上传

f6030967 · Zhouxingyu · f6030967 · f6030967 · f6030967 · f6030967
Commit f6030967 authored Sep 29, 2019 by Zhouxingyu
10 changed files
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
+{
+    "python.pythonPath": "D:\\Python36\\python.exe"
+}
\ No newline at end of file
--- a/__pycache__/moudle.cpython-35.pyc
+++ b/__pycache__/moudle.cpython-35.pyc
--- a/__pycache__/moudle.cpython-36.pyc
+++ b/__pycache__/moudle.cpython-36.pyc
--- a/data/segment/oil.txt
+++ b/data/segment/oil.txt
--- a/data_train.py
+++ b/data_train.py
+import torch.nn as nn
+import gc
+import torch as t
+import numpy as np
+from gensim.models import word2vec
+import multiprocessing
+
+
+class classifier(nn.Module):
+    """Stack of three linear layers mapping 128 inputs to 5 outputs.
+
+    The layers (128 -> 128 -> 64 -> 5) are chained without any activation
+    between them, and ``forward`` returns the raw output of the last layer.
+    """
+
+    def __init__(self, init_weights=True):
+        # ``init_weights`` is accepted but never read by this class.
+        super().__init__()
+        layers = [
+            nn.Linear(128, 128),
+            nn.Linear(128, 64),
+            nn.Linear(64, 5),
+        ]
+        self.word_class = nn.Sequential(*layers)
+
+    def forward(self, x):
+        """Pass ``x`` through the linear stack and return the result."""
+        return self.word_class(x)
+# Disabled copy of the module-level model/optimizer/loss/metric setup:
+# it is wrapped in a bare string literal, so none of it is executed.
+'''
+net = classifier()
+optimizer = t.optim.Adam(net.parameters(), lr=0.001)
+loss_function = nn.MSELoss()
+acc = lambda y1,y2: np.sqrt(np.sum(y1**2+y2**2)/len(y1))
+'''
+def sen_normal(str):
+    """Replace brackets with spaces and squeeze spaces found inside them.
+
+    Every ``（``, ``(``, ``）`` and ``)`` becomes a single space. Opening and
+    closing brackets are then paired in order of appearance (first opener
+    with first closer, and so on; nesting is not tracked) and spaces lying
+    between a pair are removed.
+
+    Spaces are removed by index rather than through a placeholder character,
+    so a literal ``$`` in the input is kept.
+
+    Args:
+        str: the text to normalise (the name shadows the builtin but is kept
+            for callers that pass it by keyword).
+
+    Returns:
+        The normalised text.
+    """
+    chars = list(str)
+    openers = []
+    closers = []
+    for idx, ch in enumerate(chars):
+        if ch in ('（', '('):
+            chars[idx] = ' '
+            openers.append(idx)
+        elif ch in ('）', ')'):
+            chars[idx] = ' '
+            closers.append(idx)
+
+    drop = set()
+    for start, end in zip(openers, closers):
+        # The scan stops at ``end - 2``, so a space directly in front of the
+        # closing bracket survives. Kept exactly as before.
+        # NOTE(review): possibly an off-by-one; confirm the intended bound.
+        for idx in range(start + 1, end - 1):
+            if chars[idx] == ' ':
+                drop.add(idx)
+
+    return ''.join(ch for idx, ch in enumerate(chars) if idx not in drop)
+
+def save_model(epoch, dir):
+    """Save the module-level ``net`` and ``optimizer`` state to ``dir``.
+
+    The checkpoint is a dict with the keys ``'net'``, ``'optimizer'`` and
+    ``'epoch'``, written with ``t.save``.
+    """
+    checkpoint = {
+        'net': net.state_dict(),
+        'optimizer': optimizer.state_dict(),
+        'epoch': epoch,
+    }
+    t.save(checkpoint, dir)
+
+def load_wordVectors(word2vec_path):
+    """Load and return the Word2Vec model stored at ``word2vec_path``."""
+    return word2vec.Word2Vec.load(word2vec_path)
+
+def train_wordVectors(sentences, embedding_size = 128, window = 5, min_count = 5):
+    """Train and return a Word2Vec model on ``sentences``.
+
+    ``embedding_size``, ``window`` and ``min_count`` are forwarded as the
+    ``size``, ``window`` and ``min_count`` arguments; one worker per CPU
+    reported by ``multiprocessing.cpu_count()`` is used, with ``iter=10``
+    and ``hs=1``.
+    """
+    worker_count = multiprocessing.cpu_count()
+    return word2vec.Word2Vec(
+        sentences,
+        size=embedding_size,
+        window=window,
+        min_count=min_count,
+        workers=worker_count,
+        iter=10,
+        hs=1,
+    )
+
+def save_wordVectors(w2vModel,word2vec_path):
+    """Persist ``w2vModel`` to ``word2vec_path`` via its ``save`` method."""
+    target = word2vec_path
+    w2vModel.save(target)
+def train(x, y):
+    """Run one optimisation step of the module-level ``net`` on ``(x, y)``.
+
+    Returns:
+        ``(loss, prediction)``: the loss object and the network output for
+        this step.
+    """
+    output = net(x)  # forward pass
+    # Argument order matters: prediction first, target second.
+    step_loss = loss_function(output, y)
+    optimizer.zero_grad()  # clear gradients left from the previous step
+    step_loss.backward()  # back-propagate to compute fresh gradients
+    optimizer.step()  # apply the update to net.parameters()
+    return step_loss, output
+
+#g = open('data_Processed.txt', 'a+', encoding='utf-8', errors='ignore')
+
+##############################
+
+def countNum(string):
+    """Count the characters of ``string`` in three buckets.
+
+    Returns:
+        ``[cjk, alpha, other]`` where ``cjk`` counts characters in the range
+        U+4E00..U+9FA5, ``alpha`` counts remaining characters for which
+        ``isalpha()`` is true, and ``other`` counts everything else
+        (digits, punctuation, spaces, ...).
+    """
+    # NOTE(review): slot 0 is the CJK count and slot 2 the "everything else"
+    # count; judge_1/judge_2 index into this list, so keep the order.
+    cjk = alpha = other = 0
+    for ch in string:
+        if u'\u4e00' <= ch <= u'\u9fa5':
+            cjk += 1
+        elif ch.isalpha():
+            alpha += 1
+        else:
+            other += 1
+    return [cjk, alpha, other]
+
+def judge_1(string):
+    """Classify ``string`` as 1, 2 or 3 from its ``countNum`` buckets.
+
+    Returns 3 when the "other" bucket is non-empty, 1 when the string holds
+    only non-CJK alphabetic characters, and 2 in every remaining case.
+    """
+    cjk, alpha, other = countNum(string)
+    if other != 0:
+        return 3
+    if cjk == 0 and alpha != 0:
+        return 1
+    return 2
+
+def judge_2(string):
+    """Return 2 when ``countNum``'s "other" bucket is empty, otherwise 3."""
+    other = countNum(string)[2]
+    return 2 if other == 0 else 3
+
+def class_select(length, position, word):
+    """Pick a class index (0-4) for the token at ``position``.
+
+    ``length`` is the number of tokens in the line and ``position`` the
+    index of ``word`` in it. The first token is class 0 and the last class
+    4; the second and third tokens are decided by ``judge_1`` and
+    ``judge_2``; anything further in is class 3. Shorter lines use as many
+    of these rules as they have slots for.
+
+    Returns:
+        The class index, or ``None`` when ``length`` is below 1.
+    """
+    if length < 1:
+        return None
+    if length == 1 or position == 0:
+        return 0
+    if length == 2 or position == length - 1:
+        return 4
+    if length == 3 or position == 1:
+        return judge_1(word)
+    if length == 4 or position == 2:
+        return judge_2(word)
+    return 3
+
+def array_generate(length, position, word):
+    """Return a 5-element one-hot list for the class of ``word``.
+
+    The hot index is whatever ``class_select(length, position, word)``
+    returns.
+    """
+    one_hot = [0, 0, 0, 0, 0]
+    one_hot[class_select(length, position, word)] = 1
+    return one_hot
+######################
+
+
+# Module-level training state read by train() and save_model().
+net = classifier()
+# Adam over all classifier parameters, learning rate 0.001.
+optimizer = t.optim.Adam(net.parameters(), lr=0.001)
+# Mean-squared error between the network output and the target.
+loss_function = nn.MSELoss()
+# Computes sqrt(sum(y1**2 + y2**2) / len(y1)).
+# NOTE(review): this adds the squares of both arrays instead of squaring
+# their difference, so it does not measure the distance between y1 and y2;
+# presumably an RMSE over (y1 - y2) was intended - confirm.
+acc = lambda y1,y2: np.sqrt(np.sum(y1**2+y2**2)/len(y1))
+
+
+def NN_classifier(word2vec_path='models/w2v/oil.model', NN_path='models/NN/oil.pth', iters=100):
+    """Train the module-level ``net`` on word vectors and save a checkpoint.
+
+    Lines are read from the module-level file object ``f``. Each line is
+    passed through ``sen_normal``, stripped of a leading BOM and its
+    newline, and split on single spaces. Every non-empty token found in the
+    word2vec vocabulary contributes one sample: its vector as input and
+    ``array_generate(length, index, token)`` as target. Tokens missing from
+    the vocabulary are skipped.
+
+    Training runs on CUDA when available and on the CPU otherwise, in
+    shuffled batches of 100 with the final partial batch dropped. After each
+    pass the loss and ``acc`` value of the last batch are printed.
+
+    Args:
+        word2vec_path: path of the trained Word2Vec model to load.
+        NN_path: path the checkpoint is written to by ``save_model``.
+        iters: number of passes over the data.
+
+    Raises:
+        ValueError: if fewer samples than one full batch were collected, in
+            which case no training step could run.
+    """
+    batch_size = 100
+    model_w2v = load_wordVectors(word2vec_path)
+    x_data, y_data = [], []
+
+    for line in f:
+        line = sen_normal(line)
+        tokens = line.split('\ufeff')[-1].split('\n')[0].split(' ')
+        length = len(tokens)
+        for i, token in enumerate(tokens):
+            if token == '':
+                continue
+            try:
+                # ``.wv`` is the supported lookup; indexing the model
+                # itself is deprecated in gensim 3.x.
+                vector = model_w2v.wv[token]
+            except KeyError:
+                # Token is not in the word2vec vocabulary.
+                continue
+            x_data.append(vector)
+            y_data.append(array_generate(length, i, token))
+
+    if len(x_data) < batch_size:
+        # With drop_last=True the loader would yield nothing and the
+        # per-epoch report below would hit undefined names.
+        raise ValueError(
+            'need at least {} training samples, got {}'.format(batch_size, len(x_data))
+        )
+
+    x = t.tensor(np.asarray(x_data, dtype=np.float32))
+    y = t.tensor(y_data).float()
+    dataset = t.utils.data.TensorDataset(x, y)
+    dataloader = t.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0, drop_last=True)
+
+    device = t.device('cuda' if t.cuda.is_available() else 'cpu')
+    net.to(device)
+    epoch = 0
+    for _ in range(iters):
+        for batch_x, label in dataloader:
+            loss, prediction = train(batch_x.to(device), label.to(device))
+        epoch += 1
+        # Metrics are reported for the last batch of the pass only.
+        aucc = acc(prediction.data.cpu().numpy(), label.data.numpy())
+        print("loss={} aucc={} epoch={}".format(loss.data.cpu().numpy(), aucc,epoch))
+    save_model(epoch, NN_path)
+
+def w2v_train(segment_dir = './data/segment', word2vec_path = './models/w2v/oil.model'):
+    """Train Word2Vec on the files in ``segment_dir`` and save the model.
+
+    Sentences come from ``word2vec.PathLineSentences(segment_dir)``; the
+    model is trained with 128 dimensions, window 3 and min_count 1, saved to
+    ``word2vec_path`` and returned.
+    """
+    corpus = word2vec.PathLineSentences(segment_dir)
+    trained = train_wordVectors(corpus, embedding_size=128, window=3, min_count=1)
+    save_wordVectors(trained, word2vec_path)
+    return trained
+
+
+# Normalise every line of the raw corpus and append it to the segment file.
+# NOTE(review): 'a+' appends, so re-running the script adds the corpus to
+# the segment file again; confirm this is wanted before switching to 'w'.
+with open('oil.txt', 'r', encoding='utf-8', errors='ignore') as f, \
+        open('./data/segment/oil.txt', 'a+', encoding='utf-8', errors='ignore') as g:
+    for line in f:
+        g.write(sen_normal(line))
+
+# NN_classifier() reads its training lines from the module-level name ``f``,
+# so the segment file has to stay open under that name while it runs. The
+# ``with`` block closes it afterwards, even if training raises.
+with open('./data/segment/oil.txt', 'r', encoding='utf-8', errors='ignore') as f:
+    w2v_train()
+    NN_classifier()
\ No newline at end of file
--- a/models/NN/oil.pth
+++ b/models/NN/oil.pth
--- a/models/w2v/oil.model
+++ b/models/w2v/oil.model
--- a/oil.txt
+++ b/oil.txt
--- a/requirements.txt
+++ b/requirements.txt
+torch==1.2.0+cu92
+gensim==3.8.1
+numpy==1.17.0
\ No newline at end of file
--- a/sentence_normalize.py
+++ b/sentence_normalize.py
+import torch.nn as nn
+import gc
+import torch as t
+import numpy as np
+from gensim.models import word2vec
+
+class classifier(nn.Module):
+    """Stack of three linear layers mapping 128 inputs to 5 outputs.
+
+    The layers (128 -> 128 -> 64 -> 5) are chained without any activation
+    between them, and ``forward`` returns the raw output of the last layer.
+    """
+
+    def __init__(self, init_weights=True):
+        # ``init_weights`` is accepted but never read by this class.
+        super().__init__()
+        layers = [
+            nn.Linear(128, 128),
+            nn.Linear(128, 64),
+            nn.Linear(64, 5),
+        ]
+        self.word_class = nn.Sequential(*layers)
+
+    def forward(self, x):
+        """Pass ``x`` through the linear stack and return the result."""
+        return self.word_class(x)
+
+# Module-level state used by train(), save_model() and load_model().
+net = classifier()
+# Adam with learning rate 0.01 here (data_train.py constructs it with 0.001).
+optimizer = t.optim.Adam(net.parameters(), lr=0.01)
+# Mean-squared error between the network output and the target.
+loss_function = nn.MSELoss()
+# Computes sqrt(sum(y1**2 + y2**2) / len(y1)).
+# NOTE(review): this adds the squares of both arrays instead of squaring
+# their difference; presumably an RMSE over (y1 - y2) was intended - confirm.
+acc = lambda y1,y2: np.sqrt(np.sum(y1**2+y2**2)/len(y1))
+
+def sen_normal(str):
+    """Replace brackets with spaces and squeeze spaces found inside them.
+
+    Every ``（``, ``(``, ``）`` and ``)`` becomes a single space. Opening and
+    closing brackets are then paired in order of appearance (first opener
+    with first closer, and so on; nesting is not tracked) and spaces lying
+    between a pair are removed.
+
+    Spaces are removed by index rather than through a placeholder character,
+    so a literal ``$`` in the input is kept.
+
+    Args:
+        str: the text to normalise (the name shadows the builtin but is kept
+            for callers that pass it by keyword).
+
+    Returns:
+        The normalised text.
+    """
+    chars = list(str)
+    openers = []
+    closers = []
+    for idx, ch in enumerate(chars):
+        if ch in ('（', '('):
+            chars[idx] = ' '
+            openers.append(idx)
+        elif ch in ('）', ')'):
+            chars[idx] = ' '
+            closers.append(idx)
+
+    drop = set()
+    for start, end in zip(openers, closers):
+        # The scan stops at ``end - 2``, so a space directly in front of the
+        # closing bracket survives. Kept exactly as before.
+        # NOTE(review): possibly an off-by-one; confirm the intended bound.
+        for idx in range(start + 1, end - 1):
+            if chars[idx] == ' ':
+                drop.add(idx)
+
+    return ''.join(ch for idx, ch in enumerate(chars) if idx not in drop)
+
+def train(x, y):
+    """Run one optimisation step of the module-level ``net`` on ``(x, y)``.
+
+    Returns:
+        ``(loss, aucc)``: the loss object and the value of ``acc`` computed
+        from the prediction and ``y`` as NumPy arrays.
+    """
+    output = net(x)  # forward pass
+    # Argument order matters: prediction first, target second.
+    step_loss = loss_function(output, y)
+    optimizer.zero_grad()  # clear gradients left from the previous step
+    step_loss.backward()  # back-propagate to compute fresh gradients
+    optimizer.step()  # apply the update to net.parameters()
+    metric = acc(output.data.numpy(), y.data.numpy())
+    return step_loss, metric
+
+def save_model(epoch, dir):
+    """Save the module-level ``net`` and ``optimizer`` state to ``dir``.
+
+    The checkpoint is a dict with the keys ``'net'``, ``'optimizer'`` and
+    ``'epoch'``, written with ``t.save``.
+    """
+    checkpoint = {
+        'net': net.state_dict(),
+        'optimizer': optimizer.state_dict(),
+        'epoch': epoch,
+    }
+    t.save(checkpoint, dir)
+
+
+def load_model(dir):
+    """Restore the module-level ``net`` and ``optimizer`` from a checkpoint.
+
+    The checkpoint must hold the keys ``'net'``, ``'optimizer'`` and
+    ``'epoch'``, as written by ``save_model``.
+
+    Args:
+        dir: path of the checkpoint file.
+
+    Returns:
+        The epoch to resume from, i.e. the stored epoch plus one.
+    """
+    # data_train.py moves the net to CUDA before saving, so the stored
+    # tensors are CUDA tensors. Remapping them onto the device ``net``
+    # currently lives on lets the checkpoint load on CPU-only machines too.
+    # NOTE: ``t.load`` unpickles the file; only load trusted checkpoints.
+    device = next(net.parameters()).device
+    checkpoint = t.load(dir, map_location=device)
+    net.load_state_dict(checkpoint['net'])
+    optimizer.load_state_dict(checkpoint['optimizer'])
+    return checkpoint['epoch'] + 1
+
+def load_wordVectors(word2vec_path):
+    """Load and return the Word2Vec model stored at ``word2vec_path``."""
+    return word2vec.Word2Vec.load(word2vec_path)
+
+if __name__ =='__main__':
+    # Load the trained word2vec model used by the helpers defined below.
+    word2vec_path = './models/w2v/oil.model'
+    model_w2v = load_wordVectors(word2vec_path)
+    # Restore the classifier weights; the returned epoch number is unused.
+    load_model('./models/NN/oil.pth')
+
+
+    def get_location(word):
+        """Return the position class the net assigns to ``word``.
+
+        The word2vec vector of ``word`` is fed to ``net`` and the index of
+        the largest output is returned; on ties the lowest index wins.
+        """
+        scores = net(t.tensor(model_w2v[word])).tolist()
+        return max(range(len(scores)), key=scores.__getitem__)
+
+
+    def list_na(n):
+        """Return a list holding ``n`` separate empty lists."""
+        return [[] for _ in range(n)]
+
+##############################
+
+    def countNum(string):
+        """Count the characters of ``string`` in three buckets.
+
+        Returns:
+            ``[cjk, alpha, other]`` where ``cjk`` counts characters in the
+            range U+4E00..U+9FA5, ``alpha`` counts remaining characters for
+            which ``isalpha()`` is true, and ``other`` counts everything
+            else (digits, punctuation, spaces, ...).
+        """
+        # NOTE(review): slot 0 is the CJK count and slot 2 the "everything
+        # else" count; judge_1/judge_2 index into this list.
+        cjk = alpha = other = 0
+        for ch in string:
+            if u'\u4e00' <= ch <= u'\u9fa5':
+                cjk += 1
+            elif ch.isalpha():
+                alpha += 1
+            else:
+                other += 1
+        return [cjk, alpha, other]
+
+    def judge_1(string):
+        """Classify ``string`` as 1, 2 or 3 from its ``countNum`` buckets.
+
+        Returns 3 when the "other" bucket is non-empty, 1 when the string
+        holds only non-CJK alphabetic characters, and 2 otherwise.
+        """
+        cjk, alpha, other = countNum(string)
+        if other != 0:
+            return 3
+        if cjk == 0 and alpha != 0:
+            return 1
+        return 2
+
+    def judge_2(string):
+        """Return 2 when ``countNum``'s "other" bucket is empty, else 3."""
+        other = countNum(string)[2]
+        return 2 if other == 0 else 3
+
+    def class_select(length, position, word):
+        """Pick a class index (0-4) for the token at ``position``.
+
+        ``length`` is the number of tokens in the sentence and ``position``
+        the index of ``word`` in it. The first token is class 0 and the last
+        class 4; the second and third tokens are decided by ``judge_1`` and
+        ``judge_2``; anything further in is class 3. Shorter sentences use
+        as many of these rules as they have slots for.
+
+        Returns:
+            The class index, or ``None`` when ``length`` is below 1.
+        """
+        if length < 1:
+            return None
+        if length == 1 or position == 0:
+            return 0
+        if length == 2 or position == length - 1:
+            return 4
+        if length == 3 or position == 1:
+            return judge_1(word)
+        if length == 4 or position == 2:
+            return judge_2(word)
+        return 3
+
+    def array_generate(length, position, word):
+        """Return a 5-element one-hot list for the class of ``word``.
+
+        The hot index is whatever ``class_select(length, position, word)``
+        returns.
+        """
+        one_hot = [0, 0, 0, 0, 0]
+        one_hot[class_select(length, position, word)] = 1
+        return one_hot
+######################
+    
+    def name_sort(sentence):
+        """Reorder the tokens of ``sentence`` by predicted position class.
+
+        The sentence is normalised with ``sen_normal`` and split on single
+        spaces. Each token is assigned a class with ``get_location``; when
+        that raises ``KeyError`` the rule-based ``class_select`` is used
+        instead. Tokens are emitted class by class (0 to 4), keeping their
+        original relative order inside a class, joined by single spaces.
+        """
+        tokens = sen_normal(sentence).split(' ')
+        buckets = list_na(5)
+        for idx, token in enumerate(tokens):
+            try:
+                bucket = get_location(token)
+            except KeyError:
+                bucket = class_select(len(tokens), idx, token)
+            buckets[bucket].append(token)
+        return ' '.join(token for level in buckets for token in level)
+    # Print the raw network output for one phrase looked up in the word2vec
+    # model (presumably a token present in its vocabulary - confirm).
+    print(net(t.tensor(model_w2v['捆绑酱油或香油，赠品随机发放，赠完为止，'])))
+    # Another sample product title: 金龙鱼 食用油两件套（阳光葵花籽油3.618L+玉米油3.618L）
+    # Print the reordered form of a sample product title.
+    print(name_sort('非转基因 压榨一级黄金产地玉米胚芽油 900ML 福临门 '))
+    #print(get_location('HP'))
\ No newline at end of file