添加了pdf中的下路的型号比对

e7193e6e · Zhouxingyu · 4066f082 · e7193e6e · e7193e6e · e7193e6e
Commit e7193e6e authored Sep 02, 2019 by Zhouxingyu
18 changed files
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
 {
-    "python.pythonPath": "E:\\Python36\\python.exe"
+    "python.pythonPath": "D:\\Python36\\python.exe"
 }
\ No newline at end of file
--- a/1122京东平安0812.xlsx
+++ b/1122京东平安0812.xlsx
--- a/__pycache__/brand_name.cpython-36.pyc
+++ b/__pycache__/brand_name.cpython-36.pyc
--- a/__pycache__/moudules.cpython-36.pyc
+++ b/__pycache__/moudules.cpython-36.pyc
--- a/__pycache__/product_type_extract.cpython-36.pyc
+++ b/__pycache__/product_type_extract.cpython-36.pyc
--- a/brand_name.py
+++ b/brand_name.py
+import jieba
+import re
+import xlrd
+import pandas as pd
+import numpy as np
+class brandd:
+    """Collect brand-name tokens from the ``BAND_NAME`` column of a workbook.
+
+    The workbook path is stored at construction time; ``runing`` reads the
+    first sheet of that workbook and returns the de-duplicated tokens.
+    """
+
+    def __init__(self, addr):
+        # Path of the Excel workbook that ``runing`` reads.
+        self.addr = addr
+
+    def contain_let(self, string):
+        """Return True if ``string`` contains an ASCII letter."""
+        # The range 'A-z' also covers the six characters between 'Z' and 'a'
+        # ([ \ ] ^ _ `), so the two letter ranges are spelled out explicitly.
+        return bool(re.search('[A-Za-z]', string))
+
+    def contain_chin(self, string):
+        """Return True if ``string`` has a character in U+4E00..U+9FA5."""
+        return bool(re.search(u'[\u4e00-\u9fa5]', string))
+
+    def read_in_1(self, addr_in):
+        """Return the unique brand tokens found in an Excel workbook.
+
+        Args:
+            addr_in: path of the workbook. Only its first sheet is read; the
+                first row is treated as the header row.
+
+        Returns:
+            A list, without duplicates and in no particular order, of the
+            jieba tokens from the ``BAND_NAME`` column that contain an ASCII
+            letter or a Chinese character.
+
+        Raises:
+            KeyError: if the header row has no ``BAND_NAME`` column.
+        """
+        table = xlrd.open_workbook(addr_in).sheets()[0]
+        header = table.row_values(0)
+        # NOTE(review): 'BAND_NAME' looks like a misspelling of BRAND_NAME,
+        # but it has to match the sheet's actual header - confirm before
+        # renaming.
+        if 'BAND_NAME' not in header:
+            raise KeyError('BAND_NAME')
+        # Skip row 0 (the header) and read the remaining cells of the column.
+        cells = table.col_values(header.index('BAND_NAME'), 1)
+        brand_tokens = set()
+        for cell in cells:
+            # str() guards against non-text cells (xlrd returns numeric cells
+            # as floats), which jieba cannot segment.
+            for token in jieba.cut(str(cell)):
+                if self.contain_let(token) or self.contain_chin(token):
+                    brand_tokens.add(token)
+        return list(brand_tokens)
+
+    def runing(self):
+        """Return the brand tokens of the workbook given at construction."""
+        return self.read_in_1(self.addr)
--- a/database.py
+++ b/database.py
@@ -3,6 +3,8 @@ import pandas as pd
 sqlserver = sql_find()
+similar = similar()
 def logic_function(table, price_name='jd_price'):
    sku_lyst = []
    price_lyst = []
@@ -12,7 +14,7 @@ def logic_function(table, price_name='jd_price'):
            if productcode != 'NULL':
                sku_label, symbol = left(productcode, sku, product_price)          #symbol为0为没有修改的，为1为修改了的，为6为放入新品库的。
            else:
-                sku_label, symbol = right(sku, name, product_price)
+                sku_label, symbol = right(sku, str(name), product_price)
            sku_lyst.append(sku_label)
            if symbol == 1:
                price_lyst.append(m)
@@ -53,9 +55,10 @@ def left(productcode, sku, product_price):
 def right(sku, name, product_price):
    if similar.istrue(name):        #此处为similar类找到数据库中匹配的型号，返回True。
-        productcode = similar.productcode_get(name)          #此处为提取与该型号所匹配产品的productcode。
+        productcode = similar.productcode_get()          #此处为提取与该型号所匹配产品的productcode。
-        left(productcode, sku)
+        left(productcode, sku, product_price)
    else:
        return f'{sku}_放入新品建库'

--- a/moudules.py
+++ b/moudules.py
 import pymssql
 import pandas as pd
 import xlrd
+from product_type_extract import *
+from brand_name import brandd
+import numpy as np
 def data_load(path):
@@ -23,21 +26,33 @@ def pre(matrix_text, addr):
    matrix_text.to_csv(addr, sep=',', index=0, encoding='utf_8_sig', columns=matrix_text.columns)
 class similar():
+    '''
+    查询44w条型号速度太慢，应该对'名称提取型号.csv'的查询优化算法。
+    '''
    def __init__(self):
-        pass
+        self.type_table = pd.read_csv('名称提取型号.csv').reset_index(drop = True)
+        self.model1 = brandd('1122京东平安0812.xlsx')
-    def istrue(self):
+        self.brandname = self.model1.runing()
+    def istrue(self, type_name):
+        model = Extract(type_name, self.brandname)
+        type_single = model.runing()
+        if type_single != 'na': 
+            for i in range(len(self.type_table)):
+                productcode, producttype = self.type_table.iloc[i][0], self.type_table.iloc[i][1]
+                if type_single == producttype:
+                    self.productcode = productcode
+                    return True
        return False
    def productcode_get(self):
-        return 0
+        return self.productcode
 class sql_find():
    def __init__ (self):
-        self.conn = pymssql.connect(host='123.56.115.207', user='zgcprice3311', password='admin@2018@)!*', database='Try', autocommit=True)
+        self.conn = pymssql.connect(host='123.56.115.207', user='zgcprice3311', password='admin@2018@)!*', database='ZI_DataBase', autocommit=True)
        self.cursor = self.conn.cursor()
@@ -93,25 +108,3 @@ class sql_find():
        '''
        self.cursor.execute(f"delete from {database_name} where sku ='{sku}'")
-'''
-class price_import():
-    def __init__(self, table):
-        self._table = table
-        lyst = list(self._table.columns)
-        for i in range(len(lyst)):
-            if lyst[i] == '库内已有产品':
-                self._pcode_col = i
-            if lyst[i] == 'NAME':
-                self._name_col = i
-            if lyst[i] == 'jd_price':
-                self._price_col = i
-            if lyst[i] == 'create_date':
-                self._time_col = i
-            if lyst[i] == 'sku_id':
-                self._sku_col = i
-    def data_extract(self, col):
-'''
\ No newline at end of file
--- a/new.xlsx
+++ b/new.xlsx
--- a/product_type_extract.py
+++ b/product_type_extract.py
+import jieba
+import re
+import xlrd
+import pandas as pd
+import numpy as np
+class Extract:
+    """Heuristically pick a product model string out of a product name.
+
+    The name is segmented with jieba, tokens that contain a digit and are
+    3 to 10 characters long become candidates, and a fixed sequence of
+    selection rules narrows the candidates down to one model string.
+    """
+
+    # Punctuation removed from the segmented text before it is tokenised.
+    _PUNCTUATION = ('＋', '；', '：', '（', '(', ')', '）', "，", ',', '【', '】')
+
+    def __init__(self, string_non, BrandName):
+        # Raw product name to extract the model from.
+        self.string_non = string_non
+        # Collection of known brand tokens; only used for membership tests.
+        self.BrandName = BrandName
+
+    def contain_num(self, string):
+        """Return True if ``string`` contains an ASCII digit."""
+        return bool(re.search('[0-9]', string))
+
+    def contain_let(self, string):
+        """Return True if ``string`` contains an ASCII letter."""
+        # The range 'A-z' also covers [ \ ] ^ _ ` - spell out both ranges.
+        return bool(re.search('[A-Za-z]', string))
+
+    def contain_chin(self, string):
+        """Return True if ``string`` has a character in U+4E00..U+9FA5."""
+        return bool(re.search(u'[\u4e00-\u9fa5]', string))
+
+    def contain_sym(self, string):
+        """Return True if ``string`` contains '+' or '-'."""
+        return bool(re.search('[-+]', string))
+
+    def select_num(self, string):
+        """Return the first number in ``string``, or '' if it has none."""
+        numbers = re.findall(r"\d+\.?\d*", string)
+        return numbers[0] if numbers else ''
+
+    def seek_type_maybe(self, str_single):
+        """Tokenise a product name into position-keyed candidate strings.
+
+        Returns:
+            A dict mapping each token position to its token. A stand-alone
+            '-' or '+' token is replaced by its two neighbours joined with
+            that sign.
+        """
+        segmented = " ".join(jieba.cut(str_single))
+        cleaned = ''.join(c for c in segmented if c not in self._PUNCTUATION)
+        vec = cleaned.split()
+        last = len(vec) - 1
+        type_maybe = {}
+        for i, token in enumerate(vec):
+            # A sign at either end has no neighbour on one side; joining there
+            # would raise IndexError or wrap around to the other end.
+            if token in ('-', '+') and 0 < i < last:
+                type_maybe[i] = vec[i - 1] + token + vec[i + 1]
+            else:
+                type_maybe[i] = token
+        return type_maybe
+
+    def select_one(self, type_in):
+        """Keep the entries of dict ``type_in`` whose value has a digit."""
+        return {key: value for key, value in type_in.items()
+                if self.contain_num(value)}
+
+    def select_two(self, type_in):
+        """Keep the entries of dict ``type_in`` that are 3-10 chars long."""
+        return {key: value for key, value in type_in.items()
+                if 3 <= len(value) <= 10}
+
+    def select_three(self, type_in, type_st):
+        """Return the first candidate that directly follows a brand token.
+
+        Args:
+            type_in: candidate dict, keyed by token position.
+            type_st: dict of every token, keyed by token position.
+
+        Returns:
+            The candidate string, or an empty list when nothing qualifies.
+            A candidate at position 1 needs the token before it to be a
+            brand; one at position 2 or later needs both preceding tokens
+            to be brands.
+        """
+        for index, candidate in type_in.items():
+            if index == 1:
+                if type_st[index - 1] in self.BrandName:
+                    return candidate
+            elif index >= 2:
+                if (type_st[index - 1] in self.BrandName
+                        and type_st[index - 2] in self.BrandName):
+                    return candidate
+        return []
+
+    def select_four(self, type_in, type_st):
+        """Return the unique candidates that follow the token '型号'.
+
+        A candidate qualifies when one of the two tokens before it is
+        '型号', which allows one separator token in between.
+        """
+        # NOTE(review): the previous condition required BOTH preceding tokens
+        # to be '型号' and appended type_st[i] with i an ordinal rather than a
+        # token position - confirm the "either token" rule against real data.
+        type_product = []
+        for index, candidate in type_in.items():
+            # .get() yields None for positions before the first token.
+            if '型号' in (type_st.get(index - 1), type_st.get(index - 2)):
+                type_product.append(candidate)
+        return list(set(type_product))
+
+    def select_five(self, type_mul):
+        """Return a list with the first candidate containing '+' or '-'.
+
+        The list is empty when no candidate contains either sign.
+        """
+        for candidate in type_mul.values():
+            if self.contain_sym(candidate):
+                return [candidate]
+        return []
+
+    def select_six(self, type_mul):
+        """Return the candidate whose first number is numerically largest.
+
+        Returns an empty list when there are fewer than two candidates.
+        Ties go to the candidate that appears first.
+        """
+        if len(type_mul) < 2:
+            return []
+
+        def leading_number(key):
+            text = self.select_num(type_mul[key])
+            # Compare as numbers: as strings, '9' would rank above '128'.
+            return float(text) if text else -1.0
+
+        return type_mul[max(type_mul, key=leading_number)]
+
+    @staticmethod
+    def _single(result):
+        """Reduce a rule's result to one string, or None if inconclusive."""
+        if isinstance(result, str):
+            return result or None
+        if len(result) == 1:
+            return result[0]
+        return None
+
+    def runing(self):
+        """Return the extracted model string, or 'na' if there is none.
+
+        A lone candidate is returned directly. Otherwise the rules are tried
+        in order - after a brand, containing '+'/'-', after '型号', largest
+        number - and the first conclusive answer wins. The longest candidate
+        is the final fallback.
+        """
+        tokens = self.seek_type_maybe(self.string_non)
+        candidates = self.select_two(self.select_one(tokens))
+        if not candidates:
+            return 'na'
+        if len(candidates) == 1:
+            return next(iter(candidates.values()))
+        # Rules that return a plain string are accepted through _single();
+        # checking len(result) == 1 on a 3-10 character string never passes.
+        rules = (
+            lambda: self.select_three(candidates, tokens),
+            lambda: self.select_five(candidates),
+            lambda: self.select_four(candidates, tokens),
+            lambda: self.select_six(candidates),
+        )
+        for rule in rules:
+            picked = self._single(rule())
+            if picked is not None:
+                return picked
+        return max(candidates.values(), key=len)
+def read_in(addr_in):
+    """Read the ``productname`` column from the first sheet of a workbook.
+
+    Args:
+        addr_in: path of the Excel workbook; its first row is the header.
+
+    Returns:
+        A list with one single-element array per data row, each holding
+        that row's ``productname`` cell.
+    """
+    sheet = xlrd.open_workbook(addr_in).sheets()[0]
+    # One DataFrame column per sheet column, labelled by column index.
+    frame = pd.DataFrame(
+        {col: sheet.col_values(col) for col in range(sheet.ncols)}
+    )
+    # Promote the first row to column labels, then remove it from the data.
+    frame = frame.rename(columns=frame.iloc[0]).drop([0], axis=0)
+    product_names = frame[['productname']]
+    return list(np.array(product_names))
+def out_data(data, addr):
+    """Write ``data`` to a CSV file under a ``product_type`` label row.
+
+    Args:
+        data: sequence of values to export; it is left unmodified.
+        addr: destination path of the CSV file.
+    """
+    columns = 'product_type'
+    # Build a new list rather than inserting into ``data``, so the caller's
+    # list does not grow by one label on every call.
+    rows = [columns] + list(data)
+    # NOTE(review): to_csv also emits pandas' default header line "0" above
+    # the label row; kept as-is so existing consumers see the same file.
+    pd.DataFrame(rows).to_csv(addr, index=0)
--- a/readme.md
+++ b/readme.md
+# 入库系统
+### 介绍
+本程序能够做到 **建库流程图.pdf** 中的两路在 "补齐重要参数" 这一步之前所有步骤。本程序由周星宇，高宇翔编写，由欧攀提供型号提取的支持。如果出现bug和使用疑问请联系周星宇，**QQ:757156922** 或者 **电话:18342204406**。
+### 目录
+1. [环境搭建](#环境搭建)
+2. [如何使用](#如何使用)
+3. [文件描述](#文件描述)
+### 环境搭建
+1. 按照requirements.txt搭建环境。
+2. **注意** moudules.py中的similar类中，由于需要查询的数据量较大，请修改更快速的查询算法。
+### 如何使用
+1.	在database.py中的main()函数第一行输入待入库的表，表的格式应与 京东平安0812.xlsx 保持一致。
+2.	运行database.py，等待一段时间，等待输出 "完成！" 。
+3.	需要补齐重要参数的sku将存放在 补齐重要参数.txt。
+4.	需要新品建库的sku将存放在 新品建库.txt。
+5.	需要价格录入的数据将存放在 价格录入.csv中。
+6.	打开 断点记录.txt 可实时查询进度。
+### 文件描述
+1. brand_name.py为提取品牌名函数。
+2. product_type_extract.py为提取型号函数。
+3. database.py为建库主程序。
+4. moudules.py为建库的支持模块。
+5. 1122京东平安0812.xlsx为提供产品**品牌依据**的表，**请不要修改名字**，如果要使用其中的数据，请克隆一份副本。
+6. new.xlsx为京东平安0812中**有对应productcode**的表，可以按需要改成需要入库的表。
+7. type_lyst.py为实验性函数，可以忽略。
+8. 名称提取型号.csv为库中型号表，除非添加新的型号对应productcode关系，**否则不要改动**。
\ No newline at end of file
--- a/requirements.txt
+++ b/requirements.txt
+pandas
+xlrd
+jieba
+numpy
+pymssql
\ No newline at end of file
--- a/type_lyst.py
+++ b/type_lyst.py
+from moudules import *
+'''
+table = pd.read_csv('名称提取型号_test.csv').reset_index(drop = True)
+print(table.iloc[0][1])
+'''
+# Sample product title used to exercise the model lookup.
+abc = '佑游泳镜泳帽男女防水防雾游泳耳塞鼻夹眼镜高清泳镜套装Z6615电镀粉色平光'
+similar = similar()
+# Print the stored product code when a model matches, otherwise 'na'.
+print(similar.productcode_get() if similar.istrue(abc) else 'na')
\ No newline at end of file
--- a/名称提取型号.csv
+++ b/名称提取型号.csv
--- a/名称提取型号_test.csv
+++ b/名称提取型号_test.csv
+productcode,product_type
+506003750007,FC290
+506003750008,FC298
+202006840020,DCR-PD190P
\ No newline at end of file
--- a/处理结果/价格录入.csv
+++ b/处理结果/价格录入.csv
--- a/建库流程图.pdf
+++ b/建库流程图.pdf
--- a/数据库名称.py
+++ b/数据库名称.py
+import pymssql
+import pandas as pd
+def pre(matrix_text, addr):
+    """Export a DataFrame to a CSV file.
+
+    Args:
+        matrix_text: the DataFrame to export; all its columns are written.
+        addr: destination path of the CSV file.
+    """
+    export_options = {
+        'sep': ',',
+        'index': 0,  # do not write the row index
+        'encoding': 'utf_8_sig',  # UTF-8 with a leading byte-order mark
+        'columns': matrix_text.columns,
+    }
+    matrix_text.to_csv(addr, **export_options)
+# Export productcode/productname/categoryid/subcategorycode of every
+# info_product row whose state is not '6' to '产品名称.csv'.
+# NOTE(review): host, user and password are hard-coded in source; move them
+# to configuration or environment variables before sharing this file.
+conn = pymssql.connect(host='123.56.115.207', user='zgcprice3311',password='admin@2018@)!*',database='ZI_DataBase')
+try:
+    cur = conn.cursor()
+    sql_1 = "select productcode,productname,categoryid,subcategorycode from info_product where state <> '6' "
+    cur.execute(sql_1)
+    data = cur.fetchall()
+    # cursor.description holds one tuple per result column; item 0 is its name.
+    columns = [desc[0] for desc in cur.description]
+finally:
+    # Release the server connection even if the query fails.
+    conn.close()
+data_1 = pd.DataFrame(data, columns=columns)
+pre(data_1, '产品名称.csv')