Commit e7193e6e authored by Zhouxingyu's avatar Zhouxingyu

添加了pdf中的下路的型号比对

parent 4066f082
{
"python.pythonPath": "E:\\Python36\\python.exe"
"python.pythonPath": "D:\\Python36\\python.exe"
}
\ No newline at end of file
import jieba
import re
import xlrd
import pandas as pd
import numpy as np
class brandd:
    """Collect candidate brand-name tokens from an Excel workbook.

    The first sheet of the workbook is loaded, its first row is used as the
    header, and every cell of the ``BAND_NAME`` column is segmented with
    jieba. Tokens containing an ASCII letter or a Chinese character are kept.
    """

    def __init__(self, addr):
        # Path of the workbook that ``runing`` will read.
        self.addr = addr

    def contain_let(self, string):
        """Return True if ``string`` contains at least one ASCII letter."""
        # ``[A-z]`` would also match the six punctuation characters that sit
        # between 'Z' and 'a' in ASCII, so the two ranges are spelled out.
        return bool(re.search('[A-Za-z]', string))

    def contain_chin(self, string):
        """Return True if ``string`` contains at least one Chinese character."""
        return bool(re.search(u'[\u4e00-\u9fa5]', string))

    def read_in_1(self, addr_in):
        """Read the workbook at ``addr_in`` and return its brand-name tokens.

        Args:
            addr_in: path of a workbook readable by ``xlrd``.

        Returns:
            A list of distinct tokens (order is not defined) taken from the
            ``BAND_NAME`` column of the first sheet.
        """
        table = xlrd.open_workbook(addr_in).sheets()[0]
        matrix_text = pd.DataFrame(
            {i: table.col_values(i) for i in range(table.ncols)}
        )
        # Promote the first row to column labels, then drop it from the data.
        matrix_text.rename(columns=matrix_text.iloc[0, :], inplace=True)
        matrix_text.drop([0], axis=0, inplace=True)

        brand_tokens = set()
        for cell in matrix_text[['BAND_NAME']].iloc[:, 0]:
            # Non-text cells are stringified first so that jieba, which only
            # accepts strings, does not fail on them.
            for token in jieba.cut(str(cell)):
                if self.contain_let(token) or self.contain_chin(token):
                    brand_tokens.add(token)
        return list(brand_tokens)

    def runing(self):
        """Return the brand-name tokens of the workbook given at construction."""
        return self.read_in_1(self.addr)
......@@ -3,6 +3,8 @@ import pandas as pd
sqlserver = sql_find()
similar = similar()
def logic_function(table, price_name='jd_price'):
sku_lyst = []
price_lyst = []
......@@ -12,7 +14,7 @@ def logic_function(table, price_name='jd_price'):
if productcode != 'NULL':
sku_label, symbol = left(productcode, sku, product_price) #symbol为0为没有修改的,为1为修改了的,为6为放入新品库的。
else:
sku_label, symbol = right(sku, name, product_price)
sku_label, symbol = right(sku, str(name), product_price)
sku_lyst.append(sku_label)
if symbol == 1:
price_lyst.append(m)
......@@ -53,9 +55,10 @@ def left(productcode, sku, product_price):
def right(sku, name, product_price):
if similar.istrue(name): #此处为similar类找到数据库中匹配的型号,返回True。
productcode = similar.productcode_get(name) #此处为提取与该型号所匹配产品的productcode。
left(productcode, sku)
productcode = similar.productcode_get() #此处为提取与该型号所匹配产品的productcode。
left(productcode, sku, product_price)
else:
return f'{sku}_放入新品建库'
......
import pymssql
import pandas as pd
import xlrd
from product_type_extract import *
from brand_name import brandd
import numpy as np
def data_load(path):
......@@ -23,21 +26,33 @@ def pre(matrix_text, addr):
matrix_text.to_csv(addr, sep=',', index=0, encoding='utf_8_sig', columns=matrix_text.columns)
class similar():
    """Match a product name against the table of known product models.

    The model table is read from '名称提取型号.csv' and the brand list from
    '1122京东平安0812.xlsx' when the object is created. The table is indexed
    once by model string so each query is a dictionary lookup rather than a
    scan of every row.
    """

    def __init__(self):
        self.type_table = pd.read_csv('名称提取型号.csv').reset_index(drop = True)
        self.model1 = brandd('1122京东平安0812.xlsx')
        self.brandname = self.model1.runing()
        # Product code of the most recent successful match; None until then.
        self.productcode = None
        # First column holds the product code, second the model string.
        # ``setdefault`` keeps the first row for a repeated model, which is
        # the row a top-to-bottom scan would have returned.
        # Rows added to ``type_table`` later are not reflected in this index.
        self._code_by_type = {}
        codes = self.type_table.iloc[:, 0].values
        types = self.type_table.iloc[:, 1].values
        for productcode, producttype in zip(codes, types):
            self._code_by_type.setdefault(producttype, productcode)

    def istrue(self, type_name):
        """Return True if the model extracted from ``type_name`` is in the table.

        On success the matching product code is stored and can be read with
        ``productcode_get``. On failure the stored code is left unchanged.
        """
        type_single = Extract(type_name, self.brandname).runing()
        # 'na' is what Extract.runing returns when no model was found.
        if type_single == 'na':
            return False
        try:
            self.productcode = self._code_by_type[type_single]
        except KeyError:
            return False
        return True

    def productcode_get(self):
        """Return the product code stored by the last successful ``istrue``."""
        return self.productcode
class sql_find():
def __init__ (self):
self.conn = pymssql.connect(host='123.56.115.207', user='zgcprice3311', password='admin@2018@)!*', database='Try', autocommit=True)
self.conn = pymssql.connect(host='123.56.115.207', user='zgcprice3311', password='admin@2018@)!*', database='ZI_DataBase', autocommit=True)
self.cursor = self.conn.cursor()
......@@ -93,25 +108,3 @@ class sql_find():
'''
self.cursor.execute(f"delete from {database_name} where sku ='{sku}'")
'''
class price_import():
def __init__(self, table):
self._table = table
lyst = list(self._table.columns)
for i in range(len(lyst)):
if lyst[i] == '库内已有产品':
self._pcode_col = i
if lyst[i] == 'NAME':
self._name_col = i
if lyst[i] == 'jd_price':
self._price_col = i
if lyst[i] == 'create_date':
self._time_col = i
if lyst[i] == 'sku_id':
self._sku_col = i
def data_extract(self, col):
'''
\ No newline at end of file
No preview for this file type
import jieba
import re
import xlrd
import pandas as pd
import numpy as np
class Extract:
    """Heuristically extract a product-model string from a product name.

    ``runing`` segments the name with jieba, keeps tokens that contain a digit
    and are 3 to 10 characters long, and then applies a cascade of selectors
    to choose one of them. It returns 'na' when no candidate is left.
    """

    def __init__(self, string_non, BrandName):
        # string_non: the raw product name to analyse.
        # BrandName: collection of known brand tokens, tested with ``in``.
        self.string_non = string_non
        self.BrandName = BrandName

    def contain_num(self, string):
        """Return True if ``string`` contains a digit."""
        return bool(re.search('[0-9]', string))

    def contain_let(self, string):
        """Return True if ``string`` contains an ASCII letter."""
        # ``[A-z]`` would also match the punctuation between 'Z' and 'a'.
        return bool(re.search('[A-Za-z]', string))

    def contain_chin(self, string):
        """Return True if ``string`` contains a Chinese character."""
        return bool(re.search(u'[\u4e00-\u9fa5]', string))

    def contain_sym(self, string):
        """Return True if ``string`` contains '+' or '-'."""
        return bool(re.search('[-+]', string))

    def select_num(self, string):
        """Return the first number (digits with optional decimals) in ``string``.

        Raises:
            IndexError: if ``string`` contains no digit.
        """
        num = re.findall(r"\d+\.?\d*", string)
        return num[0]

    def seek_type_maybe(self, str_single):
        """Tokenise ``str_single`` and return ``{position: token}``.

        Punctuation listed in ``sym`` is removed. A standalone '-' or '+'
        token is replaced by its two neighbours joined around it.
        """
        word_goal = " ".join(jieba.cut(str_single))
        sym = ('+', ';', ':', '(', '(', ')', ')', ",", ',', '【', '】')
        word_goal = ''.join(c for c in word_goal if c not in sym)  # strip punctuation
        vec = word_goal.split()
        last = len(vec) - 1
        type_maybe = {}
        for i, token in enumerate(vec):
            # Join only when both neighbours exist. A leading connector used
            # to wrap around to the last token and a trailing one raised
            # IndexError. ('+' is stripped above, so only '-' arrives here.)
            if token in ('-', '+') and 0 < i < last:
                type_maybe[i] = vec[i - 1] + token + vec[i + 1]
            else:
                type_maybe[i] = token
        return type_maybe

    def select_one(self, type_in):
        """Return the entries of dict ``type_in`` whose value contains a digit."""
        return {key: value for key, value in type_in.items()
                if self.contain_num(value)}

    def select_two(self, type_in):
        """Return the entries of dict ``type_in`` whose value has 3-10 characters."""
        return {key: value for key, value in type_in.items()
                if 3 <= len(value) <= 10}

    def select_three(self, type_in, type_st):
        """Return the first candidate that directly follows brand tokens.

        A candidate at position 1 needs the token at position 0 to be a brand.
        A candidate at position 2 or later needs both preceding tokens to be
        brands.

        Args:
            type_in: dict of candidate tokens keyed by position.
            type_st: dict of all tokens keyed by position.

        Returns:
            The matching candidate string, or an empty list if none matches.
        """
        for index in type_in:
            if index == 1:
                after_brand = type_st[index - 1] in self.BrandName
            elif index >= 2:
                after_brand = (type_st[index - 1] in self.BrandName
                               and type_st[index - 2] in self.BrandName)
            else:
                after_brand = False
            if after_brand:
                return type_in[index]
        return []

    def select_four(self, type_in, type_st):
        """Select tokens for candidates preceded by two '型号' tokens.

        The first candidate of ``type_in`` is never examined.

        Returns:
            A de-duplicated list of the selected tokens.
        """
        # NOTE(review): the appended value is ``type_st[i]`` (i = rank of the
        # candidate inside ``type_in``), not the candidate itself, and both
        # preceding tokens must equal '型号'. Kept as-is; confirm the intent.
        keys = list(type_in)
        type_product = []
        for i in range(1, len(keys)):
            index = keys[i]
            # ``index >= 2`` keeps the look-behind inside the token dict; a
            # candidate at position 1 used to look up key -1 (KeyError).
            if (index >= 2 and type_st[index - 1] == '型号'
                    and type_st[index - 2] == '型号'):
                type_product.append(type_st[i])
        return list(set(type_product))

    def select_five(self, type_mul):
        """Return a one-element list holding the first value with '+' or '-'.

        Returns an empty list when no value contains either symbol.
        """
        for value in type_mul.values():
            if self.contain_sym(value):
                return [value]
        return []

    def select_six(self, type_mul):
        """Return the candidate whose first number compares largest.

        Numbers are compared as strings. Fewer than two candidates yields an
        empty list.

        Raises:
            IndexError: if a candidate contains no digit.
        """
        # The original pairwise loop always ended on a pair with equal keys,
        # which falls through to this "largest number" choice, so the result
        # is computed directly.
        if len(type_mul) < 2:
            return []
        type_len = {key: self.select_num(value)
                    for key, value in type_mul.items()}
        index_max = max(type_len, key=type_len.get)
        return type_mul[index_max]

    def runing(self):
        """Run the extraction pipeline on ``self.string_non``.

        Returns:
            The selected model string, or 'na' if there is no candidate.
        """
        output1 = self.seek_type_maybe(self.string_non)  # tokenise the name
        output2 = self.select_one(output1)  # keep tokens containing a digit
        output3 = self.select_two(output2)  # keep tokens of length 3 to 10

        if len(output3) == 1:
            type_final = list(output3.values())[0]
        else:
            # Fallback when no selector yields exactly one result.
            type_final = output3
            # Selectors run lazily, in priority order.
            # NOTE(review): select_three and select_six return a bare string,
            # whose len() is its character count, so the ``== 1`` test below
            # cannot match for 3-10 character candidates. Left unchanged so
            # that extraction results stay stable; confirm before fixing.
            selectors = (
                lambda: self.select_three(output3, output1),  # after brand
                lambda: self.select_five(output3),            # has '+' / '-'
                lambda: self.select_four(output3, output1),   # after '型号'
                lambda: self.select_six(output3),             # by number
            )
            for select in selectors:
                picked = select()
                if len(picked) == 1:
                    type_final = picked
                    break

        if len(type_final) == 0:
            return 'na'
        if isinstance(type_final, str):
            return type_final
        if isinstance(type_final, list):
            return type_final[0]
        # Remaining case is a dict of candidates: choose the longest value,
        # taking the earliest one on ties.
        longest = max(type_final, key=lambda key: len(type_final[key]))
        return type_final[longest]
def read_in(addr_in):
    """Load the ``productname`` column from the first sheet of a workbook.

    Args:
        addr_in: path of a workbook readable by ``xlrd``.

    Returns:
        A list with one single-element NumPy array per data row. The first
        sheet row is used as the header and is not included.
    """
    workbook = xlrd.open_workbook(addr_in)
    first_sheet = workbook.sheets()[0]

    # Copy the sheet column by column into a frame labelled 0..ncols-1.
    frame = pd.DataFrame([])
    for col_idx in range(first_sheet.ncols):
        frame[col_idx] = first_sheet.col_values(col_idx)

    # The first row carries the column names: apply it, then remove it.
    header_row = frame.iloc[0, :]
    frame.rename(columns=header_row, inplace=True)
    frame.drop([0], axis=0, inplace=True)

    product_names = frame[['productname']]  # the non-standard names to match
    return list(np.array(product_names))
def out_data(data, addr):
    """Write ``data`` to a CSV file, preceded by a 'product_type' row.

    Args:
        data: iterable of values to write, one per row. It is not modified.
        addr: destination path of the CSV file.
    """
    columns = 'product_type'
    # Build a new list instead of inserting into ``data``; inserting into the
    # caller's list added another header row on every call.
    rows = [columns]
    rows.extend(data)
    pd.DataFrame(rows).to_csv(addr, index=0)
# 入库系统
### 介绍
本程序能够做到 **建库流程图.pdf** 中的两路在 "补齐重要参数" 这一步之前所有步骤。本程序由周星宇,高宇翔编写,由欧攀提供型号提取的支持。如果出现bug和使用疑问请联系周星宇,**QQ:757156922** 或者 **电话:18342204406**
### 目录
1. [环境搭建](#环境搭建)
2. [如何使用](#如何使用)
3. [文件描述](#文件描述)
### 环境搭建
1. 按照requirements.txt搭建环境。
2. **注意** moudules.py中的similar类中,由于需要查询的数据量较大,请修改更快速的查询算法。
### 如何使用
1. 在database.py中的main()函数第一行输入待入库的表,表应该按照表京东平安0812.xlsx的格式。
2. 运行database.py,等待一段时间,等待输出 "完成!" 。
3. 需要补齐重要参数的sku将存放在 补齐重要参数.txt。
4. 需要新品建库的sku将存放在 新品建库.txt。
5. 需要价格录入的数据将存放在 价格录入.csv中。
6. 打开 断点记录.txt 可实时查询进度。
### 文件描述
1. brand_name.py为提取品牌名函数。
2. product_type_extract.py为提取型号函数。
3. database.py为建库主程序。
4. moudules.py为建库的支持模块。
5. 1122京东平安0812.xlsx为提供产品**品牌依据**的表,**请不要修改名字**,如果要使用其中的数据,请克隆一份副本。
6. new.xlsx为京东平安0812中**有对应productcode**的表,可以按需要改成需要入库的表。
7. type_lyst.py为实验性函数,可以忽略。
8. 名称提取型号.csv为库中型号表,除非添加新的型号对应productcode关系,**否则不要改动**
\ No newline at end of file
pandas
xlrd
jieba
numpy
pymssql
\ No newline at end of file
from moudules import *
'''
table = pd.read_csv('名称提取型号_test.csv').reset_index(drop = True)
print(table.iloc[0][1])
'''
abc = '佑游泳镜泳帽男女防水防雾游泳耳塞鼻夹眼镜高清泳镜套装Z6615电镀粉色平光'
# The instance gets its own name: rebinding ``similar`` to an instance would
# shadow the class and make it impossible to create another matcher.
matcher = similar()
if matcher.istrue(abc):
    # A match was found; print the product code stored by istrue().
    print(matcher.productcode_get())
else:
    print('na')
\ No newline at end of file
This diff is collapsed.
productcode,product_type
506003750007,FC290
506003750008,FC298
202006840020,DCR-PD190P
\ No newline at end of file
This diff is collapsed.
import pymssql
import pandas as pd
def pre(matrix_text, addr):
    """Export a DataFrame to a comma-separated file.

    Args:
        matrix_text: the DataFrame to export; all of its columns are written
            and the index is omitted.
        addr: destination path. The file is encoded as UTF-8 with a BOM.
    """
    export_options = {
        'sep': ',',
        'index': 0,
        'encoding': 'utf_8_sig',
        'columns': matrix_text.columns,
    }
    matrix_text.to_csv(addr, **export_options)
# NOTE(review): the database credentials are hard-coded in source control;
# they should be rotated and loaded from configuration instead.
conn = pymssql.connect(host='123.56.115.207', user='zgcprice3311',password='admin@2018@)!*',database='ZI_DataBase')
try:
    cur = conn.cursor()
    # Fetch every product whose state is not '6'.
    sql_1 = "select productcode,productname,categoryid,subcategorycode from info_product where state <> '6' "
    cur.execute(sql_1)
    data = (cur.fetchall())
    # Column names come from the cursor description of the executed query.
    columns = [desc[0] for desc in cur.description]
finally:
    # Release the connection even if the query fails.
    conn.close()
data_1 = pd.DataFrame(data, columns=columns)
pre(data_1, '产品名称.csv')
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment