Commit 62dacfbd authored by Jialin

代码更新

parent e9bce7fd
......@@ -8,10 +8,141 @@ import xlsxwriter
import numpy as np
import pymssql
def brand_washing_special(filepath, sheet_name, category):
    """Report brands that share product-series names and save it as Excel.

    Reads the given sheet, drops every row whose brand or series value appears
    in the "abnormal data name" list stored in ``data_washing_external``,
    normalises the series names (strip, remove ``'系列'``, upper-case) and
    writes ``./brand_filter.xlsx`` with three sheets:

    * ``Sheet1`` - one row per pair of brands that share at least one series;
      the third header column ``'正确品牌'`` is written as a header only.
    * ``Sheet2`` - the same pairs plus the list of shared series names.
    * ``Sheet3`` - every brand followed by its full list of series names.

    Args:
        filepath: Path of the Excel workbook to read.
        sheet_name: Sheet name or index forwarded to ``pandas.read_excel``.
        category: Product category. ``'复印纸'`` selects the
            ``'产品系列(SPU)'`` column and discards the ``'彩色复印纸'``
            series value; any other category selects ``'*产品系列'``.

    Returns:
        None. The result is the workbook written to ``./brand_filter.xlsx``.
    """
    df = pd.read_excel(filepath, sheet_name=sheet_name, converters={'产品编码': str})
    invalid_names = _special_fetch_invalid_names()

    # The two category families differ only in the series column that is read
    # and in the series values that must be ignored.
    if category == '复印纸':
        series_col, ignored_series = '产品系列(SPU)', ('彩色复印纸',)
    else:
        series_col, ignored_series = '*产品系列', ()

    brand_dict = _special_series_by_brand(df, series_col, invalid_names, ignored_series)
    related_brand, repeated_xilie = _special_shared_series(brand_dict)
    _special_write_report(brand_dict, related_brand, repeated_xilie)


def _special_fetch_invalid_names():
    """Return the '异常数据名称' values of category '通用' from the database."""
    # NOTE(review): host and credentials are hard-coded in source control;
    # they are kept unchanged here, but should be moved to configuration.
    conn = pymssql.connect(host='123.56.115.207', user='zgcprice3311', password='zgcprice20200628',
                           database='ZI_NEW', autocommit=True)
    try:
        cursor = conn.cursor()
        cursor.execute("select col_value from data_washing_external where category_name='通用' and col_name='异常数据名称'")
        return [row[0] for row in cursor.fetchall()]
    finally:
        # Always release the connection, even when the query fails.
        conn.close()


def _special_series_by_brand(df, series_col, invalid_names, ignored_series=()):
    """Map each brand to its unique, normalised series names."""
    brand_col = '产品品牌'
    invalid = set(invalid_names)

    # Keep a row only when neither its brand nor its series is a known
    # placeholder value.
    keep = pd.Series(
        [
            brand not in invalid and series not in invalid
            for brand, series in zip(df[brand_col], df[series_col])
        ],
        index=df.index,
        dtype=bool,
    )

    # Work on strings only so that the text clean-up below applies to every
    # remaining cell.
    result = df.loc[keep, [brand_col, series_col]].astype(str)
    result[series_col] = result[series_col].map(
        lambda name: name.strip().replace('系列', '').upper()
    )

    brand_dict = {}
    for brand, group in result.groupby(brand_col):
        brand_dict[brand] = [
            name
            for name in group[series_col].unique().tolist()
            if name not in ignored_series
        ]
    return brand_dict


def _special_shared_series(brand_dict):
    """Return the brand pairs sharing a series, and the shared series per pair."""
    brands = list(brand_dict)
    series_sets = {brand: set(names) for brand, names in brand_dict.items()}

    related_brand = []
    repeated_xilie = []
    # Visit every unordered pair exactly once, the earlier brand first.
    for position, brand_i in enumerate(brands):
        for brand_j in brands[position + 1:]:
            shared = [name for name in brand_dict[brand_i] if name in series_sets[brand_j]]
            if shared:
                related_brand.append([brand_i, brand_j])
                repeated_xilie.append(shared)
    return related_brand, repeated_xilie


def _special_write_report(brand_dict, related_brand, repeated_xilie):
    """Write the three result sheets to ./brand_filter.xlsx."""
    # The context manager closes (and thereby saves) the workbook on exit.
    with xlsxwriter.Workbook('./brand_filter.xlsx') as workbook:
        bold_format = workbook.add_format({'bold': True})

        # Sheet1: related brand pairs; the third column only gets a header.
        pairs_sheet = workbook.add_worksheet(name='Sheet1')
        pairs_sheet.write('A1', '品牌A', bold_format)
        pairs_sheet.write('B1', '品牌B', bold_format)
        pairs_sheet.write('C1', '正确品牌', bold_format)
        for row, brand_pair in enumerate(related_brand, start=1):
            for col, brand in enumerate(brand_pair):
                pairs_sheet.write_string(row, col, brand)

        # Sheet2: related brand pairs together with their shared series.
        shared_sheet = workbook.add_worksheet(name='Sheet2')
        shared_sheet.write('A1', '品牌A', bold_format)
        shared_sheet.write('B1', '品牌B', bold_format)
        shared_sheet.write('C1', '重复系列', bold_format)
        for row, (brand_pair, shared) in enumerate(zip(related_brand, repeated_xilie), start=1):
            shared_sheet.write_string(row, 0, brand_pair[0])
            shared_sheet.write_string(row, 1, brand_pair[1])
            shared_sheet.write_string(row, 2, f"{shared}")

        # Sheet3: every brand with all of its series, no header row.
        all_sheet = workbook.add_worksheet(name='Sheet3')
        for row, (brand, series_names) in enumerate(brand_dict.items()):
            all_sheet.write_string(row, 0, brand)
            all_sheet.write_string(row, 1, f"{series_names}")
def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
# filepath:文件路径,thre为两个品牌下型号重合率阈值,inner_thre为两个品牌下某条型号内关键词重合率阈值,a为权重调整,sheet_name为表单名
df = pd.read_excel(filepath, sheet_name=sheet_name, converters = {'产品编码':str})
category = df.loc[1,'产品类别']
wrong_category_list = ['台式机', '笔记本', '一体电脑', '复印纸']
if category in wrong_category_list:
brand_washing_special(filepath, sheet_name, category)
return
conn_zi_new = pymssql.connect(host='123.56.115.207', user='zgcprice3311', password='zgcprice20200628',
database='ZI_NEW', autocommit=True)
cursor_zi_new = conn_zi_new.cursor()
......@@ -277,6 +408,6 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
workbook.close()
if __name__ == '__main__':
    # Script entry point: run the brand check on the copy-paper
    # parameter-confirmation workbook.
    filepath = 'E:\\ZDZC\\Sourcetree_local\\公共代码\\复印纸参数确认.xlsx'
    brand_washing(filepath)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment