Commit c040d93d authored by Jialin

代码最终修改

parent d54af336
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
import pandas as pd import pandas as pd
import re import re
import xlsxwriter import xlsxwriter
import numpy as np
def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0): def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
...@@ -15,7 +16,7 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0): ...@@ -15,7 +16,7 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
# 处理缺失值 # 处理缺失值
valid_index=[] valid_index=[]
for i in df.index: for i in df.index:
if df.loc[i,'产品品牌'] not in invalid_list and df.loc[i,'产品型号'] not in invalid_list: if df.loc[i,'产品品牌'] not in invalid_list and df.loc[i,'*产品型号'] not in invalid_list:
valid_index.append(i) valid_index.append(i)
result = df.loc[valid_index] result = df.loc[valid_index]
# 将df数据格式转为字符串 # 将df数据格式转为字符串
...@@ -27,7 +28,7 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0): ...@@ -27,7 +28,7 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
# 将df按品牌分类,按品牌提取每个型号的关键字,放入字典 # 将df按品牌分类,按品牌提取每个型号的关键字,放入字典
brand_type = result.groupby('产品品牌') brand_type = result.groupby('产品品牌')
for brand in brand_type: for brand in brand_type:
result_unique = brand[1]['产品型号'].unique() # result_unique此时是array,元素是一个品牌名下的型号 result_unique = brand[1]['*产品型号'].unique() # result_unique此时是array,元素是一个品牌名下的型号
for j in range(len(result_unique)): for j in range(len(result_unique)):
result_unique[j] = result_unique[j].upper().strip() result_unique[j] = result_unique[j].upper().strip()
result_unique = pd.DataFrame(result_unique)[0].unique() # 全部变为大写,将大小写归一,result_unique此时是array result_unique = pd.DataFrame(result_unique)[0].unique() # 全部变为大写,将大小写归一,result_unique此时是array
...@@ -132,7 +133,7 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0): ...@@ -132,7 +133,7 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
# 第三种方法,对比品牌名和另一个品牌型号关键字,如果有一个重合,就算作两个品牌相似 # 第三种方法,对比品牌名和另一个品牌型号关键字,如果有一个重合,就算作两个品牌相似
type_kw = {} # 用于接收品牌型号提取的关键字 type_kw = {} # 用于接收品牌型号提取的关键字
for i in brand_type: for i in brand_type:
result_unique = i[1]['产品型号'].unique() # 品牌型号组成的数组,数组内无重复元素 result_unique = i[1]['*产品型号'].unique() # 品牌型号组成的数组,数组内无重复元素
for k in range(len(result_unique)): for k in range(len(result_unique)):
result_unique[k]=result_unique[k].upper().strip() result_unique[k]=result_unique[k].upper().strip()
result_unique = pd.DataFrame(result_unique)[0].unique() result_unique = pd.DataFrame(result_unique)[0].unique()
...@@ -156,6 +157,36 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0): ...@@ -156,6 +157,36 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
related_brand3.append(tempo_list) related_brand3.append(tempo_list)
#写入excel #写入excel
tempo_list = [] # 将所有的相似品牌,两两一对,写入集合,放入tempo_list
method1=[]
method2=[]
method3=[]
related_brand_list=[related_brand1,related_brand2,related_brand3]
method_list=[method1,method2,method3]
for i in range(len(related_brand_list)):
for list_i in related_brand_list[i]:
tempo_list.append(set(list_i[:2]))
method_list[i].append(set(list_i[:2]))
final_list = [] # final_list就是tempo_list的去重
for item in tempo_list:
if item not in final_list:
final_list.append(item)
method=[]
for item in final_list:
linshi_list=[]
if item in method1:
linshi_list.append('1')
if item in method2:
linshi_list.append('2')
if item in method3:
linshi_list.append('3')
method.append(','.join(linshi_list))
workbook = xlsxwriter.Workbook('./brand_filter.xlsx') workbook = xlsxwriter.Workbook('./brand_filter.xlsx')
bold_format = workbook.add_format({'bold': True}) bold_format = workbook.add_format({'bold': True})
...@@ -167,32 +198,32 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0): ...@@ -167,32 +198,32 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
worksheet.write('D1', '方法', bold_format) worksheet.write('D1', '方法', bold_format)
col = 0 col = 0
row = 1 row = 1
for list_i in related_brand1: for index in range(len(final_list)):
for brand_i in range(2): for brand_i in final_list[index]:
worksheet.write_string(row, col, list_i[brand_i]) worksheet.write_string(row, col, brand_i)
col += 1 col += 1
col= 3 col= 3
worksheet.write_string(row,col,'1') worksheet.write_string(row,col,method[index])
row += 1 row += 1
col = 0 col = 0
for list_i in related_brand2: # for list_i in related_brand2:
for brand_i in range(2): # for brand_i in range(2):
worksheet.write_string(row, col, list_i[brand_i]) # worksheet.write_string(row, col, list_i[brand_i])
col += 1 # col += 1
col = 3 # col = 3
worksheet.write_string(row, col, '2') # worksheet.write_string(row, col, '2')
row += 1 # row += 1
col = 0 # col = 0
#
for list_i in related_brand3: # for list_i in related_brand3:
for brand_i in range(2): # for brand_i in range(2):
worksheet.write_string(row, col, list_i[brand_i]) # worksheet.write_string(row, col, list_i[brand_i])
col += 1 # col += 1
col = 3 # col = 3
worksheet.write_string(row, col, '3') # worksheet.write_string(row, col, '3')
row += 1 # row += 1
col = 0 # col = 0
# 第二个worksheet # 第二个worksheet
worksheet2 = workbook.add_worksheet(name='Sheet2') worksheet2 = workbook.add_worksheet(name='Sheet2')
...@@ -239,6 +270,6 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0): ...@@ -239,6 +270,6 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
workbook.close() workbook.close()
if __name__ == '__main__': if __name__ == '__main__':
filepath = 'E:\\ZDZC\\激光打印机参数确认.xlsx' filepath = 'E:\\ZDZC\\扫描仪参数确认.xlsx'
brand_washing(filepath) brand_washing(filepath)
This diff is collapsed.
...@@ -12,7 +12,7 @@ import numpy as np ...@@ -12,7 +12,7 @@ import numpy as np
import xlsxwriter import xlsxwriter
def product_washing(filepath, thre=1, a=0): def product_washing(filepath, category,thre=1, a=0):
df_null = pd.read_excel(".\\异常数据表格.xlsx") df_null = pd.read_excel(".\\异常数据表格.xlsx")
invalid_list = df_null['异常数据名称'].values invalid_list = df_null['异常数据名称'].values
df=pd.read_excel(filepath, converters={'产品编码':str}) df=pd.read_excel(filepath, converters={'产品编码':str})
...@@ -28,7 +28,7 @@ def product_washing(filepath, thre=1, a=0): ...@@ -28,7 +28,7 @@ def product_washing(filepath, thre=1, a=0):
tempo_dict = {} tempo_dict = {}
# 每个品牌提取产品型号关键字,放入tempo_dict # 每个品牌提取产品型号关键字,放入tempo_dict
for i in brand[1].index: for i in brand[1].index:
k = brand[1].loc[i, '产品型号'] k = brand[1].loc[i, '*产品型号']
if k in invalid_list: if k in invalid_list:
continue continue
pre_num = re.search(r'([A-Za-z]{0,4})\W?\d+', k) # pre_num为数字前的关键字 pre_num = re.search(r'([A-Za-z]{0,4})\W?\d+', k) # pre_num为数字前的关键字
...@@ -85,25 +85,34 @@ def product_washing(filepath, thre=1, a=0): ...@@ -85,25 +85,34 @@ def product_washing(filepath, thre=1, a=0):
# temp_list2 = re.findall(r'[A-Za-z]+\W?[A-Za-z]+', brand[0].upper()) # 提取英文单词 # temp_list2 = re.findall(r'[A-Za-z]+\W?[A-Za-z]+', brand[0].upper()) # 提取英文单词
# brand_combined = temp_list1+temp_list2 # brand_combined = temp_list1+temp_list2
tempo_dict[i] = [set(combined)]
other_parameters=df_null['重复参数项'][df_null['类别']==category][df_null['重复参数项'].notnull()].values
other_parameters_values=[]
for parameter in other_parameters:
other_parameters_values.append(brand[1].loc[i,parameter])
tempo_dict[i].extend(other_parameters_values)
tempo_dict[i] = [set(combined), brand[1].loc[i, '*质保时间'], brand[1].loc[i, '标配外服务及配件']]
# 对比产品型号关键字,相同则放入related_product # 对比产品参数项,相同则放入related_product
tested_product=[] tested_product=[]
for i in tempo_dict: for i in tempo_dict:
for j in tempo_dict: for j in tempo_dict:
if i != j and set([i,j]) not in tested_product: if i != j and set([i,j]) not in tested_product:
if tempo_dict[i][1:]==tempo_dict[j][1:]: for index in range(1,len(tempo_dict[i])):
accuracy_i=0 if tempo_dict[i][index]!= tempo_dict[j][index] and \
accuracy_j=0 (tempo_dict[i][index] not in invalid_list and tempo_dict[j][index] not in invalid_list):
for word_i in tempo_dict[i][0]: tested_product.append(set([i, j]))
if word_i in tempo_dict[j][0]: break # 如果出现了必须相等但不相等的参数,则退出到上一层循环。如果没有出现,则对比型号
accuracy_i += 1/(len(tempo_dict[i][0]) + a/len(tempo_dict[i][0])) accuracy_i=0
accuracy_j += 1/(len(tempo_dict[j][0]) + a/len(tempo_dict[j][0])) accuracy_j=0
if accuracy_i >= thre or accuracy_j >= thre: for word_i in tempo_dict[i][0]:
if not (df.loc[i,'产品型号'].endswith('+') ^ df.loc[j,'产品型号'].endswith('+')): if word_i in tempo_dict[j][0]:
related_product.append(set([i,j])) accuracy_i += 1/(len(tempo_dict[i][0]) + a/len(tempo_dict[i][0]))
accuracy_j += 1/(len(tempo_dict[j][0]) + a/len(tempo_dict[j][0]))
if accuracy_i >= thre or accuracy_j >= thre:
if not (df.loc[i,'*产品型号'].endswith('+') ^ df.loc[j,'*产品型号'].endswith('+')):
related_product.append(set([i,j]))
tested_product.append(set([i,j])) tested_product.append(set([i,j]))
# a = set([i]) # a = set([i])
...@@ -152,5 +161,6 @@ def product_washing(filepath, thre=1, a=0): ...@@ -152,5 +161,6 @@ def product_washing(filepath, thre=1, a=0):
if __name__ == '__main__': if __name__ == '__main__':
filepath = "E:\\ZDZC\\激光打印机参数确认.xlsx" filepath = "E:\\ZDZC\\扫描仪参数确认.xlsx"
product_washing(filepath) category = '扫描仪'
product_washing(filepath,category)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment