Commit c040d93d authored by Jialin

代码最终修改

parent d54af336
......@@ -5,6 +5,7 @@
import pandas as pd
import re
import xlsxwriter
import numpy as np
def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
......@@ -15,7 +16,7 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
# 处理缺失值
valid_index=[]
for i in df.index:
if df.loc[i,'产品品牌'] not in invalid_list and df.loc[i,'产品型号'] not in invalid_list:
if df.loc[i,'产品品牌'] not in invalid_list and df.loc[i,'*产品型号'] not in invalid_list:
valid_index.append(i)
result = df.loc[valid_index]
# 将df数据格式转为字符串
......@@ -27,7 +28,7 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
# 将df按品牌分类,按品牌提取每个型号的关键字,放入字典
brand_type = result.groupby('产品品牌')
for brand in brand_type:
result_unique = brand[1]['产品型号'].unique() # result_unique此时是array,元素是一个品牌名下的型号
result_unique = brand[1]['*产品型号'].unique() # result_unique此时是array,元素是一个品牌名下的型号
for j in range(len(result_unique)):
result_unique[j] = result_unique[j].upper().strip()
result_unique = pd.DataFrame(result_unique)[0].unique() # 全部变为大写,将大小写归一,result_unique此时是array
......@@ -132,7 +133,7 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
# 第三种方法,对比品牌名和另一个品牌型号关键字,如果有一个重合,就算作两个品牌相似
type_kw = {} # 用于接收品牌型号提取的关键字
for i in brand_type:
result_unique = i[1]['产品型号'].unique() # 品牌型号组成的数组,数组内无重复元素
result_unique = i[1]['*产品型号'].unique() # 品牌型号组成的数组,数组内无重复元素
for k in range(len(result_unique)):
result_unique[k]=result_unique[k].upper().strip()
result_unique = pd.DataFrame(result_unique)[0].unique()
......@@ -156,6 +157,36 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
related_brand3.append(tempo_list)
#写入excel
tempo_list = [] # 将所有的相似品牌,两两一对,写入集合,放入tempo_list
method1=[]
method2=[]
method3=[]
related_brand_list=[related_brand1,related_brand2,related_brand3]
method_list=[method1,method2,method3]
for i in range(len(related_brand_list)):
for list_i in related_brand_list[i]:
tempo_list.append(set(list_i[:2]))
method_list[i].append(set(list_i[:2]))
final_list = [] # final_list就是tempo_list的去重
for item in tempo_list:
if item not in final_list:
final_list.append(item)
method=[]
for item in final_list:
linshi_list=[]
if item in method1:
linshi_list.append('1')
if item in method2:
linshi_list.append('2')
if item in method3:
linshi_list.append('3')
method.append(','.join(linshi_list))
workbook = xlsxwriter.Workbook('./brand_filter.xlsx')
bold_format = workbook.add_format({'bold': True})
......@@ -167,32 +198,32 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
worksheet.write('D1', '方法', bold_format)
col = 0
row = 1
for list_i in related_brand1:
for brand_i in range(2):
worksheet.write_string(row, col, list_i[brand_i])
for index in range(len(final_list)):
for brand_i in final_list[index]:
worksheet.write_string(row, col, brand_i)
col += 1
col= 3
worksheet.write_string(row,col,'1')
worksheet.write_string(row,col,method[index])
row += 1
col = 0
for list_i in related_brand2:
for brand_i in range(2):
worksheet.write_string(row, col, list_i[brand_i])
col += 1
col = 3
worksheet.write_string(row, col, '2')
row += 1
col = 0
for list_i in related_brand3:
for brand_i in range(2):
worksheet.write_string(row, col, list_i[brand_i])
col += 1
col = 3
worksheet.write_string(row, col, '3')
row += 1
col = 0
# for list_i in related_brand2:
# for brand_i in range(2):
# worksheet.write_string(row, col, list_i[brand_i])
# col += 1
# col = 3
# worksheet.write_string(row, col, '2')
# row += 1
# col = 0
#
# for list_i in related_brand3:
# for brand_i in range(2):
# worksheet.write_string(row, col, list_i[brand_i])
# col += 1
# col = 3
# worksheet.write_string(row, col, '3')
# row += 1
# col = 0
# 第二个worksheet
worksheet2 = workbook.add_worksheet(name='Sheet2')
......@@ -239,6 +270,6 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
workbook.close()
if __name__ == '__main__':
filepath = 'E:\\ZDZC\\激光打印机参数确认.xlsx'
filepath = 'E:\\ZDZC\\扫描仪参数确认.xlsx'
brand_washing(filepath)
This diff is collapsed.
......@@ -12,7 +12,7 @@ import numpy as np
import xlsxwriter
def product_washing(filepath, thre=1, a=0):
def product_washing(filepath, category,thre=1, a=0):
df_null = pd.read_excel(".\\异常数据表格.xlsx")
invalid_list = df_null['异常数据名称'].values
df=pd.read_excel(filepath, converters={'产品编码':str})
......@@ -28,7 +28,7 @@ def product_washing(filepath, thre=1, a=0):
tempo_dict = {}
# 每个品牌提取产品型号关键字,放入tempo_dict
for i in brand[1].index:
k = brand[1].loc[i, '产品型号']
k = brand[1].loc[i, '*产品型号']
if k in invalid_list:
continue
pre_num = re.search(r'([A-Za-z]{0,4})\W?\d+', k) # pre_num为数字前的关键字
......@@ -85,25 +85,34 @@ def product_washing(filepath, thre=1, a=0):
# temp_list2 = re.findall(r'[A-Za-z]+\W?[A-Za-z]+', brand[0].upper()) # 提取英文单词
# brand_combined = temp_list1+temp_list2
tempo_dict[i] = [set(combined)]
other_parameters=df_null['重复参数项'][df_null['类别']==category][df_null['重复参数项'].notnull()].values
other_parameters_values=[]
for parameter in other_parameters:
other_parameters_values.append(brand[1].loc[i,parameter])
tempo_dict[i].extend(other_parameters_values)
tempo_dict[i] = [set(combined), brand[1].loc[i, '*质保时间'], brand[1].loc[i, '标配外服务及配件']]
# 对比产品型号关键字,相同则放入related_product
# 对比产品参数项,相同则放入related_product
tested_product=[]
for i in tempo_dict:
for j in tempo_dict:
if i != j and set([i,j]) not in tested_product:
if tempo_dict[i][1:]==tempo_dict[j][1:]:
accuracy_i=0
accuracy_j=0
for word_i in tempo_dict[i][0]:
if word_i in tempo_dict[j][0]:
accuracy_i += 1/(len(tempo_dict[i][0]) + a/len(tempo_dict[i][0]))
accuracy_j += 1/(len(tempo_dict[j][0]) + a/len(tempo_dict[j][0]))
if accuracy_i >= thre or accuracy_j >= thre:
if not (df.loc[i,'产品型号'].endswith('+') ^ df.loc[j,'产品型号'].endswith('+')):
related_product.append(set([i,j]))
for index in range(1,len(tempo_dict[i])):
if tempo_dict[i][index]!= tempo_dict[j][index] and \
(tempo_dict[i][index] not in invalid_list and tempo_dict[j][index] not in invalid_list):
tested_product.append(set([i, j]))
break # 如果出现了必须相等但不相等的参数,则退出到上一层循环。如果没有出现,则对比型号
accuracy_i=0
accuracy_j=0
for word_i in tempo_dict[i][0]:
if word_i in tempo_dict[j][0]:
accuracy_i += 1/(len(tempo_dict[i][0]) + a/len(tempo_dict[i][0]))
accuracy_j += 1/(len(tempo_dict[j][0]) + a/len(tempo_dict[j][0]))
if accuracy_i >= thre or accuracy_j >= thre:
if not (df.loc[i,'*产品型号'].endswith('+') ^ df.loc[j,'*产品型号'].endswith('+')):
related_product.append(set([i,j]))
tested_product.append(set([i,j]))
# a = set([i])
......@@ -152,5 +161,6 @@ def product_washing(filepath, thre=1, a=0):
if __name__ == '__main__':
filepath = "E:\\ZDZC\\激光打印机参数确认.xlsx"
product_washing(filepath)
filepath = "E:\\ZDZC\\扫描仪参数确认.xlsx"
category = '扫描仪'
product_washing(filepath,category)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment