Commit c040d93d authored by Jialin

代码最终修改

parent d54af336
......@@ -5,6 +5,7 @@
import pandas as pd
import re
import xlsxwriter
import numpy as np
def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
......@@ -15,7 +16,7 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
# 处理缺失值
valid_index=[]
for i in df.index:
if df.loc[i,'产品品牌'] not in invalid_list and df.loc[i,'产品型号'] not in invalid_list:
if df.loc[i,'产品品牌'] not in invalid_list and df.loc[i,'*产品型号'] not in invalid_list:
valid_index.append(i)
result = df.loc[valid_index]
# 将df数据格式转为字符串
......@@ -27,7 +28,7 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
# 将df按品牌分类,按品牌提取每个型号的关键字,放入字典
brand_type = result.groupby('产品品牌')
for brand in brand_type:
result_unique = brand[1]['产品型号'].unique() # result_unique此时是array,元素是一个品牌名下的型号
result_unique = brand[1]['*产品型号'].unique() # result_unique此时是array,元素是一个品牌名下的型号
for j in range(len(result_unique)):
result_unique[j] = result_unique[j].upper().strip()
result_unique = pd.DataFrame(result_unique)[0].unique() # 全部变为大写,将大小写归一,result_unique此时是array
......@@ -132,7 +133,7 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
# 第三种方法,对比品牌名和另一个品牌型号关键字,如果有一个重合,就算作两个品牌相似
type_kw = {} # 用于接收品牌型号提取的关键字
for i in brand_type:
result_unique = i[1]['产品型号'].unique() # 品牌型号组成的数组,数组内无重复元素
result_unique = i[1]['*产品型号'].unique() # 品牌型号组成的数组,数组内无重复元素
for k in range(len(result_unique)):
result_unique[k]=result_unique[k].upper().strip()
result_unique = pd.DataFrame(result_unique)[0].unique()
......@@ -156,6 +157,36 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
related_brand3.append(tempo_list)
#写入excel
tempo_list = [] # 将所有的相似品牌,两两一对,写入集合,放入tempo_list
method1=[]
method2=[]
method3=[]
related_brand_list=[related_brand1,related_brand2,related_brand3]
method_list=[method1,method2,method3]
for i in range(len(related_brand_list)):
for list_i in related_brand_list[i]:
tempo_list.append(set(list_i[:2]))
method_list[i].append(set(list_i[:2]))
final_list = [] # final_list就是tempo_list的去重
for item in tempo_list:
if item not in final_list:
final_list.append(item)
method=[]
for item in final_list:
linshi_list=[]
if item in method1:
linshi_list.append('1')
if item in method2:
linshi_list.append('2')
if item in method3:
linshi_list.append('3')
method.append(','.join(linshi_list))
workbook = xlsxwriter.Workbook('./brand_filter.xlsx')
bold_format = workbook.add_format({'bold': True})
......@@ -167,32 +198,32 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
worksheet.write('D1', '方法', bold_format)
col = 0
row = 1
for list_i in related_brand1:
for brand_i in range(2):
worksheet.write_string(row, col, list_i[brand_i])
for index in range(len(final_list)):
for brand_i in final_list[index]:
worksheet.write_string(row, col, brand_i)
col += 1
col= 3
worksheet.write_string(row,col,'1')
worksheet.write_string(row,col,method[index])
row += 1
col = 0
for list_i in related_brand2:
for brand_i in range(2):
worksheet.write_string(row, col, list_i[brand_i])
col += 1
col = 3
worksheet.write_string(row, col, '2')
row += 1
col = 0
for list_i in related_brand3:
for brand_i in range(2):
worksheet.write_string(row, col, list_i[brand_i])
col += 1
col = 3
worksheet.write_string(row, col, '3')
row += 1
col = 0
# for list_i in related_brand2:
# for brand_i in range(2):
# worksheet.write_string(row, col, list_i[brand_i])
# col += 1
# col = 3
# worksheet.write_string(row, col, '2')
# row += 1
# col = 0
#
# for list_i in related_brand3:
# for brand_i in range(2):
# worksheet.write_string(row, col, list_i[brand_i])
# col += 1
# col = 3
# worksheet.write_string(row, col, '3')
# row += 1
# col = 0
# 第二个worksheet
worksheet2 = workbook.add_worksheet(name='Sheet2')
......@@ -239,6 +270,6 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
workbook.close()
if __name__ == '__main__':
filepath = 'E:\\ZDZC\\激光打印机参数确认.xlsx'
filepath = 'E:\\ZDZC\\扫描仪参数确认.xlsx'
brand_washing(filepath)
......@@ -9,20 +9,25 @@
import pandas as pd
import re
import numpy as np
import pymssql
import time
# category为产品类型
# a是同一品牌或父品牌下产品数量占产品总数量的百分比,作为阈值,a越大,有异常的产品越多;b和a一样,只是用于产品数据类型和参数
# c_list是产品参数中,数据类型较为统一的参数 在excel列名中的位置,从0开始,必须是一个list
def class_washing(category, filepath, c_list,a=0.02, b=0.01):
def class_washing(category, filepath, b=0.01):
df_null=pd.read_excel(".\\异常数据表格.xlsx")
invalid_list = df_null['异常数据名称'].values
df = pd.read_excel(filepath,converters = {'产品编码':str})
df.drop(columns='Unnamed: 0', axis=1, inplace=True)
# 检测每列数据中 同一数据类型的产品数量少于产品总数量的b 的产品index.(可优化成字典形式存储读取)
t1=time.time()
print('开始检测数据类型')
dtype_minor_dict = {}
for col in df.columns:
for col in df_null['数据类型异常'][df_null['类别']==category][df_null['数据类型异常'].notnull()].values:
type_list = {}
valid_index = []
for i in df.index:
......@@ -51,6 +56,9 @@ def class_washing(category, filepath, c_list,a=0.02, b=0.01):
df[col] = df[col].astype(str)
# 检测产品类型错误的产品,和产品名称中不带有产品类型的产品。由于代码简单,就放在一起了
t2=time.time()
print(t2-t1)
print('开始检测错误类别和错误名称')
wrong_class = []
not_in_name = []
for i in df.index:
......@@ -59,49 +67,93 @@ def class_washing(category, filepath, c_list,a=0.02, b=0.01):
if category not in df.loc[i, '产品名称']:
not_in_name.append(i)
# 检测产品父品牌中品牌出现次数小于产品总数的a的产品
father_brand_minor = []
father_brand_list = []
col='产品父品牌'
valid_index=[]
for i in df.index:
if df.loc[i, col] in invalid_list:
continue
valid_index.append(i)
valid_df=df.loc[valid_index]
father_brand_num = valid_df.groupby(by='产品父品牌')['产品编码'].count() # 之所以用产品编码来计数是因为产品编码肯定不会有缺失值
father_num_list = [x for x in father_brand_num.unique() if x < len(valid_df.index)*a]
for i in father_brand_num.index: # i 就是产品父品牌
if father_brand_num.loc[i] in father_num_list: # father_brand_num.loc[i] 就是该父品牌出现次数
father_brand_list.append(i)
for i in valid_df.index:
if valid_df.loc[i, '产品父品牌'] in father_brand_list:
father_brand_minor.append(i)
# 检测产品品牌中品牌出现次数少的产品
brand_minor = []
brand_list = []
col = '产品品牌'
valid_index = []
# 检测品牌中是否有不在category下对应的brand_id的产品品牌
t3=time.time()
print(t3-t2)
print('开始检测错误品牌')
conn_zi_new = pymssql.connect(host='123.56.115.207', user='zgcprice3311', password='zgcprice20200628',
database='ZI_NEW', autocommit=True)
cursor_zi_new = conn_zi_new.cursor()
cursor_zi_new.execute(f"select id from p_category where name='{category}'")
category_id=cursor_zi_new.fetchone()
if not category_id:
print('输入类别不在数据库中,请查证')
return
category_id = category_id[0]
cursor_zi_new.execute(f"select brandid from p_spu where categoryid={category_id}")
brand_id_fetchall=cursor_zi_new.fetchall()
brand_id_list = []
for brand_tuple in brand_id_fetchall:
brand_id_list.append(brand_tuple[0])
brand_name_list = []
for brand_id in brand_id_list:
cursor_zi_new.execute(f"select name from p_brand where id={brand_id}")
brand_name_fetch=cursor_zi_new.fetchone()
if brand_name_fetch:
brand_name_list.append(brand_name_fetch[0].strip("'"))
wrong_brand=[]
for i in df.index:
if df.loc[i, col] in invalid_list:
continue
valid_index.append(i)
valid_df=df.loc[valid_index]
brand_num = valid_df.groupby(by='产品品牌')['产品编码'].count() # 同上
num_list = [x for x in brand_num.unique() if x < len(valid_df.index)*a]
for i in brand_num.index:
if brand_num.loc[i] in num_list:
brand_list.append(i)
for i in valid_df.index:
if valid_df.loc[i, '产品品牌'] in brand_list:
brand_minor.append(i)
if df.loc[i]['产品品牌'] not in brand_name_list:
wrong_brand.append(i)
# cursor_zi_new.execute(f"select id from p_brand where name='{df.loc[i]['产品品牌']}'")
# brand_id=cursor_zi_new.fetchone()
# if not brand_id:
# wrong_brand.append(i)
# continue
# brand_id=brand_id[0]
# if brand_id not in brand_id_list:
# wrong_brand.append(i)
# # 检测产品父品牌中品牌出现次数小于产品总数的a的产品
# father_brand_minor = []
# father_brand_list = []
# col='产品父品牌'
# valid_index=[]
# for i in df.index:
# if df.loc[i, col] in invalid_list:
# continue
# valid_index.append(i)
# valid_df=df.loc[valid_index]
# father_brand_num = valid_df.groupby(by='产品父品牌')['产品编码'].count() # 之所以用产品编码来计数是因为产品编码肯定不会有缺失值
# father_num_list = [x for x in father_brand_num.unique() if x < len(valid_df.index)*a]
# for i in father_brand_num.index: # i 就是产品父品牌
# if father_brand_num.loc[i] in father_num_list: # father_brand_num.loc[i] 就是该父品牌出现次数
# father_brand_list.append(i)
#
# for i in valid_df.index:
# if valid_df.loc[i, '产品父品牌'] in father_brand_list:
# father_brand_minor.append(i)
#
# # 检测产品品牌中品牌出现次数少的产品
# brand_minor = []
# brand_list = []
# col = '产品品牌'
# valid_index = []
# for i in df.index:
# if df.loc[i, col] in invalid_list:
# continue
# valid_index.append(i)
# valid_df=df.loc[valid_index]
# brand_num = valid_df.groupby(by='产品品牌')['产品编码'].count() # 同上
# num_list = [x for x in brand_num.unique() if x < len(valid_df.index)*a]
# for i in brand_num.index:
# if brand_num.loc[i] in num_list:
# brand_list.append(i)
#
# for i in valid_df.index:
# if valid_df.loc[i, '产品品牌'] in brand_list:
# brand_minor.append(i)
# 检测产品参数列中数据长度离该列平均数据长度2*std之外的产品,7是第一个产品参数列,-2是质保时间,-1是产品型号
t4=time.time()
print(t4-t3)
print('开始检测错误长度')
length_minor_dict = {}
for col in df.columns[7:-2]:
for col in df_null['数据长度异常'][df_null['类别']==category][df_null['数据长度异常'].notnull()].values:
col_length = []
valid_index=[]
for i in df.index:
......@@ -122,9 +174,13 @@ def class_washing(category, filepath, c_list,a=0.02, b=0.01):
length_minor_dict[index] = col
length_minor=[]
length_minor.extend(length_minor_dict.keys())
# 检测产品参数列数据格式小于总数量的b的产品
t5=time.time()
print(t5-t4)
print('开始检测错误数据格式')
format_minor_dict = {}
for col in df.columns[7:-2]:
for col in df_null['数据格式异常'][df_null['类别']==category][df_null['数据格式异常'].notnull()].values:
counter_dict = {}
valid_index = []
for i in df.index:
......@@ -160,9 +216,8 @@ def class_washing(category, filepath, c_list,a=0.02, b=0.01):
elif keys_index not in format_minor_dict.keys():
format_minor_dict[keys_index] = col
format_minor=[]
format_minor.extend(format_minor_dict.keys())
format_minor=[]
format_minor.extend(format_minor_dict.keys())
# length_record = []
# for keys in counter_dict:
# if not length_record:
......@@ -172,79 +227,114 @@ def class_washing(category, filepath, c_list,a=0.02, b=0.01):
#
# format_minor += length_record[0][1]
# 接下来是针对扫描仪的部分.对于特定产品,还可以从每个产品参数中选出少数派。如果知道易混淆的产品类型,还要特意加上挑取易混淆产品类型的代码
# 对于产品名称中没有“扫描仪”的,如果没有“高拍仪”就挑出来
not_in_name2 = []
for i in not_in_name:
if '高拍仪' not in df.loc[i,'产品名称']:
not_in_name2.append(i)
for special_name in df_null['产品名称异常'][df_null['类别']==category][df_null['产品名称异常'].notnull()].values:
if special_name in df.loc[i,'产品名称']:
break
not_in_name2.append(i)
# 对于产品参数中,数据类型较少的参数,其中如果有数量小于产品总数量的b的,挑出来
# 对于标准产品参数中,如果有数据不在标准字典中的,挑出来
t6=time.time()
print(t6-t5)
print('开始检测标准参数')
character_minor_dict = {}
for col_i in df.columns[c_list]:
tempo_list = []
tempo_list2 = []
valid_index = []
for col_i in df_null['标准参数异常'][df_null['类别']==category][df_null['标准参数异常'].notnull()].values:
temp_list = []
cursor_zi_new.execute(f"select stdvalue from ShuJuZiDian_Cfg where categoryname='{category}' and subtitle='{col_i.strip('*')}'")
standard_value_fetchall=cursor_zi_new.fetchall()
if not standard_value_fetchall:
print(f"{col_i.strip('*')} 不在 ShuJuZiDian_Cfg,请检查。该参数项在此次运行中未被采用")
continue
standard_value=[]
for std_tuple in standard_value_fetchall:
standard_value.append(std_tuple[0])
for i in df.index:
if df.loc[i, col_i] in invalid_list:
value_col=df.loc[i, col_i]
if value_col in invalid_list:
continue
valid_index.append(i)
valid_df = df.loc[valid_index]
cha_num = valid_df.groupby(by=col_i)['产品编码'].count()
num_list = [x for x in cha_num.unique() if x < len(valid_df.index)*b]
for i in cha_num.index:
if cha_num.loc[i] in num_list:
tempo_list.append(i)
for i in valid_df.index:
if valid_df.loc[i, col_i] in tempo_list:
tempo_list2.append(i)
character_minor_dict[col_i] = tempo_list2
if value_col not in standard_value:
temp_list.append(i)
character_minor_dict[col_i] = temp_list
character_minor = []
for keys_i in character_minor_dict:
character_minor.extend(character_minor_dict[keys_i])
# 将挑出的可疑数据整合到一起 (wrong_class没加,因为里面的肯定不对)
# cha_num = valid_df.groupby(by=col_i)['产品编码'].count()
# num_list = [x for x in cha_num.unique() if x < len(valid_df.index)*b]
# for i in cha_num.index:
# if cha_num.loc[i] in num_list:
# tempo_list.append(i)
# for i in valid_df.index:
# if valid_df.loc[i, col_i] in tempo_list:
# tempo_list2.append(i)
# character_minor_dict[col_i] = tempo_list2
#
# character_minor = []
# for keys_i in character_minor_dict:
# character_minor.extend(character_minor_dict[keys_i])
t7=time.time()
print(t7-t6)
print('开始整合数据')
# 将挑出的可疑数据整合到一起
index_minor = []
index_minor.extend(wrong_class)
index_minor.extend(format_minor)
index_minor.extend(length_minor)
index_minor.extend(brand_minor)
index_minor.extend(father_brand_minor)
index_minor.extend(wrong_brand)
# index_minor.extend(father_brand_minor)
index_minor.extend(not_in_name2)
index_minor.extend(dtype_minor)
index_minor.extend(character_minor)
index_minor = set(index_minor)
final_df = pd.DataFrame(np.zeros((len(index_minor), 8)), index=list(index_minor),
columns=['计数', '产品类型异常', '产品名称异常', '父品牌异常', '品牌异常', '数据类型异常', '数据格式异常', '数据长度异常'])
final_df = pd.DataFrame(np.zeros((len(index_minor), 7)), index=list(index_minor),
columns=['计数', '产品类别异常', '产品名称异常', '品牌异常', '数据类型异常', '数据格式异常', '数据长度异常'])
w_class = df_null['产品类别异常权重'][df_null['类别']==category][df_null['产品类别异常权重'].notnull()].values
w_format = df_null['数据格式异常权重'][df_null['类别']==category][df_null['数据格式异常权重'].notnull()].values
w_length = df_null['数据长度异常权重'][df_null['类别']==category][df_null['数据长度异常权重'].notnull()].values
w_brand = df_null['品牌异常权重'][df_null['类别']==category][df_null['品牌异常权重'].notnull()].values
w_name = df_null['产品名称异常权重'][df_null['类别']==category][df_null['产品名称异常权重'].notnull()].values
w_dtype = df_null['数据类型异常权重'][df_null['类别']==category][df_null['数据类型异常权重'].notnull()].values
w_stdparam = df_null['标准参数异常权重'][df_null['类别']==category][df_null['标准参数异常权重'].notnull()].values
for i in index_minor:
count = 0
if i in wrong_class:
count += 1
final_df.loc[i, '产品类异常'] = 1
count += w_class
final_df.loc[i, '产品类异常'] = 1
if i in format_minor:
count += len(format_minor_dict[i].split()) # 如果该行数据有多列数据格式异常,就要加多次,
count += len(format_minor_dict[i].split()) * w_format # 如果该行数据有多列数据格式异常,就要加多次,
final_df.loc[i, '数据格式异常'] = format_minor_dict[i] # 但其中数据是空格分割的字符串,所以用split
if i in length_minor:
count += len(length_minor_dict[i].split())
count += len(length_minor_dict[i].split()) * w_length
final_df.loc[i, '数据长度异常'] = length_minor_dict[i]
if i in brand_minor:
count += 1
if i in wrong_brand:
count += w_brand
final_df.loc[i, '品牌异常'] = 1
if i in father_brand_minor:
count += 1
final_df.loc[i, '父品牌异常'] = 1
# if i in father_brand_minor:
# count += 1
# final_df.loc[i, '父品牌异常'] = 1
if i in not_in_name2:
count += 1
count += w_name
final_df.loc[i, '产品名称异常'] = 1
if i in dtype_minor:
count += len(dtype_minor_dict[i].split())
count += len(dtype_minor_dict[i].split()) * w_dtype
final_df.loc[i, '数据类型异常'] = dtype_minor_dict[i]
for keys_i in character_minor_dict:
if i in character_minor_dict[keys_i]:
final_df.loc[i, keys_i + '异常'] = 1
count += 1
count += w_stdparam
else:
final_df.loc[i, keys_i + '异常'] = 0
final_df.loc[i, '计数'] = count
......@@ -255,10 +345,10 @@ def class_washing(category, filepath, c_list,a=0.02, b=0.01):
if __name__ == '__main__':
category='激光打印机'
filepath="E:\\ZDZC\\激光打印机参数确认.xlsx"
c_list=[6,7,-4,-3]
category='扫描仪'
filepath="E:\\ZDZC\\扫描仪参数确认.xlsx"
#c_list=[6,7,-4,-3]
# category = '扫描仪'
# filepath="E:\\ZDZC\\扫描仪参数确认.xlsx"
# c_list=[7,8,9]
class_washing(category, filepath, c_list)
class_washing(category, filepath)
......@@ -12,7 +12,7 @@ import numpy as np
import xlsxwriter
def product_washing(filepath, thre=1, a=0):
def product_washing(filepath, category,thre=1, a=0):
df_null = pd.read_excel(".\\异常数据表格.xlsx")
invalid_list = df_null['异常数据名称'].values
df=pd.read_excel(filepath, converters={'产品编码':str})
......@@ -28,7 +28,7 @@ def product_washing(filepath, thre=1, a=0):
tempo_dict = {}
# 每个品牌提取产品型号关键字,放入tempo_dict
for i in brand[1].index:
k = brand[1].loc[i, '产品型号']
k = brand[1].loc[i, '*产品型号']
if k in invalid_list:
continue
pre_num = re.search(r'([A-Za-z]{0,4})\W?\d+', k) # pre_num为数字前的关键字
......@@ -85,25 +85,34 @@ def product_washing(filepath, thre=1, a=0):
# temp_list2 = re.findall(r'[A-Za-z]+\W?[A-Za-z]+', brand[0].upper()) # 提取英文单词
# brand_combined = temp_list1+temp_list2
tempo_dict[i] = [set(combined)]
other_parameters=df_null['重复参数项'][df_null['类别']==category][df_null['重复参数项'].notnull()].values
other_parameters_values=[]
for parameter in other_parameters:
other_parameters_values.append(brand[1].loc[i,parameter])
tempo_dict[i].extend(other_parameters_values)
tempo_dict[i] = [set(combined), brand[1].loc[i, '*质保时间'], brand[1].loc[i, '标配外服务及配件']]
# 对比产品型号关键字,相同则放入related_product
# 对比产品参数项,相同则放入related_product
tested_product=[]
for i in tempo_dict:
for j in tempo_dict:
if i != j and set([i,j]) not in tested_product:
if tempo_dict[i][1:]==tempo_dict[j][1:]:
accuracy_i=0
accuracy_j=0
for word_i in tempo_dict[i][0]:
if word_i in tempo_dict[j][0]:
accuracy_i += 1/(len(tempo_dict[i][0]) + a/len(tempo_dict[i][0]))
accuracy_j += 1/(len(tempo_dict[j][0]) + a/len(tempo_dict[j][0]))
if accuracy_i >= thre or accuracy_j >= thre:
if not (df.loc[i,'产品型号'].endswith('+') ^ df.loc[j,'产品型号'].endswith('+')):
related_product.append(set([i,j]))
for index in range(1,len(tempo_dict[i])):
if tempo_dict[i][index]!= tempo_dict[j][index] and \
(tempo_dict[i][index] not in invalid_list and tempo_dict[j][index] not in invalid_list):
tested_product.append(set([i, j]))
break # 如果出现了必须相等但不相等的参数,则退出到上一层循环。如果没有出现,则对比型号
accuracy_i=0
accuracy_j=0
for word_i in tempo_dict[i][0]:
if word_i in tempo_dict[j][0]:
accuracy_i += 1/(len(tempo_dict[i][0]) + a/len(tempo_dict[i][0]))
accuracy_j += 1/(len(tempo_dict[j][0]) + a/len(tempo_dict[j][0]))
if accuracy_i >= thre or accuracy_j >= thre:
if not (df.loc[i,'*产品型号'].endswith('+') ^ df.loc[j,'*产品型号'].endswith('+')):
related_product.append(set([i,j]))
tested_product.append(set([i,j]))
# a = set([i])
......@@ -152,5 +161,6 @@ def product_washing(filepath, thre=1, a=0):
if __name__ == '__main__':
filepath = "E:\\ZDZC\\激光打印机参数确认.xlsx"
product_washing(filepath)
filepath = "E:\\ZDZC\\扫描仪参数确认.xlsx"
category = '扫描仪'
product_washing(filepath,category)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment