Commit e6658a7a authored by Jialin's avatar Jialin

李佳林

parent ed7d80ba
#!usr/bin/env python
# -*- coding:utf-8 -*-
"""
@author: dell
@file: after_lijie.py
@time: 2021/03/31
@desc:
"""
import pandas as pd
import numpy as np
import re
import xlsxwriter
# Markers appended to the suffix keyword when they occur anywhere in a model name.
_PACHONG_EDITION_MARKERS = ('升级版', '专业版', '教育版', '+')


def _split_model_name(name):
    """Split a model name into ``(prefix, number, suffix)`` keyword strings.

    * prefix: up to four letters directly in front of the first digit run.
    * number: the first digit run; when the name has no digits at all, every
      run of ASCII letters joined together is used instead.
    * suffix: the letter/digit runs following the first digit run, plus any
      edition marker ('升级版', '专业版', '教育版', '+') found in the name.

    Each part is an empty string when nothing was found.
    """
    prefix_match = re.search(r'([A-Za-z]{0,4})\W?\d+', name)
    prefix = prefix_match.group(1) if prefix_match else ''

    # Only the first digit run is used, even when the name contains several.
    number_match = re.search(r'(\d+)', name)
    if number_match:
        number = number_match.group(1)
    else:
        number = ''.join(re.findall(r'[A-Za-z]+', name))

    suffix = ''
    trailing = re.findall(
        r'\d+([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?',
        name)
    if trailing:
        suffix = ''.join(trailing[0])
        # NOTE(review): the original indentation was lost; edition markers are
        # assumed to be added only when the name contains a digit run.
        for marker in _PACHONG_EDITION_MARKERS:
            if marker in name:
                suffix += marker
    return prefix, number, suffix


def _model_keywords_differ(crawled, supplied):
    """Return True when two ``(prefix, number, suffix)`` triples disagree.

    Prefixes are only compared when both are non-empty; the number and suffix
    parts must always be equal.
    """
    crawled_prefix, crawled_number, crawled_suffix = crawled
    supplied_prefix, supplied_number, supplied_suffix = supplied
    if crawled_prefix and supplied_prefix and crawled_prefix != supplied_prefix:
        return True
    return crawled_number != supplied_number or crawled_suffix != supplied_suffix


def pachong_washing(filepath):
    """Blank out crawled data whose model does not match the supplemented model.

    Reads the Excel file at ``filepath``, upper-cases the '补充后型号' and
    '爬取名称' columns, extracts model keywords from both and, for every row
    where they disagree, overwrites the columns at positions 15..20 with
    '暂无数据'. The result is written to './after_lijie.xlsx'.

    :param filepath: path of the Excel workbook to clean; it must contain the
        columns '补充后型号' and '爬取名称'.
    """
    df = pd.read_excel(filepath, converters={'产品编码': str})
    for col in ['补充后型号', '爬取名称']:
        df[col] = df[col].astype(str).str.upper()

    # One pass over both columns; an empty keyword is kept as '' so that two
    # names without any number keyword compare equal instead of raising.
    mismatch = pd.Series(
        [
            _model_keywords_differ(_split_model_name(crawled), _split_model_name(supplied))
            for crawled, supplied in zip(df['爬取名称'], df['补充后型号'])
        ],
        index=df.index,
        dtype=bool,
    )

    if mismatch.any():
        for col in df.columns[15:21]:
            # Cast first so that writing text into a numeric column is explicit.
            df[col] = df[col].astype(object)
            df.loc[mismatch, col] = '暂无数据'

    df.to_excel('./after_lijie.xlsx')
if __name__ == '__main__':
    import sys

    # An optional first command-line argument overrides the default workbook.
    filepath = sys.argv[1] if len(sys.argv) > 1 else "E:\\ZDZC\\扫描仪参数确认(爬虫).xlsx"
    pachong_washing(filepath)
\ No newline at end of file
#!usr/bin/env python
# -*- coding:utf-8 -*-
"""
@author: dell
@file: product_filter.py
@time: 2021/03/29
@desc:
"""
import pandas as pd
import re
import numpy as np
import xlsxwriter
# Markers treated as one extra keyword each when they occur in a model name.
_PRODUCT_EDITION_MARKERS = ('升级版', '专业版', '教育版', '+')


def _product_model_keywords(model):
    """Return the set of non-empty keywords extracted from a product model.

    Keywords are: up to four letters in front of the first digit run, the
    first digit run itself (or, when the model has no digits, every run of
    ASCII letters), the letter/digit runs after the first digit run, and any
    edition marker contained in the model.
    """
    keywords = []
    prefix_match = re.search(r'([A-Za-z]{0,4})\W?\d+', model)
    if prefix_match:
        keywords.append(prefix_match.group(1))

    # Only the first digit run is used, even when the model contains several.
    number_match = re.search(r'(\d+)', model)
    if number_match:
        keywords.append(number_match.group(1))
    else:
        keywords.extend(re.findall(r'[A-Za-z]+', model))

    trailing = re.findall(
        r'\d+([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?',
        model)
    if trailing:
        keywords.extend(trailing[0])
        # Each marker is appended whole; ``list += str`` would have split it
        # into single characters.
        # NOTE(review): the original indentation was lost; markers are assumed
        # to be added only when the model contains a digit run.
        keywords.extend(m for m in _PRODUCT_EDITION_MARKERS if m in model)
    return {keyword for keyword in keywords if keyword}


def _product_overlap_score(shared, size, a):
    """Return ``shared / (size + a / size)``, or 0.0 when nothing is shared."""
    if not shared:
        return 0.0
    return shared / (size + a / size)


def product_washing(filepath, thre=1, a=0):
    """Find likely duplicate products per brand and export them side by side.

    Reads the workbook at ``filepath``, converts every column to text and, for
    each brand in '产品品牌' (except '无参数,需补充'), compares the keyword sets
    of all pairs of '产品型号' values. A pair is reported when the share of
    common keywords reaches ``thre`` for either model and both or neither
    model end with '+'. The pairs are written to './product_filter.xlsx',
    two rows per pair followed by an empty row.

    :param filepath: path of the Excel workbook to analyse.
    :param thre: minimum keyword-overlap score for a pair to be reported.
    :param a: weight adjustment; the score is ``shared / (n + a / n)`` where
        ``n`` is the number of keywords of the model.
    """
    from itertools import combinations

    df = pd.read_excel(filepath, converters={'产品编码': str})
    df = df.drop(columns='Unnamed: 0', errors='ignore')
    df = df.astype(str)

    related_product = []
    for brand, group in df.groupby(by='产品品牌'):
        if brand == '无参数,需补充':
            continue

        # Row index -> (model text, keyword set) for every usable row.
        extracted = {}
        for row_index, model in group['产品型号'].items():
            if model == '无参数,需补充':
                continue
            extracted[row_index] = (model, _product_model_keywords(model))

        # All rows here share one brand, so only the model keywords need to be
        # compared; combinations() visits every unordered pair exactly once.
        for (index_a, (model_a, words_a)), (index_b, (model_b, words_b)) in combinations(
                extracted.items(), 2):
            shared = len(words_a & words_b)
            # A single division avoids the rounding drift of adding 1/x
            # repeatedly, which could fall just short of ``thre``.
            score_a = _product_overlap_score(shared, len(words_a), a)
            score_b = _product_overlap_score(shared, len(words_b), a)
            if score_a < thre and score_b < thre:
                continue
            if model_a.endswith('+') == model_b.endswith('+'):
                related_product.append((index_a, index_b))

    # Export: duplicate candidates in adjacent rows, one blank row between pairs.
    with xlsxwriter.Workbook('./product_filter.xlsx') as workbook:
        bold_format = workbook.add_format({'bold': True})
        worksheet = workbook.add_worksheet()

        header = ['Index', *df.columns, '正确产品编号']
        for col, title in enumerate(header):
            worksheet.write_string(0, col, title, bold_format)

        row = 1
        for pair in related_product:
            for row_index in pair:
                worksheet.write_string(row, 0, str(row_index))
                for col, value in enumerate(df.loc[row_index].values, start=1):
                    worksheet.write_string(row, col, value)
                row += 1
            row += 1
if __name__ == '__main__':
    import sys

    # An optional first command-line argument overrides the default workbook.
    filepath = sys.argv[1] if len(sys.argv) > 1 else "E:\\ZDZC\\激光打印机参数确认.xlsx"
    product_washing(filepath)
#!usr/bin/env python
# -*- coding:utf-8 -*-
import pandas as pd
import numpy as np
import re
import xlsxwriter
# Cell values that mean "no data" in the brand / model columns.
_BRAND_PLACEHOLDERS = ['暂无数据', '无参数,需补充']
# Markers treated as one extra keyword each when they occur in a model name.
_BRAND_EDITION_MARKERS = ('升级版', '专业版', '教育版', '+')


def _brand_model_keywords(model):
    """Return the frozenset of non-empty keywords extracted from a model name.

    Keywords are: up to four letters in front of the first digit run, the
    first digit run itself (or, when the model has no digits, every run of
    ASCII letters), the letter/digit runs after the first digit run, and any
    edition marker contained in the model.
    """
    keywords = []
    prefix_match = re.search(r'([A-Za-z]{0,4})\W?\d+', model)
    if prefix_match:
        keywords.append(prefix_match.group(1))

    # Only the first digit run is used, even when the model contains several.
    number_match = re.search(r'(\d+)', model)
    if number_match:
        keywords.append(number_match.group(1))
    else:
        keywords.extend(re.findall(r'[A-Za-z]+', model))

    trailing = re.findall(
        r'\d+([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?',
        model)
    if trailing:
        keywords.extend(trailing[0])
        # Each marker is appended whole; ``list += str`` would have split it
        # into single characters.
        # NOTE(review): the original indentation was lost; markers are assumed
        # to be added only when the model contains a digit run.
        keywords.extend(m for m in _BRAND_EDITION_MARKERS if m in model)
    return frozenset(keyword for keyword in keywords if keyword)


def _brand_name_keywords(brand):
    """Return the keywords of a brand name: Chinese runs, then English words.

    The name is upper-cased first and the run '新建品牌' is discarded.
    """
    upper = brand.upper()
    chinese = [run for run in re.findall(r'([\u4e00-\u9fa5]+)', upper) if run != '新建品牌']
    english = re.findall(r'[A-Za-z]+\W?[A-Za-z]+', upper)
    return chinese + english


def _brand_overlap_score(hits, size, a):
    """Return ``hits / (size + a / size)``, or 0.0 when there are no hits."""
    if not hits:
        return 0.0
    return hits / (size + a / size)


def _write_brand_pair_details(worksheet, start_row, records, detail_start):
    """Write one three-row band per record and return the next free row.

    The two brand names go into column 0 of the band's first two rows; every
    two-item detail in ``record[detail_start:]`` fills the next column of the
    same two rows. The third row of each band stays empty.
    """
    row = start_row
    for record in records:
        worksheet.write_string(row, 0, record[0])
        worksheet.write_string(row + 1, 0, record[1])
        for col, (first, second) in enumerate(record[detail_start:], start=1):
            worksheet.write_string(row, col, first)
            worksheet.write_string(row + 1, col, second)
        row += 3
    return row


def brand_washing(filepath, thre=0.5, inner_thre=0.5, a=1, sheet_name=0):
    """Detect brands that look like duplicates of each other.

    Three heuristics are applied to the rows whose '产品品牌' and '产品型号'
    are both present:

    1. model overlap - two brands are related when enough of the first
       brand's models share keywords with a model of the second brand;
    2. name overlap - two brands are related when their names share a keyword;
    3. name in model - a brand is related to another brand when one of its
       name keywords occurs inside a model name of that other brand.

    Results are written to './brand_filter.xlsx': 'Sheet1' lists the brand
    pairs of the three heuristics side by side, 'Sheet2' lists the pairs of
    heuristics 1 and 3 together with the matching keywords / models.

    :param filepath: path of the Excel workbook to analyse.
    :param thre: model-overlap score two brands must reach (heuristic 1).
    :param inner_thre: keyword-overlap score two models must reach to count
        as the same model (heuristic 1).
    :param a: weight adjustment used in every score, ``hits / (n + a / n)``.
    :param sheet_name: sheet to read, forwarded to ``pandas.read_excel``.
    """
    from itertools import combinations

    df = pd.read_excel(filepath, sheet_name=sheet_name, converters={'产品编码': str})

    # Keep only rows where both brand and model hold real data. (The original
    # compared the two validity masks with ``==``, which also kept rows where
    # both values were missing.)
    usable = pd.Series(True, index=df.index)
    for column in ('产品品牌', '产品型号'):
        usable &= df[column].notnull() & ~df[column].isin(_BRAND_PLACEHOLDERS)
    result = df.loc[usable].astype(str)

    # Per brand: the distinct normalised model names, and the distinct keyword
    # sets extracted from them (first-seen order is preserved).
    standard_library = {}
    type_kw = {}
    for brand, group in result.groupby('产品品牌'):
        models = list(pd.unique(group['产品型号'].str.upper().str.strip()))
        type_kw[brand] = models
        versions = []
        seen = set()
        for model in models:
            keywords = _brand_model_keywords(model)
            if keywords and keywords not in seen:
                seen.add(keywords)
                versions.append(keywords)
        standard_library[brand] = versions

    # Heuristic 1: compare the model keyword sets of every unordered brand pair.
    related_brand1 = []
    for brand_i, brand_j in combinations(standard_library, 2):
        versions_i = standard_library[brand_i]
        versions_j = standard_library[brand_j]
        matched_i = 0
        matched_j = 0
        related_version = []
        for version_i in versions_i:
            for version_j in versions_j:
                shared = len(version_i & version_j)
                hit_i = _brand_overlap_score(shared, len(version_i), a) >= inner_thre
                hit_j = _brand_overlap_score(shared, len(version_j), a) >= inner_thre
                matched_i += hit_i
                matched_j += hit_j
                if hit_i or hit_j:
                    related_version.append(
                        [' '.join(sorted(version_i)), ' '.join(sorted(version_j))])
                    # First match wins; move on to brand_i's next model.
                    break
        accuracy_i = _brand_overlap_score(matched_i, len(versions_i), a)
        # NOTE(review): normalised by brand_i's model count exactly as the
        # original did - confirm whether brand_j's count was intended.
        accuracy_j = _brand_overlap_score(matched_j, len(versions_i), a)
        if accuracy_i >= thre or accuracy_j >= thre:
            related_brand1.append(
                [brand_i, brand_j, max(accuracy_i, accuracy_j), *related_version])

    # Heuristic 2: brand names sharing at least one keyword.
    brand_kw = {brand: _brand_name_keywords(brand) for brand in standard_library}
    related_brand2 = [
        [brand_i, brand_j]
        for brand_i, brand_j in combinations(brand_kw, 2)
        if any(word in brand_kw[brand_j] for word in brand_kw[brand_i])
    ]

    # Heuristic 3: a brand-name keyword occurring inside another brand's model.
    related_brand3 = []
    for brand_i, name_words in brand_kw.items():
        for brand_j, models in type_kw.items():
            if brand_i == brand_j:
                continue
            hits = [[word, model] for word in name_words for model in models if word in model]
            if hits:
                related_brand3.append([brand_i, brand_j, *hits])

    with xlsxwriter.Workbook('./brand_filter.xlsx') as workbook:
        bold_format = workbook.add_format({'bold': True})

        # Sheet1: three column groups (A-B, D-E, G-H), one per heuristic.
        worksheet = workbook.add_worksheet(name='Sheet1')
        headers = ['品牌A-1', '品牌B-1', '品牌-1',
                   '品牌A-2', '品牌B-2', '品牌-2',
                   '品牌A-3', '品牌B-3', '品牌-3']
        for col, title in enumerate(headers):
            worksheet.write(0, col, title, bold_format)
        for first_col, records in ((0, related_brand1), (3, related_brand2), (6, related_brand3)):
            for row, record in enumerate(records, start=1):
                worksheet.write_string(row, first_col, record[0])
                worksheet.write_string(row, first_col + 1, record[1])

        # Sheet2: heuristic 1 pairs with their matching keyword sets, followed
        # by heuristic 3 pairs with the keyword / model that matched.
        worksheet2 = workbook.add_worksheet(name='Sheet2')
        next_row = _write_brand_pair_details(worksheet2, 0, related_brand1, 3)
        _write_brand_pair_details(worksheet2, next_row, related_brand3, 2)
if __name__ == '__main__':
    import sys

    # An optional first command-line argument overrides the default workbook.
    filepath = sys.argv[1] if len(sys.argv) > 1 else 'E:\\ZDZC\\激光打印机参数确认.xlsx'
    brand_washing(filepath)
#!usr/bin/env python
# -*- coding:utf-8 -*-
"""
@author: dell
@file: class_washing.py
@time: 2021/03/26
@desc:
"""
import pandas as pd
import re
import numpy as np
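# category is the product category.
# a is a threshold: the share of all products that one brand (or parent brand) must reach; the larger a is, the more products are flagged. b works the same way but applies to value data types and parameter values.
# c_list holds the positions (0-based, by Excel column order) of the parameter columns whose values are fairly uniform; it must be a list.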
def _rows_with_rare_values(values, share):
    """Return the index labels of ``values`` whose value is rare.

    A value is rare when it occurs fewer than ``len(values) * share`` times.
    """
    counts = values.value_counts()
    rare = counts.index[counts < len(values) * share]
    return set(values.index[values.isin(rare)])


def _format_signature(text):
    """Describe which character classes occur in ``text``.

    The result concatenates, in this order, 'str' (ASCII letters), 'dig'
    (digits), 'special' (non-word characters) and 'chinese' (CJK characters)
    for every class that is present.
    """
    parts = []
    if re.search(r'[A-Za-z]+', text):
        parts.append('str')
    if re.search(r'[0-9]+', text):
        parts.append('dig')
    if re.search(r'\W+', text):
        parts.append('special')
    if re.search(r'[\u4e00-\u9fa5]+', text):
        parts.append('chinese')
    return ''.join(parts)


def class_washing(category, filepath, c_list, a=0.02, b=0.01):
    """Flag suspicious product rows and export them to './class_filter.xlsx'.

    Every row of the workbook is checked for: a rare Python value type within
    its column, a '产品类别' different from ``category``, a '产品名称' that
    contains neither ``category`` nor '高拍仪', a rare '产品父品牌' or
    '产品品牌', and - for the parameter columns at positions 7 up to (not
    including) -2 - an unusual text length (more than two standard deviations
    from the column mean) or a rare character-class format. Columns selected
    by ``c_list`` are additionally checked for rare values. Cells that are
    empty or hold '暂无数据' / '无参数,需补充' are ignored by the statistical
    checks.

    The output holds one row per flagged product: the number of checks that
    fired ('计数'), one 0/1 column per check, and the original product data,
    sorted by '计数' in descending order.

    :param category: expected product category, e.g. '激光打印机'.
    :param filepath: path of the Excel workbook to analyse.
    :param c_list: list of column positions (0-based, negative values count
        from the end) whose values are expected to be fairly uniform.
        NOTE(review): positions are applied after the 'Unnamed: 0' column has
        been dropped - confirm against the workbook layout.
    :param a: share of the valid rows below which a brand or parent brand
        counts as rare.
    :param b: same as ``a`` for value types, formats and ``c_list`` values.
    """
    from collections import Counter

    df = pd.read_excel(filepath, converters={'产品编码': str})
    df = df.drop(columns='Unnamed: 0', errors='ignore')

    # Decide which cells hold real data BEFORE converting to text: afterwards
    # an empty cell is the string 'nan' and notnull() can no longer see it.
    usable = df.notnull() & ~df.isin(['暂无数据', '无参数,需补充'])

    # Rows whose Python value type is rare within its column.
    dtype_minor = set()
    for col in df.columns:
        values = df.loc[usable[col], col]
        kinds = [type(value) for value in values]
        tally = Counter(kinds)
        rare_kinds = {kind for kind, count in tally.items() if count < len(values) * b}
        dtype_minor.update(
            label for label, kind in zip(values.index, kinds) if kind in rare_kinds)

    # From here on every cell is handled as text.
    df = df.astype(str)

    # Wrong category, and names mentioning neither the category nor '高拍仪'.
    wrong_class = set(df.index[df['产品类别'] != category])
    names = df['产品名称']
    mentions_category = names.str.contains(category, regex=False)
    mentions_doc_camera = names.str.contains('高拍仪', regex=False)
    not_in_name2 = set(df.index[~mentions_category & ~mentions_doc_camera])

    # Parent brands and brands that occur in less than share ``a`` of the rows.
    father_brand_minor = _rows_with_rare_values(
        df.loc[usable['产品父品牌'], '产品父品牌'], a)
    brand_minor = _rows_with_rare_values(df.loc[usable['产品品牌'], '产品品牌'], a)

    # Parameter columns: position 7 is the first parameter column; the last
    # two columns (warranty period and product model) are excluded.
    length_minor = set()
    format_minor = set()
    for col in df.columns[7:-2]:
        values = df.loc[usable[col], col]
        if values.empty:
            continue
        lengths = values.str.len().to_numpy()
        mean = lengths.mean()
        std = lengths.std()
        outlier = (lengths < mean - 2 * std) | (lengths > mean + 2 * std)
        length_minor.update(values.index[outlier])
        format_minor.update(_rows_with_rare_values(values.map(_format_signature), b))

    # Uniform parameter columns: rows holding a value rarer than share ``b``.
    character_minor_dict = {}
    for col_i in df.columns[c_list]:
        character_minor_dict[col_i] = _rows_with_rare_values(df.loc[usable[col_i], col_i], b)

    # One 0/1 column per check, in the order of the exported sheet.
    flags = {
        '产品类型异常': wrong_class,
        '产品名称异常': not_in_name2,
        '父品牌异常': father_brand_minor,
        '品牌异常': brand_minor,
        '数据类型异常': dtype_minor,
        '数据格式异常': format_minor,
        '数据长度异常': length_minor,
    }
    for col_i, rows in character_minor_dict.items():
        flags[col_i + '异常'] = rows

    # Every row flagged by at least one check (wrong_class included).
    suspects = sorted(set().union(*flags.values()))
    final_df = pd.DataFrame(
        {name: [float(label in rows) for label in suspects] for name, rows in flags.items()},
        index=pd.Index(suspects, dtype=df.index.dtype),
    )
    final_df.insert(0, '计数', final_df.sum(axis=1))

    final_df = pd.merge(final_df, df, how='left', left_index=True, right_index=True)
    final_df = final_df.sort_values(by='计数', ascending=False, kind='stable')
    final_df.to_excel("./class_filter.xlsx")
if __name__ == '__main__':
    import sys

    category = '激光打印机'
    # An optional first command-line argument overrides the default workbook.
    filepath = sys.argv[1] if len(sys.argv) > 1 else "E:\\ZDZC\\激光打印机参数确认.xlsx"
    c_list = [6, 7, -4, -3]
    class_washing(category, filepath, c_list)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment