Commit e6658a7a authored by Jialin's avatar Jialin

李佳林

parent ed7d80ba
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
@author: dell
@file: after_lijie.py
@time: 2021/03/31
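@desc: Compare keywords of the crawled product name with the completed model number and blank out crawled columns on mismatch.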
"""
import pandas as pd
import numpy as np
import re
import xlsxwriter
def _model_keywords(text):
    """Split a model/name string into the three keys used for comparison.

    Returns a ``(pre_num, num, pos_num)`` tuple of strings:

    * ``pre_num`` - up to four ASCII letters directly in front of the first
      digit run that the prefix pattern can match ('' if none).
    * ``num`` - the first run of digits; when the text has no digit at all,
      every run of ASCII letters joined together ('' if there are none).
    * ``pos_num`` - the letter/digit groups following the first digit run,
      joined together, followed by any edition markers found in the text.

    All three are always ``str`` so that two empty keys compare equal.
    """
    prefix = re.search(r'([A-Za-z]{0,4})\W?\d+', text)
    pre_num = prefix.group(1) if prefix else ''

    digits = re.search(r'(\d+)', text)
    if digits:
        # Only the first digit run is used, even if the text has several.
        num = digits.group(1)
    else:
        # No digits at all: fall back to comparing the English words.
        num = ''.join(re.findall(r'[A-Za-z]+', text))

    tail = re.findall(
        r'\d+([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?'
        r'\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?',
        text)
    pos_num = ''.join(tail[0]) if tail else ''
    # Edition markers distinguish otherwise identical model numbers.
    for marker in ('升级版', '专业版', '教育版', '+'):
        if marker in text:
            pos_num += marker
    return pre_num, num, pos_num


def pachong_washing(filepath):
    """Blank out crawled data whose product name does not match the model.

    Reads the workbook at ``filepath``, upper-cases the '补充后型号' and
    '爬取名称' columns, extracts comparison keys from both and, for every row
    where the keys disagree, overwrites the columns at positions 15-20 with
    '暂无数据'. The result is written to './after_lijie.xlsx'.

    A row is a mismatch when the letter prefixes are both non-empty and
    differ, or the number keys differ, or the suffix keys differ.

    Args:
        filepath: path of the Excel workbook to clean.
    """
    df = pd.read_excel(filepath, converters={'产品编码': str})

    keys = {}
    for col in ['补充后型号', '爬取名称']:
        df[col] = df[col].astype(str).str.upper()
        keys[col] = [_model_keywords(text) for text in df[col]]

    mismatch = []
    for model_key, crawled_key in zip(keys['补充后型号'], keys['爬取名称']):
        model_pre, model_num, model_pos = model_key
        crawled_pre, crawled_num, crawled_pos = crawled_key
        # A missing prefix on either side is not treated as a difference.
        prefix_differs = bool(
            model_pre and crawled_pre and model_pre != crawled_pre)
        mismatch.append(
            prefix_differs
            or model_num != crawled_num
            or model_pos != crawled_pos)
    mismatch = np.array(mismatch, dtype=bool)

    if mismatch.any():
        for col in df.columns[15:21]:
            # Cast first so the text marker can be stored in numeric columns.
            df[col] = df[col].astype(object)
            df.loc[mismatch, col] = '暂无数据'

    df.to_excel('./after_lijie.xlsx')
if __name__ == '__main__':
    # Script entry point: clean the crawler workbook at the fixed local path.
    filepath = "E:\\ZDZC\\扫描仪参数确认(爬虫).xlsx"
    pachong_washing(filepath)
\ No newline at end of file
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
@author: dell
@file: product_filter.py
@time: 2021/03/29
@desc:
"""
import pandas as pd
import re
import numpy as np
import xlsxwriter
def _product_keywords(model):
    """Return the set of non-empty keywords extracted from a model string.

    The keywords are: up to four ASCII letters in front of the first digit
    run, the first digit run itself (or, when there is no digit, every run
    of ASCII letters), the letter/digit groups after the first digit run,
    and each edition marker ('升级版', '专业版', '教育版', '+') present in
    the string. Each marker is kept as one whole keyword.
    """
    words = []
    prefix = re.search(r'([A-Za-z]{0,4})\W?\d+', model)
    if prefix:
        words.append(prefix.group(1))

    digits = re.search(r'(\d+)', model)
    if digits:
        # Only the first digit run is used, even if the model has several.
        words.append(digits.group(1))
    else:
        # No digits at all: compare the English words instead.
        words.extend(re.findall(r'[A-Za-z]+', model))

    tail = re.findall(
        r'\d+([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?'
        r'\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?',
        model)
    if tail:
        words.extend(tail[0])

    for marker in ('升级版', '专业版', '教育版', '+'):
        if marker in model:
            words.append(marker)
    return {word for word in words if word}


def _brand_keywords(brand):
    """Return the Chinese and English keywords of an upper-cased brand name.

    The literal '新建品牌' is dropped from the Chinese keywords.
    """
    upper = brand.upper()
    chinese = [word for word in re.findall(r'([\u4e00-\u9fa5]+)', upper)
               if word != '新建品牌']
    english = re.findall(r'[A-Za-z]+\W?[A-Za-z]+', upper)
    return set(chinese + english)


def _related_pairs(entries, thre, a):
    """Return the ``(i, j)`` label pairs whose keyword sets overlap enough.

    ``entries`` maps a row label to ``(brand_words, model_words,
    ends_with_plus)``. Each unordered pair is examined once, in insertion
    order. A pair is related when both brands match, both or neither model
    ends with '+', and for at least one side
    ``shared / (size + a / size) >= thre``.

    The score is computed with a single division; summing ``1 / size`` once
    per shared word loses precision (six additions of 1/6 give
    0.9999999999999999), which made identical sets miss ``thre == 1``.
    """
    items = list(entries.items())
    pairs = []
    for position, (label_i, (brand_i, words_i, plus_i)) in enumerate(items):
        for label_j, (brand_j, words_j, plus_j) in items[position + 1:]:
            if brand_i != brand_j or plus_i != plus_j:
                continue
            shared = len(words_i & words_j)
            # shared > 0 implies both sets are non-empty, so no zero length.
            score_i = shared / (len(words_i) + a / len(words_i)) if shared else 0
            score_j = shared / (len(words_j) + a / len(words_j)) if shared else 0
            if score_i >= thre or score_j >= thre:
                pairs.append((label_i, label_j))
    return pairs


def _write_pairs_report(df, pairs, path):
    """Write each related pair as two adjacent rows, one blank row between pairs."""
    # The context manager closes the workbook even if a write fails.
    with xlsxwriter.Workbook(path) as workbook:
        bold_format = workbook.add_format({'bold': True})
        worksheet = workbook.add_worksheet()

        headers = ['Index'] + list(df.columns) + ['正确产品编号']
        for col, title in enumerate(headers):
            worksheet.write_string(0, col, title, bold_format)

        row = 1
        for pair in pairs:
            for label in pair:
                worksheet.write_string(row, 0, str(label))
                for col, value in enumerate(df.loc[label].values, start=1):
                    worksheet.write_string(row, col, value)
                row += 1
            row += 1  # blank separator row between groups


def product_washing(filepath, thre=1, a=0):
    """Find probable duplicate products per brand and export them to Excel.

    Reads the workbook at ``filepath``, converts every column to ``str``,
    groups rows by '产品品牌' and compares the keyword sets extracted from
    '产品型号' for every pair of rows inside a brand. Related pairs are
    written to './product_filter.xlsx'. Rows whose brand or model equals
    '无参数,需补充' are skipped.

    Args:
        filepath: path of the Excel workbook to scan.
        thre: minimum overlap score for a pair to count as related.
        a: damping term in the score ``shared / (size + a / size)``.
    """
    df = pd.read_excel(filepath, converters={'产品编码': str})
    # The helper index column is only present in some exports.
    df = df.drop(columns='Unnamed: 0', errors='ignore')
    df = df.astype(str)

    related_product = []
    for brand, group in df.groupby(by='产品品牌'):
        if brand == '无参数,需补充':
            continue
        brand_words = _brand_keywords(brand)
        entries = {}
        for label, model in group['产品型号'].items():
            if model == '无参数,需补充':
                continue
            entries[label] = (
                brand_words, _product_keywords(model), model.endswith('+'))
        related_product.extend(_related_pairs(entries, thre, a))

    _write_pairs_report(df, related_product, './product_filter.xlsx')
if __name__ == '__main__':
    # Script entry point: scan the laser-printer workbook at the fixed local path.
    filepath = "E:\\ZDZC\\激光打印机参数确认.xlsx"
    product_washing(filepath)
This diff is collapsed.
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
@author: dell
@file: class_washing.py
@time: 2021/03/26
@desc:
"""
import pandas as pd
import re
import numpy as np
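# category: the product category that every row is expected to have.
# a: threshold, as a fraction of all products, for the number of products under one brand or parent brand;
#    the larger a is, the more products are flagged. b works the same way but is used for data types and parameter values.
# c_list: positions (0-based) in the Excel column order of the parameters whose values are fairly uniform; must be a list.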
_MISSING_MARKERS = ('暂无数据', '无参数,需补充')


def _valid_mask(column):
    """Return a boolean mask of cells that are neither null nor a placeholder."""
    mask = column.notnull()
    for marker in _MISSING_MARKERS:
        mask &= column != marker
    return mask


def _rare_rows(labels, keys, fraction):
    """Return the labels whose key occurs in fewer than ``fraction`` of rows.

    ``labels`` and ``keys`` are parallel iterables; the count threshold is
    ``len(keys) * fraction``.
    """
    keys = list(keys)
    counts = {}
    for key in keys:
        counts[key] = counts.get(key, 0) + 1
    limit = len(keys) * fraction
    return [label for label, key in zip(labels, keys) if counts[key] < limit]


def _length_outliers(labels, texts):
    """Return labels whose text length is more than 2 std away from the mean."""
    lengths = np.array([len(text) for text in texts])
    if lengths.size == 0:
        # Nothing to measure; avoids mean/std warnings on empty input.
        return []
    mean = lengths.mean()
    std = lengths.std()
    outside = (lengths < mean - 2 * std) | (lengths > mean + 2 * std)
    return [label for label, flagged in zip(labels, outside) if flagged]


def _format_signature(text):
    """Describe which character classes occur in ``text``.

    The result concatenates, in this order, 'str' (ASCII letters), 'dig'
    (digits), 'special' (non-word characters) and 'chinese' (CJK
    characters in U+4E00..U+9FA5) for each class that is present.
    """
    parts = []
    if re.search(r'[A-Za-z]+', text):
        parts.append('str')
    if re.search(r'[0-9]+', text):
        parts.append('dig')
    if re.search(r'\W+', text):
        parts.append('special')
    if re.search(r'[\u4e00-\u9fa5]+', text):
        parts.append('chinese')
    return ''.join(parts)


def class_washing(category, filepath, c_list, a=0.02, b=0.01,
                  alt_names=('高拍仪',)):
    """Flag suspicious product rows and export them to './class_filter.xlsx'.

    The workbook at ``filepath`` is checked with several heuristics; every
    row hit by at least one of them is written out with one 0/1 column per
    heuristic and a '计数' column holding the number of hits, sorted by
    that count in descending order and joined with the original data.

    Checks performed:
        * '产品类别' differs from ``category``;
        * '产品名称' contains neither ``category`` nor any of ``alt_names``;
        * the row's '产品父品牌' / '产品品牌' covers fewer than ``a`` of the
          valid rows;
        * the cell's Python type is shared by fewer than ``b`` of the valid
          cells in its column (checked before string conversion);
        * for the parameter columns ``columns[7:-2]``: text length more than
          two standard deviations from the column mean, or a character-class
          signature shared by fewer than ``b`` of the valid cells;
        * for the columns selected by ``c_list``: a value shared by fewer
          than ``b`` of the valid cells (one '<column>异常' flag each).

    Cells that are null, '暂无数据' or '无参数,需补充' are ignored by the
    per-column checks. Validity is decided on the raw data, because after
    the string conversion a null becomes the text 'nan' and would otherwise
    be treated as a real value.

    Args:
        category: expected product category.
        filepath: path of the Excel workbook to check.
        c_list: list of column positions (after the 'Unnamed: 0' column is
            dropped) whose values are expected to be fairly uniform.
        a: rarity threshold (fraction of valid rows) for brands.
        b: rarity threshold (fraction of valid cells) for types, formats
            and the ``c_list`` columns.
        alt_names: names accepted in '产品名称' in place of ``category``.
    """
    raw = pd.read_excel(filepath, converters={'产品编码': str})
    # The helper index column is only present in some exports.
    raw = raw.drop(columns='Unnamed: 0', errors='ignore')
    valid = {col: _valid_mask(raw[col]) for col in raw.columns}

    def usable(frame, col):
        """Return the cells of ``col`` that hold a real value."""
        return frame.loc[valid[col], col]

    # Type check has to look at the raw cells, before everything is text.
    dtype_minor = set()
    for col in raw.columns:
        cells = usable(raw, col)
        dtype_minor.update(
            _rare_rows(cells.index, [type(cell) for cell in cells], b))

    df = raw.astype(str)

    wrong_class = set(df.index[df['产品类别'] != category])
    not_in_name = {
        label for label, name in df['产品名称'].items()
        if category not in name
        and not any(alias in name for alias in alt_names)
    }

    father_cells = usable(df, '产品父品牌')
    father_brand_minor = set(_rare_rows(father_cells.index, father_cells, a))
    brand_cells = usable(df, '产品品牌')
    brand_minor = set(_rare_rows(brand_cells.index, brand_cells, a))

    # Column 7 is the first parameter column; the last two columns are
    # excluded from the length and format checks.
    length_minor = set()
    format_minor = set()
    for col in df.columns[7:-2]:
        cells = usable(df, col)
        length_minor.update(_length_outliers(cells.index, cells))
        format_minor.update(_rare_rows(
            cells.index, [_format_signature(text) for text in cells], b))

    checks = [
        ('产品类型异常', wrong_class),
        ('产品名称异常', not_in_name),
        ('父品牌异常', father_brand_minor),
        ('品牌异常', brand_minor),
        ('数据类型异常', dtype_minor),
        ('数据格式异常', format_minor),
        ('数据长度异常', length_minor),
    ]
    for col in df.columns[c_list]:
        cells = usable(df, col)
        checks.append((col + '异常', set(_rare_rows(cells.index, cells, b))))

    flagged = set().union(*(rows for _, rows in checks))
    suspicious = df.index[df.index.isin(list(flagged))]

    final_df = pd.DataFrame(index=suspicious)
    final_df['计数'] = [
        float(sum(label in rows for _, rows in checks))
        for label in suspicious
    ]
    for name, rows in checks:
        final_df[name] = [1.0 if label in rows else 0.0 for label in suspicious]

    final_df = pd.merge(final_df, df, how='left',
                        left_index=True, right_index=True)
    final_df = final_df.sort_values(by='计数', ascending=False)
    final_df.to_excel("./class_filter.xlsx")
if __name__ == '__main__':
    # Script entry point: check the laser-printer workbook at the fixed local path.
    category = '激光打印机'
    filepath = "E:\\ZDZC\\激光打印机参数确认.xlsx"
    # Column positions of the parameters checked for rare values.
    c_list = [6, 7, -4, -3]
    class_washing(category, filepath, c_list)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment