Commit d54af336 authored by Jialin

代码修改

parent a5316846
......@@ -3,7 +3,6 @@
import pandas as pd
import numpy as np
import re
import xlsxwriter
......@@ -11,10 +10,14 @@ import xlsxwriter
def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
# filepath:文件路径,thre为两个品牌下型号重合率阈值,inner_thre为两个品牌下某条型号内关键词重合率阈值,a为权重调整,sheet_name为表单名
df = pd.read_excel(filepath, sheet_name=sheet_name, converters = {'产品编码':str})
df_null = pd.read_excel(".\\异常数据表格.xlsx")
invalid_list = df_null['异常数据名称'].values
# 处理缺失值
col1=(df['产品品牌'] != '暂无数据') == ((df['产品品牌'] != '无参数,需补充') == (df['产品品牌'].notnull()))
col2=(df['产品型号'] != '暂无数据') == ((df['产品型号'] != '无参数,需补充') == (df['产品型号'].notnull()))
result = df.loc[df.index[col1==col2]]
valid_index=[]
for i in df.index:
if df.loc[i,'产品品牌'] not in invalid_list and df.loc[i,'产品型号'] not in invalid_list:
valid_index.append(i)
result = df.loc[valid_index]
# 将df数据格式转为字符串
for i in result.columns:
result[i] = result[i].astype(str)
......@@ -152,7 +155,7 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
tempo_list.extend(word_list)
related_brand3.append(tempo_list)
# 写入excel
#写入excel
workbook = xlsxwriter.Workbook('./brand_filter.xlsx')
bold_format = workbook.add_format({'bold': True})
......@@ -162,11 +165,6 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
worksheet.write('B1', '品牌B', bold_format)
worksheet.write('C1', '正确品牌', bold_format)
worksheet.write('D1', '方法', bold_format)
# worksheet.write('E1', '品牌B-2', bold_format)
# worksheet.write('F1', '品牌-2', bold_format)
# worksheet.write('G1', '品牌A-3', bold_format)
# worksheet.write('H1', '品牌B-3', bold_format)
# worksheet.write('I1', '品牌-3', bold_format)
col = 0
row = 1
for list_i in related_brand1:
......@@ -238,9 +236,6 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
row -= 2
col += 1
row += 3
workbook.close()
if __name__ == '__main__':
......
......@@ -16,14 +16,19 @@ import numpy as np
def class_washing(category, filepath, c_list,a=0.02, b=0.01):
df_null=pd.read_excel(".\\异常数据表格.xlsx")
invalid_list = df_null['异常数据名称'].values
df = pd.read_excel(filepath,converters = {'产品编码':str})
df.drop(columns='Unnamed: 0', axis=1, inplace=True)
# 检测每列数据中 同一数据类型的产品数量少于产品总数量的b 的产品index.(可优化成字典形式存储读取)
dtype_minor_dict = {}
for col in df.columns:
type_list = {}
valid_index = df.index[(df[col] != '暂无数据') == ((df[col] != '无参数,需补充') == (df[col].notnull()))]
for i in valid_index:
valid_index = []
for i in df.index:
if df.loc[i,col] in invalid_list:
continue
valid_index.append(i)
data_type = type(df.loc[i, col])
if data_type not in type_list:
type_list[data_type] = 1
......@@ -58,7 +63,12 @@ def class_washing(category, filepath, c_list,a=0.02, b=0.01):
father_brand_minor = []
father_brand_list = []
col='产品父品牌'
valid_df=df.loc[df.index[(df[col] != '暂无数据') == ((df[col] != '无参数,需补充') == (df[col].notnull()))]]
valid_index=[]
for i in df.index:
if df.loc[i, col] in invalid_list:
continue
valid_index.append(i)
valid_df=df.loc[valid_index]
father_brand_num = valid_df.groupby(by='产品父品牌')['产品编码'].count() # 之所以用产品编码来计数是因为产品编码肯定不会有缺失值
father_num_list = [x for x in father_brand_num.unique() if x < len(valid_df.index)*a]
for i in father_brand_num.index: # i 就是产品父品牌
......@@ -73,7 +83,12 @@ def class_washing(category, filepath, c_list,a=0.02, b=0.01):
brand_minor = []
brand_list = []
col = '产品品牌'
valid_df = df.loc[df.index[(df[col] != '暂无数据') == ((df[col] != '无参数,需补充') == (df[col].notnull()))]]
valid_index = []
for i in df.index:
if df.loc[i, col] in invalid_list:
continue
valid_index.append(i)
valid_df=df.loc[valid_index]
brand_num = valid_df.groupby(by='产品品牌')['产品编码'].count() # 同上
num_list = [x for x in brand_num.unique() if x < len(valid_df.index)*a]
for i in brand_num.index:
......@@ -88,9 +103,13 @@ def class_washing(category, filepath, c_list,a=0.02, b=0.01):
length_minor_dict = {}
for col in df.columns[7:-2]:
col_length = []
valid_index = df.index[(df[col] != '暂无数据') == ((df[col] != '无参数,需补充') == (df[col].notnull()))]
for i in valid_index:
valid_index=[]
for i in df.index:
if df.loc[i, col] in invalid_list:
continue
valid_index.append(i)
col_length.append(len(df.loc[i, col]))
if col_length:
std = np.array(col_length).std()
mean = np.array(col_length).mean()
for counter, length in enumerate(col_length):
......@@ -107,7 +126,11 @@ def class_washing(category, filepath, c_list,a=0.02, b=0.01):
format_minor_dict = {}
for col in df.columns[7:-2]:
counter_dict = {}
valid_index = df.index[(df[col] != '暂无数据') == ((df[col] != '无参数,需补充') == (df[col].notnull()))]
valid_index = []
for i in df.index:
if df.loc[i, col] in invalid_list:
continue
valid_index.append(i)
for i in valid_index:
counter_list = []
k = df.loc[i, col]
......@@ -161,7 +184,12 @@ def class_washing(category, filepath, c_list,a=0.02, b=0.01):
for col_i in df.columns[c_list]:
tempo_list = []
tempo_list2 = []
valid_df=df.loc[df.index[(df[col_i] != '暂无数据') == ((df[col_i] != '无参数,需补充') == (df[col_i].notnull()))]]
valid_index = []
for i in df.index:
if df.loc[i, col_i] in invalid_list:
continue
valid_index.append(i)
valid_df = df.loc[valid_index]
cha_num = valid_df.groupby(by=col_i)['产品编码'].count()
num_list = [x for x in cha_num.unique() if x < len(valid_df.index)*b]
for i in cha_num.index:
......
......@@ -13,6 +13,8 @@ import xlsxwriter
def product_washing(filepath, thre=1, a=0):
df_null = pd.read_excel(".\\异常数据表格.xlsx")
invalid_list = df_null['异常数据名称'].values
df=pd.read_excel(filepath, converters={'产品编码':str})
df.drop(columns='Unnamed: 0', axis=1, inplace=True)
for col in df.columns:
......@@ -21,13 +23,13 @@ def product_washing(filepath, thre=1, a=0):
related_product = []
brand_grouped = df.groupby(by='产品品牌')
for brand in brand_grouped:
if brand[0]=='无参数,需补充':
if brand[0] in invalid_list:
continue
tempo_dict = {}
# 每个品牌提取产品型号关键字,放入tempo_dict
for i in brand[1].index:
k = brand[1].loc[i, '产品型号']
if k=='无参数,需补充':
if k in invalid_list:
continue
pre_num = re.search(r'([A-Za-z]{0,4})\W?\d+', k) # pre_num为数字前的关键字
if pre_num:
......@@ -56,6 +58,8 @@ def product_washing(filepath, thre=1, a=0):
pos_num += '+'
combined = pre_num + num + pos_num # 将关键字列表合并
while '' in combined:
combined.remove('')
# pre_num = re.findall(r'^([A-Za-z]{0,4})\W?\d+', k) # pre_num为数字前的关键字
# if not pre_num:
# pre_num = re.findall(r'[\u4e00-\u9fa5]+([A-Za-z]{0,4})\W?\d+', k)
......@@ -74,30 +78,29 @@ def product_washing(filepath, thre=1, a=0):
#
# combined = pre_num + num + pos_num # 将关键字列表合并
# 提取品牌名关键字
temp_list1 = re.findall(r'([\u4e00-\u9fa5]+)', brand[0].upper()) # 提取汉字
while '新建品牌' in temp_list1:
temp_list1.remove('新建品牌') # 去除‘新建品牌’
temp_list2 = re.findall(r'[A-Za-z]+\W?[A-Za-z]+', brand[0].upper()) # 提取英文单词
brand_combined = temp_list1+temp_list2
# # 提取品牌名关键字
# temp_list1 = re.findall(r'([\u4e00-\u9fa5]+)', brand[0].upper()) # 提取汉字
# while '新建品牌' in temp_list1:
# temp_list1.remove('新建品牌') # 去除‘新建品牌’
# temp_list2 = re.findall(r'[A-Za-z]+\W?[A-Za-z]+', brand[0].upper()) # 提取英文单词
# brand_combined = temp_list1+temp_list2
while '' in combined:
combined.remove('')
tempo_dict[i] = [set(brand_combined), set(combined)]
tempo_dict[i] = [set(combined), brand[1].loc[i, '*质保时间'], brand[1].loc[i, '标配外服务及配件']]
# 对比产品型号关键字,相同则放入related_product
tested_product=[]
for i in tempo_dict:
for j in tempo_dict:
if i != j and set([i,j]) not in tested_product:
if tempo_dict[i][0]==tempo_dict[j][0]:
if tempo_dict[i][1:]==tempo_dict[j][1:]:
accuracy_i=0
accuracy_j=0
for word_i in tempo_dict[i][1]:
if word_i in tempo_dict[j][1]:
accuracy_i += 1/(len(tempo_dict[i][1]) + a/len(tempo_dict[i][1]))
accuracy_j += 1/(len(tempo_dict[j][1]) + a/len(tempo_dict[j][1]))
for word_i in tempo_dict[i][0]:
if word_i in tempo_dict[j][0]:
accuracy_i += 1/(len(tempo_dict[i][0]) + a/len(tempo_dict[i][0]))
accuracy_j += 1/(len(tempo_dict[j][0]) + a/len(tempo_dict[j][0]))
if accuracy_i >= thre or accuracy_j >= thre:
if not (df.loc[i,'产品型号'].endswith('+') ^ df.loc[j,'产品型号'].endswith('+')):
related_product.append(set([i,j]))
......
......@@ -33,8 +33,8 @@ def pachong_washing(filepath):
if num:
num = num.group(1) # 如果连续数字超过1处,我们只提取第一处,这个可能有点问题但目前还不知道怎么办
if not num: # 如果没有数字,就比较英文单词
num=re.findall(r'[A-Za-z]+', k)
num=''.join(num)
alpha=re.findall(r'[A-Za-z]+', k)
alpha=''.join(alpha)
pos_num = re.findall(r'\d+([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?',
k) # pos_num为数字后的关键字
......@@ -55,8 +55,8 @@ def pachong_washing(filepath):
comparing_df.loc[i, '爬取数据pre_num'] = pre_num
if num:
comparing_df.loc[i, '爬取数据num'] = num
# if not num:
# comparing_df.loc[i, '爬取数据alpha'] = alpha
if not num:
comparing_df.loc[i, '爬取数据alpha'] = alpha
comparing_df.loc[i, '爬取数据pos_num'] = pos_num
for i in df.index:
......@@ -73,8 +73,8 @@ def pachong_washing(filepath):
if num:
num = num.group(1) # 如果连续数字超过1处,我们只提取第一处,这个可能有点问题但目前还不知道怎么办
if not num: # 如果没有数字,就比较英文单词
num = re.findall(r'[A-Za-z]+', k)
num = ''.join(num)
alpha = re.findall(r'[A-Za-z]+', k)
alpha = ''.join(alpha)
pos_num = re.findall(
r'\d+([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?',
......@@ -106,10 +106,17 @@ def pachong_washing(filepath):
df.loc[i,col] = '暂无数据'
continue
if comparing_df.loc[i,'补充型号num'] != comparing_df.loc[i,'爬取数据num']:
if comparing_df.loc[i,'补充型号num'] != comparing_df.loc[i,'爬取数据num']: # 如果没有num,此处为nan, nan!=nan所以没问题
if type(comparing_df.loc[i,'补充型号num']) != float:
for col in df.columns[15:21]:
df.loc[i, col] = '暂无数据'
continue
else:
if comparing_df.loc[i, '补充型号alpha'] not in comparing_df.loc[i, '爬取数据alpha']:
for col in df.columns[15:21]:
df.loc[i, col] = '暂无数据'
continue
if comparing_df.loc[i,'补充型号pos_num'] != comparing_df.loc[i,'爬取数据pos_num']:
......@@ -117,7 +124,7 @@ def pachong_washing(filepath):
df.loc[i, col] = '暂无数据'
continue
df.to_excel('./after_lijie.xlsx')
df.to_excel('./爬取数据分析.xlsx')
if __name__ == '__main__':
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment