Commit 5f7f3949 authored by Jialin

代码更新

parent 62dacfbd
......@@ -33,6 +33,7 @@ def product_washing(filepath, category, thre=1, a=0):
for param in other_parameters_fetch:
other_parameters.append(param[0])
point_category_list = ['台式机', '笔记本', '一体电脑', '复印纸']
related_product = []
brand_grouped = df.groupby(by='产品品牌')
for brand in brand_grouped:
......@@ -41,38 +42,47 @@ def product_washing(filepath, category, thre=1, a=0):
tempo_dict = {}
# 每个品牌提取产品型号关键字,放入tempo_dict
for i in brand[1].index:
k = brand[1].loc[i, '*产品型号']
if k in invalid_list:
continue
pre_num = re.search(r'([A-Za-z]{0,4})\W?\d+', k) # pre_num为数字前的关键字
if pre_num:
pre_num = [pre_num.group(1)]
if not pre_num:
pre_num = []
num = re.search(r'(\d+)', k) # num为数字关键字
if num:
num = [num.group(1)] # 如果连续数字超过1处,我们只提取第一处,这个可能有点问题但目前还不知道怎么办
if not num: # 如果没有数字,就比较英文单词
num = re.findall(r'[A-Za-z]+', k)
pos_num = re.findall(
r'\d+([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?',
k) # pos_num为数字后的关键字
if pos_num:
pos_num = list(pos_num[0])
if '升级版' in k:
pos_num += '升级版'
if '专业版' in k:
pos_num += '专业版'
if '教育版' in k:
pos_num += '教育版'
if '+' in k:
pos_num += '+'
combined = pre_num + num + pos_num # 将关键字列表合并
while '' in combined:
combined.remove('')
if category in point_category_list:
if category == '复印纸':
k = brand[1].loc[i, '*产品系列']
else:
k = brand[1].loc[i, '*产品子系列']
k.replace(' ','').replace('系列','').replace('_','').replace('-','').upper()
tempo_dict[i] = [k]
else:
k = brand[1].loc[i, '*产品型号']
if k in invalid_list:
continue
pre_num = re.search(r'([A-Za-z]{0,4})\W?\d+', k) # pre_num为数字前的关键字
if pre_num:
pre_num = [pre_num.group(1)]
if not pre_num:
pre_num = []
num = re.search(r'(\d+)', k) # num为数字关键字
if num:
num = [num.group(1)] # 如果连续数字超过1处,我们只提取第一处,这个可能有点问题但目前还不知道怎么办
if not num: # 如果没有数字,就比较英文单词
num = re.findall(r'[A-Za-z]+', k)
pos_num = re.findall(
r'\d+([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?',
k) # pos_num为数字后的关键字
if pos_num:
pos_num = list(pos_num[0])
if '升级版' in k:
pos_num += '升级版'
if '专业版' in k:
pos_num += '专业版'
if '教育版' in k:
pos_num += '教育版'
if '+' in k:
pos_num += '+'
combined = pre_num + num + pos_num # 将关键字列表合并
while '' in combined:
combined.remove('')
# pre_num = re.findall(r'^([A-Za-z]{0,4})\W?\d+', k) # pre_num为数字前的关键字
# if not pre_num:
# pre_num = re.findall(r'[\u4e00-\u9fa5]+([A-Za-z]{0,4})\W?\d+', k)
......@@ -98,7 +108,7 @@ def product_washing(filepath, category, thre=1, a=0):
# temp_list2 = re.findall(r'[A-Za-z]+\W?[A-Za-z]+', brand[0].upper()) # 提取英文单词
# brand_combined = temp_list1+temp_list2
tempo_dict[i] = [set(combined)]
tempo_dict[i] = [set(combined)]
other_parameters_values=[]
for parameter in other_parameters:
......@@ -119,16 +129,19 @@ def product_washing(filepath, category, thre=1, a=0):
if count != 0:
tested_product.append(set([i, j]))
break # 如果出现了必须相等但不相等的参数,则退出到上一层循环。如果没有出现,则对比型号
accuracy_i=0
accuracy_j=0
for word_i in tempo_dict[i][0]:
if word_i in tempo_dict[j][0]:
accuracy_i += 1/(len(tempo_dict[i][0]) + a/len(tempo_dict[i][0]))
accuracy_j += 1/(len(tempo_dict[j][0]) + a/len(tempo_dict[j][0]))
if accuracy_i >= thre or accuracy_j >= thre:
if not (df.loc[i,'*产品型号'].endswith('+') ^ df.loc[j,'*产品型号'].endswith('+')):
related_product.append(set([i,j]))
if category in point_category_list:
if tempo_dict[i][0] == tempo_dict[j][0]:
related_product.append(set([i, j]))
else:
accuracy_i=0
accuracy_j=0
for word_i in tempo_dict[i][0]:
if word_i in tempo_dict[j][0]:
accuracy_i += 1/(len(tempo_dict[i][0]) + a/len(tempo_dict[i][0]))
accuracy_j += 1/(len(tempo_dict[j][0]) + a/len(tempo_dict[j][0]))
if accuracy_i >= thre or accuracy_j >= thre:
if not (df.loc[i,'*产品型号'].endswith('+') ^ df.loc[j,'*产品型号'].endswith('+')):
related_product.append(set([i,j]))
tested_product.append(set([i,j]))
# a = set([i])
......@@ -179,6 +192,6 @@ def product_washing(filepath, category, thre=1, a=0):
if __name__ == '__main__':
filepath = "E:\\ZDZC\\激光打印机参数确认.xlsx"
category = '激光打印机'
filepath = "E:\\ZDZC\\Sourcetree_local\\公共代码\\一体电脑参数确认.xlsx"
category = '一体电脑'
product_washing(filepath,category)
Markdown is supported
0% Try again or attach a new file
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment