Commit 5f7f3949 authored by Jialin

代码更新

parent 62dacfbd
...@@ -33,6 +33,7 @@ def product_washing(filepath, category, thre=1, a=0): ...@@ -33,6 +33,7 @@ def product_washing(filepath, category, thre=1, a=0):
for param in other_parameters_fetch: for param in other_parameters_fetch:
other_parameters.append(param[0]) other_parameters.append(param[0])
point_category_list = ['台式机', '笔记本', '一体电脑', '复印纸']
related_product = [] related_product = []
brand_grouped = df.groupby(by='产品品牌') brand_grouped = df.groupby(by='产品品牌')
for brand in brand_grouped: for brand in brand_grouped:
...@@ -41,38 +42,47 @@ def product_washing(filepath, category, thre=1, a=0): ...@@ -41,38 +42,47 @@ def product_washing(filepath, category, thre=1, a=0):
tempo_dict = {} tempo_dict = {}
# 每个品牌提取产品型号关键字,放入tempo_dict # 每个品牌提取产品型号关键字,放入tempo_dict
for i in brand[1].index: for i in brand[1].index:
k = brand[1].loc[i, '*产品型号'] if category in point_category_list:
if k in invalid_list: if category == '复印纸':
continue k = brand[1].loc[i, '*产品系列']
pre_num = re.search(r'([A-Za-z]{0,4})\W?\d+', k) # pre_num为数字前的关键字 else:
if pre_num: k = brand[1].loc[i, '*产品子系列']
pre_num = [pre_num.group(1)] k.replace(' ','').replace('系列','').replace('_','').replace('-','').upper()
if not pre_num: tempo_dict[i] = [k]
pre_num = []
else:
num = re.search(r'(\d+)', k) # num为数字关键字 k = brand[1].loc[i, '*产品型号']
if num: if k in invalid_list:
num = [num.group(1)] # 如果连续数字超过1处,我们只提取第一处,这个可能有点问题但目前还不知道怎么办 continue
if not num: # 如果没有数字,就比较英文单词 pre_num = re.search(r'([A-Za-z]{0,4})\W?\d+', k) # pre_num为数字前的关键字
num = re.findall(r'[A-Za-z]+', k) if pre_num:
pre_num = [pre_num.group(1)]
pos_num = re.findall( if not pre_num:
r'\d+([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?', pre_num = []
k) # pos_num为数字后的关键字
if pos_num: num = re.search(r'(\d+)', k) # num为数字关键字
pos_num = list(pos_num[0]) if num:
if '升级版' in k: num = [num.group(1)] # 如果连续数字超过1处,我们只提取第一处,这个可能有点问题但目前还不知道怎么办
pos_num += '升级版' if not num: # 如果没有数字,就比较英文单词
if '专业版' in k: num = re.findall(r'[A-Za-z]+', k)
pos_num += '专业版'
if '教育版' in k: pos_num = re.findall(
pos_num += '教育版' r'\d+([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?',
if '+' in k: k) # pos_num为数字后的关键字
pos_num += '+' if pos_num:
pos_num = list(pos_num[0])
combined = pre_num + num + pos_num # 将关键字列表合并 if '升级版' in k:
while '' in combined: pos_num += '升级版'
combined.remove('') if '专业版' in k:
pos_num += '专业版'
if '教育版' in k:
pos_num += '教育版'
if '+' in k:
pos_num += '+'
combined = pre_num + num + pos_num # 将关键字列表合并
while '' in combined:
combined.remove('')
# pre_num = re.findall(r'^([A-Za-z]{0,4})\W?\d+', k) # pre_num为数字前的关键字 # pre_num = re.findall(r'^([A-Za-z]{0,4})\W?\d+', k) # pre_num为数字前的关键字
# if not pre_num: # if not pre_num:
# pre_num = re.findall(r'[\u4e00-\u9fa5]+([A-Za-z]{0,4})\W?\d+', k) # pre_num = re.findall(r'[\u4e00-\u9fa5]+([A-Za-z]{0,4})\W?\d+', k)
...@@ -98,7 +108,7 @@ def product_washing(filepath, category, thre=1, a=0): ...@@ -98,7 +108,7 @@ def product_washing(filepath, category, thre=1, a=0):
# temp_list2 = re.findall(r'[A-Za-z]+\W?[A-Za-z]+', brand[0].upper()) # 提取英文单词 # temp_list2 = re.findall(r'[A-Za-z]+\W?[A-Za-z]+', brand[0].upper()) # 提取英文单词
# brand_combined = temp_list1+temp_list2 # brand_combined = temp_list1+temp_list2
tempo_dict[i] = [set(combined)] tempo_dict[i] = [set(combined)]
other_parameters_values=[] other_parameters_values=[]
for parameter in other_parameters: for parameter in other_parameters:
...@@ -119,16 +129,19 @@ def product_washing(filepath, category, thre=1, a=0): ...@@ -119,16 +129,19 @@ def product_washing(filepath, category, thre=1, a=0):
if count != 0: if count != 0:
tested_product.append(set([i, j])) tested_product.append(set([i, j]))
break # 如果出现了必须相等但不相等的参数,则退出到上一层循环。如果没有出现,则对比型号 break # 如果出现了必须相等但不相等的参数,则退出到上一层循环。如果没有出现,则对比型号
if category in point_category_list:
accuracy_i=0 if tempo_dict[i][0] == tempo_dict[j][0]:
accuracy_j=0 related_product.append(set([i, j]))
for word_i in tempo_dict[i][0]: else:
if word_i in tempo_dict[j][0]: accuracy_i=0
accuracy_i += 1/(len(tempo_dict[i][0]) + a/len(tempo_dict[i][0])) accuracy_j=0
accuracy_j += 1/(len(tempo_dict[j][0]) + a/len(tempo_dict[j][0])) for word_i in tempo_dict[i][0]:
if accuracy_i >= thre or accuracy_j >= thre: if word_i in tempo_dict[j][0]:
if not (df.loc[i,'*产品型号'].endswith('+') ^ df.loc[j,'*产品型号'].endswith('+')): accuracy_i += 1/(len(tempo_dict[i][0]) + a/len(tempo_dict[i][0]))
related_product.append(set([i,j])) accuracy_j += 1/(len(tempo_dict[j][0]) + a/len(tempo_dict[j][0]))
if accuracy_i >= thre or accuracy_j >= thre:
if not (df.loc[i,'*产品型号'].endswith('+') ^ df.loc[j,'*产品型号'].endswith('+')):
related_product.append(set([i,j]))
tested_product.append(set([i,j])) tested_product.append(set([i,j]))
# a = set([i]) # a = set([i])
...@@ -179,6 +192,6 @@ def product_washing(filepath, category, thre=1, a=0): ...@@ -179,6 +192,6 @@ def product_washing(filepath, category, thre=1, a=0):
if __name__ == '__main__': if __name__ == '__main__':
filepath = "E:\\ZDZC\\激光打印机参数确认.xlsx" filepath = "E:\\ZDZC\\Sourcetree_local\\公共代码\\一体电脑参数确认.xlsx"
category = '激光打印机' category = '一体电脑'
product_washing(filepath,category) product_washing(filepath,category)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment