代码更新

5f7f3949 · Jialin · 62dacfbd · 5f7f3949
Commit 5f7f3949 authored May 08, 2021 by Jialin
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 58 additions and 45 deletions

产品重复型号分析.py 公共代码/产品重复型号分析.py +58 -45

No files found.
--- a/公共代码/产品重复型号分析.py
+++ b/公共代码/产品重复型号分析.py
@@ -33,6 +33,7 @@ def product_washing(filepath, category, thre=1, a=0):
    for param in other_parameters_fetch:
        other_parameters.append(param[0])
+    point_category_list = ['台式机', '笔记本', '一体电脑', '复印纸']
    related_product = []
    brand_grouped = df.groupby(by='产品品牌')
    for brand in brand_grouped:
@@ -41,38 +42,47 @@ def product_washing(filepath, category, thre=1, a=0):
        tempo_dict = {}
        # 每个品牌提取产品型号关键字，放入tempo_dict
        for i in brand[1].index:
-            k = brand[1].loc[i, '*产品型号']
+            if category in point_category_list:
-            if k in invalid_list:
+                if category == '复印纸':
-                continue
+                    k = brand[1].loc[i, '*产品系列']
-            pre_num = re.search(r'([A-Za-z]{0,4})\W?\d+', k)  # pre_num为数字前的关键字
+                else:
-            if pre_num:
+                    k = brand[1].loc[i, '*产品子系列']
-                pre_num = [pre_num.group(1)]
+                k = k.replace(' ','').replace('系列','').replace('_','').replace('-','').upper()  # str methods return a new string; rebind k so the normalized key is what gets stored and compared
-            if not pre_num:
+                tempo_dict[i] = [k]
-                pre_num = []
+            else:
-            num = re.search(r'(\d+)', k)  # num为数字关键字
+                k = brand[1].loc[i, '*产品型号']
-            if num:
+                if k in invalid_list:
-                num = [num.group(1)]  # 如果连续数字超过1处，我们只提取第一处，这个可能有点问题但目前还不知道怎么办
+                    continue
-            if not num:  # 如果没有数字，就比较英文单词
+                pre_num = re.search(r'([A-Za-z]{0,4})\W?\d+', k)  # pre_num为数字前的关键字
-                num = re.findall(r'[A-Za-z]+', k)
+                if pre_num:
+                    pre_num = [pre_num.group(1)]
-            pos_num = re.findall(
+                if not pre_num:
-                r'\d+([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?',
+                    pre_num = []
-                k)  # pos_num为数字后的关键字
-            if pos_num:
+                num = re.search(r'(\d+)', k)  # num为数字关键字
-                pos_num = list(pos_num[0])
+                if num:
-                if '升级版' in k:
+                    num = [num.group(1)]  # 如果连续数字超过1处，我们只提取第一处，这个可能有点问题但目前还不知道怎么办
-                    pos_num += '升级版'
+                if not num:  # 如果没有数字，就比较英文单词
-                if '专业版' in k:
+                    num = re.findall(r'[A-Za-z]+', k)
-                    pos_num += '专业版'
-                if '教育版' in k:
+                pos_num = re.findall(
-                    pos_num += '教育版'
+                    r'\d+([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?',
-                if '+' in k:
+                    k)  # pos_num为数字后的关键字
-                    pos_num += '+'
+                if pos_num:
+                    pos_num = list(pos_num[0])
-            combined = pre_num + num + pos_num  # 将关键字列表合并
+                    if '升级版' in k:
-            while '' in combined:
+                        pos_num += '升级版'
-                combined.remove('')
+                    if '专业版' in k:
+                        pos_num += '专业版'
+                    if '教育版' in k:
+                        pos_num += '教育版'
+                    if '+' in k:
+                        pos_num += '+'
+                combined = pre_num + num + pos_num  # 将关键字列表合并
+                while '' in combined:
+                    combined.remove('')
            # pre_num = re.findall(r'^([A-Za-z]{0,4})\W?\d+', k)  # pre_num为数字前的关键字
            # if not pre_num:
            #     pre_num = re.findall(r'[\u4e00-\u9fa5]+([A-Za-z]{0,4})\W?\d+', k)
@@ -98,7 +108,7 @@ def product_washing(filepath, category, thre=1, a=0):
            # temp_list2 = re.findall(r'[A-Za-z]+\W?[A-Za-z]+', brand[0].upper())  # 提取英文单词
            # brand_combined = temp_list1+temp_list2
-            tempo_dict[i] = [set(combined)]
+                tempo_dict[i] = [set(combined)]
            other_parameters_values=[]
            for parameter in other_parameters:
@@ -119,16 +129,19 @@ def product_washing(filepath, category, thre=1, a=0):
                    if count != 0:
                        tested_product.append(set([i, j]))
                        break  # 如果出现了必须相等但不相等的参数，则退出到上一层循环。如果没有出现，则对比型号
+                    if category in point_category_list:
-                    accuracy_i=0
+                        if tempo_dict[i][0] == tempo_dict[j][0]:
-                    accuracy_j=0
+                            related_product.append(set([i, j]))
-                    for word_i in tempo_dict[i][0]:
+                    else:
-                        if word_i in tempo_dict[j][0]:
+                        accuracy_i=0
-                            accuracy_i += 1/(len(tempo_dict[i][0]) + a/len(tempo_dict[i][0]))
+                        accuracy_j=0
-                            accuracy_j += 1/(len(tempo_dict[j][0]) + a/len(tempo_dict[j][0]))
+                        for word_i in tempo_dict[i][0]:
-                    if accuracy_i >= thre or accuracy_j >= thre:
+                            if word_i in tempo_dict[j][0]:
-                        if not (df.loc[i,'*产品型号'].endswith('+') ^ df.loc[j,'*产品型号'].endswith('+')):
+                                accuracy_i += 1/(len(tempo_dict[i][0]) + a/len(tempo_dict[i][0]))
-                            related_product.append(set([i,j]))
+                                accuracy_j += 1/(len(tempo_dict[j][0]) + a/len(tempo_dict[j][0]))
+                        if accuracy_i >= thre or accuracy_j >= thre:
+                            if not (df.loc[i,'*产品型号'].endswith('+') ^ df.loc[j,'*产品型号'].endswith('+')):
+                                related_product.append(set([i,j]))
                    tested_product.append(set([i,j]))
            # a = set([i])
@@ -179,6 +192,6 @@ def product_washing(filepath, category, thre=1, a=0):
 if __name__ == '__main__':
-    filepath = "E:\\ZDZC\\激光打印机参数确认.xlsx"
+    filepath = "E:\\ZDZC\\Sourcetree_local\\公共代码\\一体电脑参数确认.xlsx"
-    category = '激光打印机'
+    category = '一体电脑'
    product_washing(filepath,category)