代码最终修改

c040d93d · Jialin · d54af336 · c040d93d · c040d93d · c040d93d
Commit c040d93d authored Apr 14, 2021 by Jialin
5 changed files
--- a/公共代码/产品品牌分析.py
+++ b/公共代码/产品品牌分析.py
@@ -5,6 +5,7 @@
 import pandas as pd
 import re
 import xlsxwriter
+import numpy as np
 def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
@@ -15,7 +16,7 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
    # 处理缺失值
    valid_index=[]
    for i in df.index:
-        if df.loc[i,'产品品牌'] not in invalid_list and df.loc[i,'产品型号'] not in invalid_list:
+        if df.loc[i,'产品品牌'] not in invalid_list and df.loc[i,'*产品型号'] not in invalid_list:
            valid_index.append(i)
    result = df.loc[valid_index]
    # 将df数据格式转为字符串
@@ -27,7 +28,7 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
    # 将df按品牌分类，按品牌提取每个型号的关键字，放入字典
    brand_type = result.groupby('产品品牌')
    for brand in brand_type:
-        result_unique = brand[1]['产品型号'].unique()  # result_unique此时是array，元素是一个品牌名下的型号
+        result_unique = brand[1]['*产品型号'].unique()  # result_unique此时是array，元素是一个品牌名下的型号
        for j in range(len(result_unique)):
            result_unique[j] = result_unique[j].upper().strip()
        result_unique = pd.DataFrame(result_unique)[0].unique()  # 全部变为大写，将大小写归一,result_unique此时是array
@@ -132,7 +133,7 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
    # 第三种方法，对比品牌名和另一个品牌型号关键字，如果有一个重合，就算作两个品牌相似
    type_kw = {}  # 用于接收品牌型号提取的关键字
    for i in brand_type:
-        result_unique = i[1]['产品型号'].unique()  # 品牌型号组成的数组，数组内无重复元素
+        result_unique = i[1]['*产品型号'].unique()  # 品牌型号组成的数组，数组内无重复元素
        for k in range(len(result_unique)):
            result_unique[k]=result_unique[k].upper().strip()
        result_unique = pd.DataFrame(result_unique)[0].unique()
@@ -156,6 +157,36 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
                related_brand3.append(tempo_list)
    #写入excel
+    tempo_list = []  # 将所有的相似品牌，两两一对，写入集合，放入tempo_list
+    method1=[]
+    method2=[]
+    method3=[]
+    related_brand_list=[related_brand1,related_brand2,related_brand3]
+    method_list=[method1,method2,method3]
+    for i in range(len(related_brand_list)):
+        for list_i in related_brand_list[i]:
+            tempo_list.append(set(list_i[:2]))
+            method_list[i].append(set(list_i[:2]))
+    final_list = []  # final_list就是tempo_list的去重
+    for item in tempo_list:
+        if item not in final_list:
+            final_list.append(item)
+    method=[]
+    for item in final_list:
+        linshi_list=[]
+        if item in method1:
+            linshi_list.append('1')
+        if item in method2:
+            linshi_list.append('2')
+        if item in method3:
+            linshi_list.append('3')
+        method.append(','.join(linshi_list))
    workbook = xlsxwriter.Workbook('./brand_filter.xlsx')
    bold_format = workbook.add_format({'bold': True})
@@ -167,32 +198,32 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
    worksheet.write('D1', '方法', bold_format)
    col = 0
    row = 1
-    for list_i in related_brand1:
+    for index in range(len(final_list)):
-        for brand_i in range(2):
+        for brand_i in final_list[index]:
-            worksheet.write_string(row, col, list_i[brand_i])
+            worksheet.write_string(row, col, brand_i)
            col += 1
        col= 3
-        worksheet.write_string(row,col,'1')
+        worksheet.write_string(row,col,method[index])
        row += 1
        col = 0
-    for list_i in related_brand2:
+    # for list_i in related_brand2:
-        for brand_i in range(2):
+    #     for brand_i in range(2):
-            worksheet.write_string(row, col, list_i[brand_i])
+    #         worksheet.write_string(row, col, list_i[brand_i])
-            col += 1
+    #         col += 1
-        col = 3
+    #     col = 3
-        worksheet.write_string(row, col, '2')
+    #     worksheet.write_string(row, col, '2')
-        row += 1
+    #     row += 1
-        col = 0
+    #     col = 0
+    #
-    for list_i in related_brand3:
+    # for list_i in related_brand3:
-        for brand_i in range(2):
+    #     for brand_i in range(2):
-            worksheet.write_string(row, col, list_i[brand_i])
+    #         worksheet.write_string(row, col, list_i[brand_i])
-            col += 1
+    #         col += 1
-        col = 3
+    #     col = 3
-        worksheet.write_string(row, col, '3')
+    #     worksheet.write_string(row, col, '3')
-        row += 1
+    #     row += 1
-        col = 0
+    #     col = 0
    # 第二个worksheet
    worksheet2 = workbook.add_worksheet(name='Sheet2')
@@ -239,6 +270,6 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
    workbook.close()
 if __name__ == '__main__':
-    filepath = 'E:\\ZDZC\\激光打印机参数确认.xlsx'
+    filepath = 'E:\\ZDZC\\扫描仪参数确认.xlsx'
    brand_washing(filepath)
--- a/公共代码/产品类别分析.py
+++ b/公共代码/产品类别分析.py
--- a/公共代码/产品重复型号分析.py
+++ b/公共代码/产品重复型号分析.py
@@ -12,7 +12,7 @@ import numpy as np
 import xlsxwriter
-def product_washing(filepath, thre=1, a=0):
+def product_washing(filepath, category,thre=1, a=0):
    df_null = pd.read_excel(".\\异常数据表格.xlsx")
    invalid_list = df_null['异常数据名称'].values
    df=pd.read_excel(filepath, converters={'产品编码':str})
@@ -28,7 +28,7 @@ def product_washing(filepath, thre=1, a=0):
        tempo_dict = {}
        # 每个品牌提取产品型号关键字，放入tempo_dict
        for i in brand[1].index:
-            k = brand[1].loc[i, '产品型号']
+            k = brand[1].loc[i, '*产品型号']
            if k in invalid_list:
                continue
            pre_num = re.search(r'([A-Za-z]{0,4})\W?\d+', k)  # pre_num为数字前的关键字
@@ -85,25 +85,34 @@ def product_washing(filepath, thre=1, a=0):
            # temp_list2 = re.findall(r'[A-Za-z]+\W?[A-Za-z]+', brand[0].upper())  # 提取英文单词
            # brand_combined = temp_list1+temp_list2
+            tempo_dict[i] = [set(combined)]
+            other_parameters=df_null['重复参数项'][df_null['类别']==category][df_null['重复参数项'].notnull()].values
+            other_parameters_values=[]
+            for parameter in other_parameters:
+                other_parameters_values.append(brand[1].loc[i,parameter])
+            tempo_dict[i].extend(other_parameters_values)
-            tempo_dict[i] = [set(combined), brand[1].loc[i, '*质保时间'], brand[1].loc[i, '标配外服务及配件']]
-        # 对比产品型号关键字，相同则放入related_product
+        # 对比产品参数项，相同则放入related_product
        tested_product=[]
        for i in tempo_dict:
            for j in tempo_dict:
                if i != j and set([i,j]) not in tested_product:
-                    if tempo_dict[i][1:]==tempo_dict[j][1:]:
+                    for index in range(1,len(tempo_dict[i])):
-                        accuracy_i=0
+                        if tempo_dict[i][index]!= tempo_dict[j][index] and \
-                        accuracy_j=0
+                                (tempo_dict[i][index] not in invalid_list and tempo_dict[j][index] not in invalid_list):
-                        for word_i in tempo_dict[i][0]:
+                            tested_product.append(set([i, j]))
-                            if word_i in tempo_dict[j][0]:
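+                            break  # NOTE(review): this only exits the parameter loop above; the model-number comparison below still runs for this pair, so a mismatching parameter does not actually skip it (a for/else or a flag would be needed)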
-                                accuracy_i += 1/(len(tempo_dict[i][0]) + a/len(tempo_dict[i][0]))
+                    accuracy_i=0
-                                accuracy_j += 1/(len(tempo_dict[j][0]) + a/len(tempo_dict[j][0]))
+                    accuracy_j=0
-                        if accuracy_i >= thre or accuracy_j >= thre:
+                    for word_i in tempo_dict[i][0]:
-                            if not (df.loc[i,'产品型号'].endswith('+') ^ df.loc[j,'产品型号'].endswith('+')):
+                        if word_i in tempo_dict[j][0]:
-                                related_product.append(set([i,j]))
+                            accuracy_i += 1/(len(tempo_dict[i][0]) + a/len(tempo_dict[i][0]))
+                            accuracy_j += 1/(len(tempo_dict[j][0]) + a/len(tempo_dict[j][0]))
+                    if accuracy_i >= thre or accuracy_j >= thre:
+                        if not (df.loc[i,'*产品型号'].endswith('+') ^ df.loc[j,'*产品型号'].endswith('+')):
+                            related_product.append(set([i,j]))
                    tested_product.append(set([i,j]))
            # a = set([i])
@@ -152,5 +161,6 @@ def product_washing(filepath, thre=1, a=0):
 if __name__ == '__main__':
-    filepath = "E:\\ZDZC\\激光打印机参数确认.xlsx"
+    filepath = "E:\\ZDZC\\扫描仪参数确认.xlsx"
-    product_washing(filepath)
+    category = '扫描仪'
+    product_washing(filepath,category)
--- a/公共代码/异常数据表格.xlsx
+++ b/公共代码/异常数据表格.xlsx
--- a/公共代码/激光打印机参数确认.xlsx
+++ b/公共代码/激光打印机参数确认.xlsx