代码最终修改

c040d93d · Jialin · d54af336 · c040d93d · c040d93d · c040d93d
Commit c040d93d authored Apr 14, 2021 by Jialin
5 changed files
--- a/公共代码/产品品牌分析.py
+++ b/公共代码/产品品牌分析.py
@@ -5,6 +5,7 @@
 import pandas as pd
 import re
 import xlsxwriter
+import numpy as np


 def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
@@ -15,7 +16,7 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
    # 处理缺失值
    valid_index=[]
    for i in df.index:
-        if df.loc[i,'产品品牌'] not in invalid_list and df.loc[i,'产品型号'] not in invalid_list:
+        if df.loc[i,'产品品牌'] not in invalid_list and df.loc[i,'*产品型号'] not in invalid_list:
            valid_index.append(i)
    result = df.loc[valid_index]
    # 将df数据格式转为字符串
@@ -27,7 +28,7 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
    # 将df按品牌分类，按品牌提取每个型号的关键字，放入字典
    brand_type = result.groupby('产品品牌')
    for brand in brand_type:
-        result_unique = brand[1]['产品型号'].unique()  # result_unique此时是array，元素是一个品牌名下的型号
+        result_unique = brand[1]['*产品型号'].unique()  # result_unique此时是array，元素是一个品牌名下的型号
        for j in range(len(result_unique)):
            result_unique[j] = result_unique[j].upper().strip()
        result_unique = pd.DataFrame(result_unique)[0].unique()  # 全部变为大写，将大小写归一,result_unique此时是array
@@ -132,7 +133,7 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
    # 第三种方法，对比品牌名和另一个品牌型号关键字，如果有一个重合，就算作两个品牌相似
    type_kw = {}  # 用于接收品牌型号提取的关键字
    for i in brand_type:
-        result_unique = i[1]['产品型号'].unique()  # 品牌型号组成的数组，数组内无重复元素
+        result_unique = i[1]['*产品型号'].unique()  # 品牌型号组成的数组，数组内无重复元素
        for k in range(len(result_unique)):
            result_unique[k]=result_unique[k].upper().strip()
        result_unique = pd.DataFrame(result_unique)[0].unique()
@@ -156,6 +157,36 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
                related_brand3.append(tempo_list)

    #写入excel
+    tempo_list = []  # 将所有的相似品牌，两两一对，写入集合，放入tempo_list
+    method1=[]
+    method2=[]
+    method3=[]
+    related_brand_list=[related_brand1,related_brand2,related_brand3]
+    method_list=[method1,method2,method3]
+    for i in range(len(related_brand_list)):
+        for list_i in related_brand_list[i]:
+            tempo_list.append(set(list_i[:2]))
+            method_list[i].append(set(list_i[:2]))
+
+
+    final_list = []  # final_list就是tempo_list的去重
+    for item in tempo_list:
+        if item not in final_list:
+            final_list.append(item)
+
+    method=[]
+    for item in final_list:
+        linshi_list=[]
+        if item in method1:
+            linshi_list.append('1')
+        if item in method2:
+            linshi_list.append('2')
+        if item in method3:
+            linshi_list.append('3')
+        method.append(','.join(linshi_list))
+
+
+
    workbook = xlsxwriter.Workbook('./brand_filter.xlsx')
    bold_format = workbook.add_format({'bold': True})

@@ -167,32 +198,32 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
    worksheet.write('D1', '方法', bold_format)
    col = 0
    row = 1
-    for list_i in related_brand1:
-        for brand_i in range(2):
-            worksheet.write_string(row, col, list_i[brand_i])
+    for index in range(len(final_list)):
+        for brand_i in final_list[index]:
+            worksheet.write_string(row, col, brand_i)
            col += 1
        col= 3
-        worksheet.write_string(row,col,'1')
+        worksheet.write_string(row,col,method[index])
        row += 1
        col = 0

-    for list_i in related_brand2:
-        for brand_i in range(2):
-            worksheet.write_string(row, col, list_i[brand_i])
-            col += 1
-        col = 3
-        worksheet.write_string(row, col, '2')
-        row += 1
-        col = 0
-
-    for list_i in related_brand3:
-        for brand_i in range(2):
-            worksheet.write_string(row, col, list_i[brand_i])
-            col += 1
-        col = 3
-        worksheet.write_string(row, col, '3')
-        row += 1
-        col = 0
+    # for list_i in related_brand2:
+    #     for brand_i in range(2):
+    #         worksheet.write_string(row, col, list_i[brand_i])
+    #         col += 1
+    #     col = 3
+    #     worksheet.write_string(row, col, '2')
+    #     row += 1
+    #     col = 0
+    #
+    # for list_i in related_brand3:
+    #     for brand_i in range(2):
+    #         worksheet.write_string(row, col, list_i[brand_i])
+    #         col += 1
+    #     col = 3
+    #     worksheet.write_string(row, col, '3')
+    #     row += 1
+    #     col = 0

    # 第二个worksheet
    worksheet2 = workbook.add_worksheet(name='Sheet2')
@@ -239,6 +270,6 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
    workbook.close()

 if __name__ == '__main__':
-    filepath = 'E:\\ZDZC\\激光打印机参数确认.xlsx'
+    filepath = 'E:\\ZDZC\\扫描仪参数确认.xlsx'
    brand_washing(filepath)

--- a/公共代码/产品类别分析.py
+++ b/公共代码/产品类别分析.py
@@ -9,20 +9,25 @@
 import pandas as pd
 import re
 import numpy as np
+import pymssql
+import time

 # category为产品类型
 # a是同一品牌或父品牌下产品数量占产品总数量的百分比，作为阈值，a越大，有异常的产品越多；b和a一样，只是用于产品数据类型和参数
 # c_list是产品参数中，数据类型较为统一的参数 在excel列名中的位置，从0开始，必须是一个list


-def class_washing(category, filepath, c_list,a=0.02, b=0.01):
+def class_washing(category, filepath, b=0.01):
    df_null=pd.read_excel(".\\异常数据表格.xlsx")
    invalid_list = df_null['异常数据名称'].values
    df = pd.read_excel(filepath,converters = {'产品编码':str})
    df.drop(columns='Unnamed: 0', axis=1, inplace=True)
+
    # 检测每列数据中 同一数据类型的产品数量少于产品总数量的b 的产品index.(可优化成字典形式存储读取)
+    t1=time.time()
+    print('开始检测数据类型')
    dtype_minor_dict = {}
-    for col in df.columns:
+    for col in df_null['数据类型异常'][df_null['类别']==category][df_null['数据类型异常'].notnull()].values:
        type_list = {}
        valid_index = []
        for i in df.index:
@@ -51,6 +56,9 @@ def class_washing(category, filepath, c_list,a=0.02, b=0.01):
        df[col] = df[col].astype(str)

    # 检测产品类型错误的产品，和产品名称中不带有产品类型的产品。由于代码简单，就放在一起了
+    t2=time.time()
+    print(t2-t1)
+    print('开始检测错误类别和错误名称')
    wrong_class = []
    not_in_name = []
    for i in df.index:
@@ -59,49 +67,93 @@ def class_washing(category, filepath, c_list,a=0.02, b=0.01):
        if category not in df.loc[i, '产品名称']:
            not_in_name.append(i)

-    # 检测产品父品牌中品牌出现次数小于产品总数的a的产品
-    father_brand_minor = []
-    father_brand_list = []
-    col='产品父品牌'
-    valid_index=[]
-    for i in df.index:
-        if df.loc[i, col] in invalid_list:
-            continue
-        valid_index.append(i)
-    valid_df=df.loc[valid_index]
-    father_brand_num = valid_df.groupby(by='产品父品牌')['产品编码'].count()  # 之所以用产品编码来计数是因为产品编码肯定不会有缺失值
-    father_num_list = [x for x in father_brand_num.unique() if x < len(valid_df.index)*a]
-    for i in father_brand_num.index:  # i 就是产品父品牌
-        if father_brand_num.loc[i] in father_num_list:  # father_brand_num.loc[i] 就是该父品牌出现次数
-            father_brand_list.append(i)
-
-    for i in valid_df.index:
-        if valid_df.loc[i, '产品父品牌'] in father_brand_list:
-            father_brand_minor.append(i)
-
-    # 检测产品品牌中品牌出现次数少的产品
-    brand_minor = []
-    brand_list = []
-    col = '产品品牌'
-    valid_index = []
+    # 检测品牌中是否有不在category下对应的brand_id的产品品牌
+    t3=time.time()
+    print(t3-t2)
+    print('开始检测错误品牌')
+    conn_zi_new = pymssql.connect(host='123.56.115.207', user='zgcprice3311', password='zgcprice20200628',
+                                  database='ZI_NEW', autocommit=True)
+    cursor_zi_new = conn_zi_new.cursor()
+    cursor_zi_new.execute(f"select id from p_category where name='{category}'")
+    category_id=cursor_zi_new.fetchone()
+    if not category_id:
+        print('输入类别不在数据库中，请查证')
+        return
+    category_id = category_id[0]
+
+    cursor_zi_new.execute(f"select brandid from p_spu where categoryid={category_id}")
+    brand_id_fetchall=cursor_zi_new.fetchall()
+    brand_id_list = []
+    for brand_tuple in brand_id_fetchall:
+        brand_id_list.append(brand_tuple[0])
+
+    brand_name_list = []
+    for brand_id in brand_id_list:
+        cursor_zi_new.execute(f"select name from p_brand where id={brand_id}")
+        brand_name_fetch=cursor_zi_new.fetchone()
+        if brand_name_fetch:
+            brand_name_list.append(brand_name_fetch[0].strip("'"))
+
+    wrong_brand=[]
    for i in df.index:
-        if df.loc[i, col] in invalid_list:
-            continue
-        valid_index.append(i)
-    valid_df=df.loc[valid_index]
-    brand_num = valid_df.groupby(by='产品品牌')['产品编码'].count()  # 同上
-    num_list = [x for x in brand_num.unique() if x < len(valid_df.index)*a]
-    for i in brand_num.index:
-        if brand_num.loc[i] in num_list:
-            brand_list.append(i)
-
-    for i in valid_df.index:
-        if valid_df.loc[i, '产品品牌'] in brand_list:
-            brand_minor.append(i)
+        if df.loc[i]['产品品牌'] not in brand_name_list:
+            wrong_brand.append(i)
+        # cursor_zi_new.execute(f"select id from p_brand where name='{df.loc[i]['产品品牌']}'")
+        # brand_id=cursor_zi_new.fetchone()
+        # if not brand_id:
+        #     wrong_brand.append(i)
+        #     continue
+        # brand_id=brand_id[0]
+        # if brand_id not in brand_id_list:
+        #     wrong_brand.append(i)
+
+
+    # # 检测产品父品牌中品牌出现次数小于产品总数的a的产品
+    # father_brand_minor = []
+    # father_brand_list = []
+    # col='产品父品牌'
+    # valid_index=[]
+    # for i in df.index:
+    #     if df.loc[i, col] in invalid_list:
+    #         continue
+    #     valid_index.append(i)
+    # valid_df=df.loc[valid_index]
+    # father_brand_num = valid_df.groupby(by='产品父品牌')['产品编码'].count()  # 之所以用产品编码来计数是因为产品编码肯定不会有缺失值
+    # father_num_list = [x for x in father_brand_num.unique() if x < len(valid_df.index)*a]
+    # for i in father_brand_num.index:  # i 就是产品父品牌
+    #     if father_brand_num.loc[i] in father_num_list:  # father_brand_num.loc[i] 就是该父品牌出现次数
+    #         father_brand_list.append(i)
+    #
+    # for i in valid_df.index:
+    #     if valid_df.loc[i, '产品父品牌'] in father_brand_list:
+    #         father_brand_minor.append(i)
+    #
+    # # 检测产品品牌中品牌出现次数少的产品
+    # brand_minor = []
+    # brand_list = []
+    # col = '产品品牌'
+    # valid_index = []
+    # for i in df.index:
+    #     if df.loc[i, col] in invalid_list:
+    #         continue
+    #     valid_index.append(i)
+    # valid_df=df.loc[valid_index]
+    # brand_num = valid_df.groupby(by='产品品牌')['产品编码'].count()  # 同上
+    # num_list = [x for x in brand_num.unique() if x < len(valid_df.index)*a]
+    # for i in brand_num.index:
+    #     if brand_num.loc[i] in num_list:
+    #         brand_list.append(i)
+    #
+    # for i in valid_df.index:
+    #     if valid_df.loc[i, '产品品牌'] in brand_list:
+    #         brand_minor.append(i)

    # 检测产品参数列中数据长度离该列平均数据长度2*std之外的产品，7是第一个产品参数列，-2是质保时间，-1是产品型号
+    t4=time.time()
+    print(t4-t3)
+    print('开始检测错误长度')
    length_minor_dict = {}
-    for col in df.columns[7:-2]:
+    for col in df_null['数据长度异常'][df_null['类别']==category][df_null['数据长度异常'].notnull()].values:
        col_length = []
        valid_index=[]
        for i in df.index:
@@ -122,9 +174,13 @@ def class_washing(category, filepath, c_list,a=0.02, b=0.01):
                        length_minor_dict[index] = col
    length_minor=[]
    length_minor.extend(length_minor_dict.keys())
+
    # 检测产品参数列数据格式小于总数量的b的产品
+    t5=time.time()
+    print(t5-t4)
+    print('开始检测错误数据格式')
    format_minor_dict = {}
-    for col in df.columns[7:-2]:
+    for col in df_null['数据格式异常'][df_null['类别']==category][df_null['数据格式异常'].notnull()].values:
        counter_dict = {}
        valid_index = []
        for i in df.index:
@@ -160,9 +216,8 @@ def class_washing(category, filepath, c_list,a=0.02, b=0.01):
                    elif keys_index not in format_minor_dict.keys():
                        format_minor_dict[keys_index] = col

-
-        format_minor=[]
-        format_minor.extend(format_minor_dict.keys())
+    format_minor=[]
+    format_minor.extend(format_minor_dict.keys())
        # length_record = []
        # for keys in counter_dict:
        #     if not length_record:
@@ -172,79 +227,114 @@ def class_washing(category, filepath, c_list,a=0.02, b=0.01):
        #
        # format_minor += length_record[0][1]

-    # 接下来是针对扫描仪的部分.对于特定产品，还可以从每个产品参数中选出少数派。如果知道易混淆的产品类型，还要特意加上挑取易混淆产品类型的代码
    # 对于产品名称中没有“扫描仪”的，如果没有“高拍仪”就挑出来
    not_in_name2 = []
    for i in not_in_name:
-        if '高拍仪' not in df.loc[i,'产品名称']:
-            not_in_name2.append(i)
+        for special_name in df_null['产品名称异常'][df_null['类别']==category][df_null['产品名称异常'].notnull()].values:
+            if special_name in df.loc[i,'产品名称']:
+                break
+        not_in_name2.append(i)

-    # 对于产品参数中，数据类型较少的参数，其中如果有数量小于产品总数量的b的，挑出来
+    # 对于标准产品参数中，如果有数据不在标准字典中的，挑出来
+    t6=time.time()
+    print(t6-t5)
+    print('开始检测标准参数')
    character_minor_dict = {}
-    for col_i in df.columns[c_list]:
-        tempo_list = []
-        tempo_list2 = []
-        valid_index = []
+    for col_i in df_null['标准参数异常'][df_null['类别']==category][df_null['标准参数异常'].notnull()].values:
+        temp_list = []
+        cursor_zi_new.execute(f"select stdvalue from ShuJuZiDian_Cfg where categoryname='{category}' and subtitle='{col_i.strip('*')}'")
+        standard_value_fetchall=cursor_zi_new.fetchall()
+        if not standard_value_fetchall:
+            print(f"{col_i.strip('*')} 不在 ShuJuZiDian_Cfg,请检查。该参数项在此次运行中未被采用")
+            continue
+
+        standard_value=[]
+        for std_tuple in standard_value_fetchall:
+            standard_value.append(std_tuple[0])
+
        for i in df.index:
-            if df.loc[i, col_i] in invalid_list:
+            value_col=df.loc[i, col_i]
+            if value_col in invalid_list:
                continue
-            valid_index.append(i)
-        valid_df = df.loc[valid_index]
-        cha_num = valid_df.groupby(by=col_i)['产品编码'].count()
-        num_list = [x for x in cha_num.unique() if x < len(valid_df.index)*b]
-        for i in cha_num.index:
-            if cha_num.loc[i] in num_list:
-                tempo_list.append(i)
-        for i in valid_df.index:
-            if valid_df.loc[i, col_i] in tempo_list:
-                tempo_list2.append(i)
-        character_minor_dict[col_i] = tempo_list2
+            if value_col not in standard_value:
+                temp_list.append(i)
+
+        character_minor_dict[col_i] = temp_list

    character_minor = []
    for keys_i in character_minor_dict:
        character_minor.extend(character_minor_dict[keys_i])

-    # 将挑出的可疑数据整合到一起 (wrong_class没加，因为里面的肯定不对）
+
+
+
+
+    #     cha_num = valid_df.groupby(by=col_i)['产品编码'].count()
+    #     num_list = [x for x in cha_num.unique() if x < len(valid_df.index)*b]
+    #     for i in cha_num.index:
+    #         if cha_num.loc[i] in num_list:
+    #             tempo_list.append(i)
+    #     for i in valid_df.index:
+    #         if valid_df.loc[i, col_i] in tempo_list:
+    #             tempo_list2.append(i)
+    #     character_minor_dict[col_i] = tempo_list2
+    #
+    # character_minor = []
+    # for keys_i in character_minor_dict:
+    #     character_minor.extend(character_minor_dict[keys_i])
+
+    t7=time.time()
+    print(t7-t6)
+    print('开始整合数据')
+    # 将挑出的可疑数据整合到一起
    index_minor = []
    index_minor.extend(wrong_class)
    index_minor.extend(format_minor)
    index_minor.extend(length_minor)
-    index_minor.extend(brand_minor)
-    index_minor.extend(father_brand_minor)
+    index_minor.extend(wrong_brand)
+    # index_minor.extend(father_brand_minor)
    index_minor.extend(not_in_name2)
    index_minor.extend(dtype_minor)
    index_minor.extend(character_minor)
    index_minor = set(index_minor)

-    final_df = pd.DataFrame(np.zeros((len(index_minor), 8)), index=list(index_minor),
-                            columns=['计数', '产品类型异常', '产品名称异常', '父品牌异常', '品牌异常', '数据类型异常', '数据格式异常', '数据长度异常'])
+    final_df = pd.DataFrame(np.zeros((len(index_minor), 7)), index=list(index_minor),
+                            columns=['计数', '产品类别异常', '产品名称异常', '品牌异常', '数据类型异常', '数据格式异常', '数据长度异常'])
+
+    w_class = df_null['产品类别异常权重'][df_null['类别']==category][df_null['产品类别异常权重'].notnull()].values
+    w_format = df_null['数据格式异常权重'][df_null['类别']==category][df_null['数据格式异常权重'].notnull()].values
+    w_length = df_null['数据长度异常权重'][df_null['类别']==category][df_null['数据长度异常权重'].notnull()].values
+    w_brand = df_null['品牌异常权重'][df_null['类别']==category][df_null['品牌异常权重'].notnull()].values
+    w_name = df_null['产品名称异常权重'][df_null['类别']==category][df_null['产品名称异常权重'].notnull()].values
+    w_dtype = df_null['数据类型异常权重'][df_null['类别']==category][df_null['数据类型异常权重'].notnull()].values
+    w_stdparam = df_null['标准参数异常权重'][df_null['类别']==category][df_null['标准参数异常权重'].notnull()].values
    for i in index_minor:
        count = 0
        if i in wrong_class:
-            count += 1
-            final_df.loc[i, '产品类型异常'] = 1
+            count += w_class
+            final_df.loc[i, '产品类别异常'] = 1
        if i in format_minor:
-            count += len(format_minor_dict[i].split())  # 如果该行数据有多列数据格式异常，就要加多次,
+            count += len(format_minor_dict[i].split()) * w_format  # 如果该行数据有多列数据格式异常，就要加多次,
            final_df.loc[i, '数据格式异常'] = format_minor_dict[i]  # 但其中数据是空格分割的字符串，所以用split
        if i in length_minor:
-            count += len(length_minor_dict[i].split())
+            count += len(length_minor_dict[i].split()) * w_length
            final_df.loc[i, '数据长度异常'] = length_minor_dict[i]
-        if i in brand_minor:
-            count += 1
+        if i in wrong_brand:
+            count += w_brand
            final_df.loc[i, '品牌异常'] = 1
-        if i in father_brand_minor:
-            count += 1
-            final_df.loc[i, '父品牌异常'] = 1
+        # if i in father_brand_minor:
+        #     count += 1
+        #     final_df.loc[i, '父品牌异常'] = 1
        if i in not_in_name2:
-            count += 1
+            count += w_name
            final_df.loc[i, '产品名称异常'] = 1
        if i in dtype_minor:
-            count += len(dtype_minor_dict[i].split())
+            count += len(dtype_minor_dict[i].split()) * w_dtype
            final_df.loc[i, '数据类型异常'] = dtype_minor_dict[i]
        for keys_i in character_minor_dict:
            if i in character_minor_dict[keys_i]:
                final_df.loc[i, keys_i + '异常'] = 1
-                count += 1
+                count += w_stdparam
            else:
                final_df.loc[i, keys_i + '异常'] = 0
        final_df.loc[i, '计数'] = count
@@ -255,10 +345,10 @@ def class_washing(category, filepath, c_list,a=0.02, b=0.01):


 if __name__ == '__main__':
-    category='激光打印机'
-    filepath="E:\\ZDZC\\激光打印机参数确认.xlsx"
-    c_list=[6,7,-4,-3]
+    category='扫描仪'
+    filepath="E:\\ZDZC\\扫描仪参数确认.xlsx"
+    #c_list=[6,7,-4,-3]
    # category = '扫描仪'
    # filepath="E:\\ZDZC\\扫描仪参数确认.xlsx"
    # c_list=[7,8,9]
-    class_washing(category, filepath, c_list)
+    class_washing(category, filepath)
--- a/公共代码/产品重复型号分析.py
+++ b/公共代码/产品重复型号分析.py
@@ -12,7 +12,7 @@ import numpy as np
 import xlsxwriter


-def product_washing(filepath, thre=1, a=0):
+def product_washing(filepath, category,thre=1, a=0):
    df_null = pd.read_excel(".\\异常数据表格.xlsx")
    invalid_list = df_null['异常数据名称'].values
    df=pd.read_excel(filepath, converters={'产品编码':str})
@@ -28,7 +28,7 @@ def product_washing(filepath, thre=1, a=0):
        tempo_dict = {}
        # 每个品牌提取产品型号关键字，放入tempo_dict
        for i in brand[1].index:
-            k = brand[1].loc[i, '产品型号']
+            k = brand[1].loc[i, '*产品型号']
            if k in invalid_list:
                continue
            pre_num = re.search(r'([A-Za-z]{0,4})\W?\d+', k)  # pre_num为数字前的关键字
@@ -85,25 +85,34 @@ def product_washing(filepath, thre=1, a=0):
            # temp_list2 = re.findall(r'[A-Za-z]+\W?[A-Za-z]+', brand[0].upper())  # 提取英文单词
            # brand_combined = temp_list1+temp_list2

+            tempo_dict[i] = [set(combined)]
+            other_parameters=df_null['重复参数项'][df_null['类别']==category][df_null['重复参数项'].notnull()].values
+            other_parameters_values=[]
+            for parameter in other_parameters:
+                other_parameters_values.append(brand[1].loc[i,parameter])
+            tempo_dict[i].extend(other_parameters_values)


-            tempo_dict[i] = [set(combined), brand[1].loc[i, '*质保时间'], brand[1].loc[i, '标配外服务及配件']]

-        # 对比产品型号关键字，相同则放入related_product
+        # 对比产品参数项，相同则放入related_product
        tested_product=[]
        for i in tempo_dict:
            for j in tempo_dict:
                if i != j and set([i,j]) not in tested_product:
-                    if tempo_dict[i][1:]==tempo_dict[j][1:]:
-                        accuracy_i=0
-                        accuracy_j=0
-                        for word_i in tempo_dict[i][0]:
-                            if word_i in tempo_dict[j][0]:
-                                accuracy_i += 1/(len(tempo_dict[i][0]) + a/len(tempo_dict[i][0]))
-                                accuracy_j += 1/(len(tempo_dict[j][0]) + a/len(tempo_dict[j][0]))
-                        if accuracy_i >= thre or accuracy_j >= thre:
-                            if not (df.loc[i,'产品型号'].endswith('+') ^ df.loc[j,'产品型号'].endswith('+')):
-                                related_product.append(set([i,j]))
+                    for index in range(1,len(tempo_dict[i])):
+                        if tempo_dict[i][index]!= tempo_dict[j][index] and \
+                                (tempo_dict[i][index] not in invalid_list and tempo_dict[j][index] not in invalid_list):
+                            tested_product.append(set([i, j]))
+                            break  # 如果出现了必须相等但不相等的参数，则退出到上一层循环。如果没有出现，则对比型号
+                    accuracy_i=0
+                    accuracy_j=0
+                    for word_i in tempo_dict[i][0]:
+                        if word_i in tempo_dict[j][0]:
+                            accuracy_i += 1/(len(tempo_dict[i][0]) + a/len(tempo_dict[i][0]))
+                            accuracy_j += 1/(len(tempo_dict[j][0]) + a/len(tempo_dict[j][0]))
+                    if accuracy_i >= thre or accuracy_j >= thre:
+                        if not (df.loc[i,'*产品型号'].endswith('+') ^ df.loc[j,'*产品型号'].endswith('+')):
+                            related_product.append(set([i,j]))
                    tested_product.append(set([i,j]))

            # a = set([i])
@@ -152,5 +161,6 @@ def product_washing(filepath, thre=1, a=0):


 if __name__ == '__main__':
-    filepath = "E:\\ZDZC\\激光打印机参数确认.xlsx"
-    product_washing(filepath)
+    filepath = "E:\\ZDZC\\扫描仪参数确认.xlsx"
+    category = '扫描仪'
+    product_washing(filepath,category)
--- a/公共代码/异常数据表格.xlsx
+++ b/公共代码/异常数据表格.xlsx
--- a/公共代码/激光打印机参数确认.xlsx
+++ b/公共代码/激光打印机参数确认.xlsx