真·代码最终修改

4521e64e · Jialin · cac7d04a
Commit 4521e64e authored Apr 14, 2021 by Jialin
8 changed files
--- a/公共代码/brand_filter.xlsx
+++ b/公共代码/brand_filter.xlsx
--- a/公共代码/class_filter.xlsx
+++ b/公共代码/class_filter.xlsx
--- a/公共代码/product_filter.xlsx
+++ b/公共代码/product_filter.xlsx
--- a/公共代码/产品品牌分析.py
+++ b/公共代码/产品品牌分析.py
@@ -6,13 +6,20 @@ import pandas as pd
 import re
 import xlsxwriter
 import numpy as np
+import pymssql


 def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
    # filepath:文件路径，thre为两个品牌下型号重合率阈值，inner_thre为两个品牌下某条型号内关键词重合率阈值，a为权重调整，sheet_name为表单名
    df = pd.read_excel(filepath, sheet_name=sheet_name, converters = {'产品编码':str})
-    df_null = pd.read_excel(".\\异常数据表格.xlsx")
-    invalid_list = df_null['异常数据名称'].values
+    conn_zi_new = pymssql.connect(host='123.56.115.207', user='zgcprice3311', password='zgcprice20200628',
+                                  database='ZI_NEW', autocommit=True)
+    cursor_zi_new = conn_zi_new.cursor()
+    cursor_zi_new.execute(f"select col_value from data_washing_external where category_name='通用' and col_name='异常数据名称'")
+    invalid_list_fetch = cursor_zi_new.fetchall()
+    invalid_list = []
+    for invalid_tuple in invalid_list_fetch:
+        invalid_list.append(invalid_tuple[0])
    # 处理缺失值
    valid_index=[]
    for i in df.index:

--- a/公共代码/产品类别分析.py
+++ b/公共代码/产品类别分析.py
@@ -13,23 +13,41 @@ import pymssql
 import time

 # category为产品类型
-# a是同一品牌或父品牌下产品数量占产品总数量的百分比，作为阈值，a越大，有异常的产品越多；b和a一样，只是用于产品数据类型和参数
-# c_list是产品参数中，数据类型较为统一的参数 在excel列名中的位置，从0开始，必须是一个list
+# b是阈值，用于数据类型和数据格式的检测；例如当某列中某种数据类型的数据条数小于该列有效数据条数的b倍时，属于该数据类型的数据索引将被作为异常数据返回


 def class_washing(category, filepath, b=0.01):
-    df_null=pd.read_excel(".\\异常数据表格.xlsx")
-    invalid_list = df_null['异常数据名称'].values
+    conn_zi_new = pymssql.connect(host='123.56.115.207', user='zgcprice3311', password='zgcprice20200628',
+                                  database='ZI_NEW', autocommit=True)
+    cursor_zi_new = conn_zi_new.cursor()
+    # 一些数据写着‘暂无数据’，‘无参数，需补充’等替代值，这些替代值从外部表格被提取到invalid_list中
+    cursor_zi_new.execute(f"select col_value from data_washing_external where category_name='通用' and col_name='异常数据名称'")
+    invalid_list_fetch=cursor_zi_new.fetchall()
+    invalid_list=[]
+    for invalid_tuple in invalid_list_fetch:
+        invalid_list.append(invalid_tuple[0])
+
    df = pd.read_excel(filepath,converters = {'产品编码':str})
    df.drop(columns='Unnamed: 0', axis=1, inplace=True)

-    # 检测每列数据中 同一数据类型的产品数量少于产品总数量的b 的产品index.(可优化成字典形式存储读取)
+    # 检测每列数据中 同一数据类型的产品数量少于产品总数量的b 的产品index. 数据类型的例子：string, float, int
    t1=time.time()
    print('开始检测数据类型')
-    dtype_minor_dict = {}
-    for col in df_null['数据类型异常'][df_null['类别']==category][df_null['数据类型异常'].notnull()].values:
-        type_list = {}
-        valid_index = []
+
+    dtype_minor_dict = {}  # 该字典键为异常数据索引，值为异常数据类型出现的列名，值是字符串而非列表，多个列名由空格隔开
+    #   #从外部表提取用于判断‘数据类型异常’的列，放入dtype_col_list
+    cursor_zi_new.execute(f"select col_value from data_washing_external where category_name='{category}' and col_name='数据类型异常'")
+    dtype_col_fetch=cursor_zi_new.fetchall()
+    if not dtype_col_fetch:
+        print('该类别无‘数据类型异常’列名，或者类别名输入有误，请查证数据库')
+        return
+    dtype_col_list=[]
+    for dtype_tuple in dtype_col_fetch:
+        dtype_col_list.append(dtype_tuple[0])
+    #   #按列找
+    for col in dtype_col_list:
+        type_list = {}  # 键为该列出现的数据类型，值为该类型出现的次数
+        valid_index = []  # 值为‘暂无数据’等替代值的数据被忽略，值为实际值的数据索引提取到valid_index
        for i in df.index:
            if df.loc[i,col] in invalid_list:
                continue
@@ -41,6 +59,7 @@ def class_washing(category, filepath, b=0.01):
                type_list[data_type] += 1
        for data_type_i in type_list:
            if type_list[data_type_i] < len(valid_index) * b:
+                # 下面这一段可以优化，可以在type_list字典中存储每个数据类型对应的数值索引，先记下，优化阶段再改
                for i in valid_index:
                    if type(df.loc[i][col]) == data_type_i:
                        if i in dtype_minor_dict.keys():
@@ -48,10 +67,11 @@ def class_washing(category, filepath, b=0.01):
                        elif i not in dtype_minor_dict.keys():
                            dtype_minor_dict[i] = col

+    #   #将dtype_minor_dict字典的键，也就是异常数据索引，放入dtype_minor
    dtype_minor=[]
    dtype_minor.extend(dtype_minor_dict.keys())

-    # 在检测完产品数据类型后，将所有数据类型转换为string
+    # 在检测完产品数据类型后，将所有数据类型转换为string，因为后面的操作都是对于字符串的，如果有数据类型不是字符串就比较麻烦
    for col in df.columns:
        df[col] = df[col].astype(str)

@@ -59,21 +79,20 @@ def class_washing(category, filepath, b=0.01):
    t2=time.time()
    print(t2-t1)
    print('开始检测错误类别和错误名称')
-    wrong_class = []
-    not_in_name = []
+    wrong_class = []  # 类型错误数据索引
+    not_in_name = []  # 名称中不带类型的数据索引
    for i in df.index:
        if df.loc[i, '产品类别'] != category:
            wrong_class.append(i)
        if category not in df.loc[i, '产品名称']:
            not_in_name.append(i)

-    # 检测品牌中是否有不在category下对应的brand_id的产品品牌
+    # 检测品牌中有哪些不在数据表中该类别下属品牌中
    t3=time.time()
    print(t3-t2)
    print('开始检测错误品牌')
-    conn_zi_new = pymssql.connect(host='123.56.115.207', user='zgcprice3311', password='zgcprice20200628',
-                                  database='ZI_NEW', autocommit=True)
-    cursor_zi_new = conn_zi_new.cursor()
+
+    #   #提取类别id
    cursor_zi_new.execute(f"select id from p_category where name='{category}'")
    category_id=cursor_zi_new.fetchone()
    if not category_id:
@@ -81,12 +100,14 @@ def class_washing(category, filepath, b=0.01):
        return
    category_id = category_id[0]

+    #   #找到该类别下的品牌的brand_id
    cursor_zi_new.execute(f"select brandid from p_spu where categoryid={category_id}")
    brand_id_fetchall=cursor_zi_new.fetchall()
    brand_id_list = []
    for brand_tuple in brand_id_fetchall:
        brand_id_list.append(brand_tuple[0])

+    #   #从brand_id找到品牌名（为什么不直接从category_id找品牌名呢？问得好，没有这个表）
    brand_name_list = []
    for brand_id in brand_id_list:
        cursor_zi_new.execute(f"select name from p_brand where id={brand_id}")
@@ -94,84 +115,48 @@ def class_washing(category, filepath, b=0.01):
        if brand_name_fetch:
            brand_name_list.append(brand_name_fetch[0].strip("'"))

+    #   #如果哪条数据品牌没在品牌列表中，作为异常值返回
    wrong_brand=[]
    for i in df.index:
        if df.loc[i]['产品品牌'] not in brand_name_list:
            wrong_brand.append(i)
-        # cursor_zi_new.execute(f"select id from p_brand where name='{df.loc[i]['产品品牌']}'")
-        # brand_id=cursor_zi_new.fetchone()
-        # if not brand_id:
-        #     wrong_brand.append(i)
-        #     continue
-        # brand_id=brand_id[0]
-        # if brand_id not in brand_id_list:
-        #     wrong_brand.append(i)
-
-
-    # # 检测产品父品牌中品牌出现次数小于产品总数的a的产品
-    # father_brand_minor = []
-    # father_brand_list = []
-    # col='产品父品牌'
-    # valid_index=[]
-    # for i in df.index:
-    #     if df.loc[i, col] in invalid_list:
-    #         continue
-    #     valid_index.append(i)
-    # valid_df=df.loc[valid_index]
-    # father_brand_num = valid_df.groupby(by='产品父品牌')['产品编码'].count()  # 之所以用产品编码来计数是因为产品编码肯定不会有缺失值
-    # father_num_list = [x for x in father_brand_num.unique() if x < len(valid_df.index)*a]
-    # for i in father_brand_num.index:  # i 就是产品父品牌
-    #     if father_brand_num.loc[i] in father_num_list:  # father_brand_num.loc[i] 就是该父品牌出现次数
-    #         father_brand_list.append(i)
-    #
-    # for i in valid_df.index:
-    #     if valid_df.loc[i, '产品父品牌'] in father_brand_list:
-    #         father_brand_minor.append(i)
-    #
-    # # 检测产品品牌中品牌出现次数少的产品
-    # brand_minor = []
-    # brand_list = []
-    # col = '产品品牌'
-    # valid_index = []
-    # for i in df.index:
-    #     if df.loc[i, col] in invalid_list:
-    #         continue
-    #     valid_index.append(i)
-    # valid_df=df.loc[valid_index]
-    # brand_num = valid_df.groupby(by='产品品牌')['产品编码'].count()  # 同上
-    # num_list = [x for x in brand_num.unique() if x < len(valid_df.index)*a]
-    # for i in brand_num.index:
-    #     if brand_num.loc[i] in num_list:
-    #         brand_list.append(i)
-    #
-    # for i in valid_df.index:
-    #     if valid_df.loc[i, '产品品牌'] in brand_list:
-    #         brand_minor.append(i)
-
-    # 检测产品参数列中数据长度离该列平均数据长度2*std之外的产品，7是第一个产品参数列，-2是质保时间，-1是产品型号
+
+    # 检测产品参数列中数据长度离该列平均数据长度2*std之外的产品
    t4=time.time()
    print(t4-t3)
    print('开始检测错误长度')
+
    length_minor_dict = {}
-    for col in df_null['数据长度异常'][df_null['类别']==category][df_null['数据长度异常'].notnull()].values:
-        col_length = []
-        valid_index=[]
+    #   #从外部表找到用于数据长度判断的列名
+    cursor_zi_new.execute(f"select col_value from data_washing_external where category_name='{category}' and col_name='数据长度异常'")
+    length_col_fetch = cursor_zi_new.fetchall()
+    if not length_col_fetch:
+        print('该类别无‘数据长度异常’列名，或者类别名输入有误，请查证数据库')
+        return
+    length_col_list = []
+    for length_tuple in length_col_fetch:
+        length_col_list.append(length_tuple[0])
+
+    #   #按列找
+    for col in length_col_list:
+        col_length = []  # 包含该列每条数据的长度
+        valid_index=[]  # 同上
        for i in df.index:
            if df.loc[i, col] in invalid_list:
                continue
            valid_index.append(i)
            col_length.append(len(df.loc[i, col]))
-        if col_length:
+        if col_length:  # 有时候一列数据全都是无效数据所以加个条件
            std = np.array(col_length).std()
            mean = np.array(col_length).mean()
            for counter, length in enumerate(col_length):
                if length < mean - 2 * std or length > mean + 2 * std:
-                    # length_minor_dict[valid_index[counter]]=col
-                    index=valid_index[counter]
+                    index=valid_index[counter]  # valid_index和col_length是一一对应的
                    if index in length_minor_dict.keys():
                        length_minor_dict[index] += ' ' + col
                    elif index not in length_minor_dict.keys():
                        length_minor_dict[index] = col
+
    length_minor=[]
    length_minor.extend(length_minor_dict.keys())

@@ -179,8 +164,21 @@ def class_washing(category, filepath, b=0.01):
    t5=time.time()
    print(t5-t4)
    print('开始检测错误数据格式')
+
+    #   #从外部表中提取用于数据格式判断的列名
    format_minor_dict = {}
-    for col in df_null['数据格式异常'][df_null['类别']==category][df_null['数据格式异常'].notnull()].values:
+    cursor_zi_new.execute(
+        f"select col_value from data_washing_external where category_name='{category}' and col_name='数据格式异常'")
+    format_col_fetch = cursor_zi_new.fetchall()
+    if not format_col_fetch:
+        print('该类别无‘数据格式异常’列名，或者类别名输入有误，请查证数据库')
+        return
+    format_col_list = []
+    for format_tuple in format_col_fetch:
+        format_col_list.append(format_tuple[0])
+
+    #   #按列找
+    for col in format_col_list:
        counter_dict = {}
        valid_index = []
        for i in df.index:
@@ -218,19 +216,17 @@ def class_washing(category, filepath, b=0.01):

    format_minor=[]
    format_minor.extend(format_minor_dict.keys())
-        # length_record = []
-        # for keys in counter_dict:
-        #     if not length_record:
-        #         length_record.append([len(counter_dict[keys]), counter_dict[keys]])
-        #     elif len(counter_dict[keys]) < length_record[0][0]:
-        #         length_record[0] = [len(counter_dict[keys]), counter_dict[keys]]
-        #
-        # format_minor += length_record[0][1]
-
-    # 对于产品名称中没有“扫描仪”的，如果没有“高拍仪”就挑出来
+
+    # 对于产品名称的第二次筛选，比如在扫描仪中，对于产品名称中没有“扫描仪”的，如果没有“高拍仪”就挑出来
+    cursor_zi_new.execute(
+        f"select col_value from data_washing_external where category_name='{category}' and col_name='产品名称异常'")
+    name_col_fetch = cursor_zi_new.fetchall()
+    name_col_list = []
+    for name_tuple in name_col_fetch:
+        name_col_list.append(name_tuple[0])
    not_in_name2 = []
    for i in not_in_name:
-        for special_name in df_null['产品名称异常'][df_null['类别']==category][df_null['产品名称异常'].notnull()].values:
+        for special_name in name_col_list:
            if special_name in df.loc[i,'产品名称']:
                break
        not_in_name2.append(i)
@@ -239,8 +235,18 @@ def class_washing(category, filepath, b=0.01):
    t6=time.time()
    print(t6-t5)
    print('开始检测标准参数')
+
+    cursor_zi_new.execute(
+        f"select col_value from data_washing_external where category_name='{category}' and col_name='标准参数异常'")
+    stdparam_col_fetch = cursor_zi_new.fetchall()
+    if not stdparam_col_fetch:
+        print('该类别无‘标准参数异常’列名，或者类别名输入有误，请查证数据库。代码继续运行')
+    stdparam_col_list = []
+    for stdparam_tuple in stdparam_col_fetch:
+        stdparam_col_list.append(stdparam_tuple[0])
+
    character_minor_dict = {}
-    for col_i in df_null['标准参数异常'][df_null['类别']==category][df_null['标准参数异常'].notnull()].values:
+    for col_i in stdparam_col_list:
        temp_list = []
        cursor_zi_new.execute(f"select stdvalue from ShuJuZiDian_Cfg where categoryname='{category}' and subtitle='{col_i.strip('*')}'")
        standard_value_fetchall=cursor_zi_new.fetchall()
@@ -265,24 +271,6 @@ def class_washing(category, filepath, b=0.01):
    for keys_i in character_minor_dict:
        character_minor.extend(character_minor_dict[keys_i])

-
-
-
-
-    #     cha_num = valid_df.groupby(by=col_i)['产品编码'].count()
-    #     num_list = [x for x in cha_num.unique() if x < len(valid_df.index)*b]
-    #     for i in cha_num.index:
-    #         if cha_num.loc[i] in num_list:
-    #             tempo_list.append(i)
-    #     for i in valid_df.index:
-    #         if valid_df.loc[i, col_i] in tempo_list:
-    #             tempo_list2.append(i)
-    #     character_minor_dict[col_i] = tempo_list2
-    #
-    # character_minor = []
-    # for keys_i in character_minor_dict:
-    #     character_minor.extend(character_minor_dict[keys_i])
-
    t7=time.time()
    print(t7-t6)
    print('开始整合数据')
@@ -292,7 +280,6 @@ def class_washing(category, filepath, b=0.01):
    index_minor.extend(format_minor)
    index_minor.extend(length_minor)
    index_minor.extend(wrong_brand)
-    # index_minor.extend(father_brand_minor)
    index_minor.extend(not_in_name2)
    index_minor.extend(dtype_minor)
    index_minor.extend(character_minor)
@@ -301,13 +288,25 @@ def class_washing(category, filepath, b=0.01):
    final_df = pd.DataFrame(np.zeros((len(index_minor), 7)), index=list(index_minor),
                            columns=['计数', '产品类别异常', '产品名称异常', '品牌异常', '数据类型异常', '数据格式异常', '数据长度异常'])

-    w_class = df_null['产品类别异常权重'][df_null['类别']==category][df_null['产品类别异常权重'].notnull()].values
-    w_format = df_null['数据格式异常权重'][df_null['类别']==category][df_null['数据格式异常权重'].notnull()].values
-    w_length = df_null['数据长度异常权重'][df_null['类别']==category][df_null['数据长度异常权重'].notnull()].values
-    w_brand = df_null['品牌异常权重'][df_null['类别']==category][df_null['品牌异常权重'].notnull()].values
-    w_name = df_null['产品名称异常权重'][df_null['类别']==category][df_null['产品名称异常权重'].notnull()].values
-    w_dtype = df_null['数据类型异常权重'][df_null['类别']==category][df_null['数据类型异常权重'].notnull()].values
-    w_stdparam = df_null['标准参数异常权重'][df_null['类别']==category][df_null['标准参数异常权重'].notnull()].values
+    weight_list=[]
+    weight_colname_list=['产品类别异常权重','数据格式异常权重','数据长度异常权重','品牌异常权重','产品名称异常权重','数据类型异常权重','标准参数异常权重']
+    for col_name in weight_colname_list:
+        cursor_zi_new.execute(
+            f"select col_value from data_washing_external where category_name='{category}' and col_name='{col_name}'")
+        weight_fetch=cursor_zi_new.fetchone()
+        if weight_fetch:
+            weight_list.append(weight_fetch[0])
+        else:
+            weight_list.append(1.0)
+
+    w_class = float(weight_list[0])
+    w_format = float(weight_list[1])
+    w_length = float(weight_list[2])
+    w_brand = float(weight_list[3])
+    w_name = float(weight_list[4])
+    w_dtype = float(weight_list[5])
+    w_stdparam = float(weight_list[6])
+
    for i in index_minor:
        count = 0
        if i in wrong_class:
@@ -322,9 +321,6 @@ def class_washing(category, filepath, b=0.01):
        if i in wrong_brand:
            count += w_brand
            final_df.loc[i, '品牌异常'] = 1
-        # if i in father_brand_minor:
-        #     count += 1
-        #     final_df.loc[i, '父品牌异常'] = 1
        if i in not_in_name2:
            count += w_name
            final_df.loc[i, '产品名称异常'] = 1
@@ -347,8 +343,4 @@ def class_washing(category, filepath, b=0.01):
 if __name__ == '__main__':
    category='扫描仪'
    filepath="E:\\ZDZC\\扫描仪参数确认.xlsx"
-    #c_list=[6,7,-4,-3]
-    # category = '扫描仪'
-    # filepath="E:\\ZDZC\\扫描仪参数确认.xlsx"
-    # c_list=[7,8,9]
    class_washing(category, filepath)
--- a/公共代码/产品重复型号分析.py
+++ b/公共代码/产品重复型号分析.py
@@ -10,16 +10,29 @@ import pandas as pd
 import re
 import numpy as np
 import xlsxwriter
-
-
-def product_washing(filepath, category,thre=1, a=0):
-    df_null = pd.read_excel(".\\异常数据表格.xlsx")
-    invalid_list = df_null['异常数据名称'].values
+import pymssql
+
+
+def product_washing(filepath, category, thre=1, a=0):
+    conn_zi_new = pymssql.connect(host='123.56.115.207', user='zgcprice3311', password='zgcprice20200628',
+                                  database='ZI_NEW', autocommit=True)
+    cursor_zi_new = conn_zi_new.cursor()
+    cursor_zi_new.execute(f"select col_value from data_washing_external where category_name='通用' and col_name='异常数据名称'")
+    invalid_list_fetch = cursor_zi_new.fetchall()
+    invalid_list = []
+    for invalid_tuple in invalid_list_fetch:
+        invalid_list.append(invalid_tuple[0])
    df=pd.read_excel(filepath, converters={'产品编码':str})
    df.drop(columns='Unnamed: 0', axis=1, inplace=True)
    for col in df.columns:
        df[col]=df[col].astype(str)

+    cursor_zi_new.execute(f"select col_value from data_washing_external where category_name='{category}' and col_name='重复参数项'")
+    other_parameters_fetch=cursor_zi_new.fetchall()
+    other_parameters=[]
+    for param in other_parameters_fetch:
+        other_parameters.append(param[0])
+
    related_product = []
    brand_grouped = df.groupby(by='产品品牌')
    for brand in brand_grouped:
@@ -86,7 +99,7 @@ def product_washing(filepath, category,thre=1, a=0):
            # brand_combined = temp_list1+temp_list2

            tempo_dict[i] = [set(combined)]
-            other_parameters=df_null['重复参数项'][df_null['类别']==category][df_null['重复参数项'].notnull()].values
+
            other_parameters_values=[]
            for parameter in other_parameters:
                other_parameters_values.append(brand[1].loc[i,parameter])

--- a/公共代码/导出库内参数数据.py
+++ b/公共代码/导出库内参数数据.py
@@ -162,5 +162,5 @@ def get_point_category_params_data(category):
    conn_zi_new.close()


-category = '扫描仪'
+category = '激光打印机'
 get_point_category_params_data(category)
\ No newline at end of file
--- a/公共代码/爬取数据分析.xlsx
+++ b/公共代码/爬取数据分析.xlsx