李佳林

e6658a7a · Jialin · ed7d80ba · e6658a7a · e6658a7a · e6658a7a
Commit e6658a7a authored Apr 01, 2021 by Jialin
4 changed files
--- a/公共代码/爬虫信息分析.py
+++ b/公共代码/爬虫信息分析.py
+#!/usr/bin/env python
+# -*- coding:utf-8 -*-
+"""
+@author: dell
+@file: after_lijie.py
+@time: 2021/03/31
+@desc:
+"""
+import pandas as pd
+import numpy as np
+import re
+import xlsxwriter
+
+def pachong_washing(filepath):
+    df=pd.read_excel(filepath, converters={'产品编码':str})
+    #df.drop(columns='Unnamed: 0', axis=1, inplace=True)
+    for col in ['补充后型号','爬取名称']:
+        df[col]=df[col].astype(str)
+
+    # 爬取名称关键词提取
+    comparing_df=pd.DataFrame()
+    for i in df.index:
+        df.loc[i,'爬取名称']=df.loc[i,'爬取名称'].upper()
+        k=df.loc[i,'爬取名称']
+
+        pre_num = re.search(r'([A-Za-z]{0,4})\W?\d+', k)  # pre_num为数字前的关键字
+        if pre_num:
+            pre_num = pre_num.group(1)
+        if not pre_num:
+            pre_num=''
+
+        num = re.search(r'(\d+)', k)  # num为数字关键字
+        if num:
+            num = num.group(1)  # 如果连续数字超过1处，我们只提取第一处，这个可能有点问题但目前还不知道怎么办
+        if not num:  # 如果没有数字，就比较英文单词
+            num=re.findall(r'[A-Za-z]+', k)
+            num=''.join(num)
+
+        pos_num = re.findall(r'\d+([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?',
+                             k)  # pos_num为数字后的关键字
+        if pos_num:
+            pos_num = ''.join(pos_num[0])
+            if '升级版' in k:
+                pos_num += '升级版'
+            if '专业版' in k:
+                pos_num += '专业版'
+            if '教育版' in k:
+                pos_num += '教育版'
+            if '+' in k:
+                pos_num += '+'
+
+        if not pos_num:
+            pos_num=''
+
+        comparing_df.loc[i, '爬取数据pre_num'] = pre_num
+        if num:
+            comparing_df.loc[i, '爬取数据num'] = num
+        # if not num:
+        #     comparing_df.loc[i, '爬取数据alpha'] = alpha
+        comparing_df.loc[i, '爬取数据pos_num'] = pos_num
+
+    for i in df.index:
+        df.loc[i, '补充后型号'] = df.loc[i, '补充后型号'].upper()
+        k=df.loc[i, '补充后型号']
+
+        pre_num = re.search(r'([A-Za-z]{0,4})\W?\d+', k)  # pre_num为数字前的关键字
+        if pre_num:
+            pre_num = pre_num.group(1)
+        if not pre_num:
+            pre_num = ''
+
+        num = re.search(r'(\d+)', k)  # num为数字关键字
+        if num:
+            num = num.group(1)  # 如果连续数字超过1处，我们只提取第一处，这个可能有点问题但目前还不知道怎么办
+        if not num:  # 如果没有数字，就比较英文单词
+            num = re.findall(r'[A-Za-z]+', k)
+            num = ''.join(num)
+
+        pos_num = re.findall(
+            r'\d+([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?',
+            k)  # pos_num为数字后的关键字
+        if pos_num:
+            pos_num = ''.join(pos_num[0])
+            if '升级版' in k:
+                pos_num += '升级版'
+            if '专业版' in k:
+                pos_num += '专业版'
+            if '教育版' in k:
+                pos_num += '教育版'
+            if '+' in k:
+                pos_num += '+'
+        if not pos_num:
+            pos_num=''
+
+        comparing_df.loc[i, '补充型号pre_num'] = pre_num
+        if num:
+            comparing_df.loc[i, '补充型号num'] = num
+        if not num:
+            comparing_df.loc[i, '补充型号alpha'] = alpha
+        comparing_df.loc[i, '补充型号pos_num'] = pos_num
+
+    for i in df.index:
+        if comparing_df.loc[i,'补充型号pre_num'] != '' and comparing_df.loc[i,'爬取数据pre_num'] != '':
+            if comparing_df.loc[i,'补充型号pre_num'] != comparing_df.loc[i,'爬取数据pre_num']:
+                for col in df.columns[15:21]:
+                    df.loc[i,col] = '暂无数据'
+                continue
+
+        if comparing_df.loc[i,'补充型号num'] != comparing_df.loc[i,'爬取数据num']:
+            for col in df.columns[15:21]:
+                df.loc[i, col] = '暂无数据'
+            continue
+
+
+        if comparing_df.loc[i,'补充型号pos_num'] != comparing_df.loc[i,'爬取数据pos_num']:
+            for col in df.columns[15:21]:
+                df.loc[i, col] = '暂无数据'
+            continue
+
+    df.to_excel('./after_lijie.xlsx')
+
+
+if __name__ == '__main__':
+    filepath = "E:\\ZDZC\\扫描仪参数确认(爬虫).xlsx"
+    pachong_washing(filepath)
\ No newline at end of file
--- a/公共代码/错误产品分析.py
+++ b/公共代码/错误产品分析.py
+#!/usr/bin/env python
+# -*- coding:utf-8 -*-
+"""
+@author: dell
+@file: product_filter.py
+@time: 2021/03/29
+@desc:
+"""
+import pandas as pd
+import re
+import numpy as np
+import xlsxwriter
+
+
+def product_washing(filepath, thre=1, a=0):
+    df=pd.read_excel(filepath, converters={'产品编码':str})
+    df.drop(columns='Unnamed: 0', axis=1, inplace=True)
+    for col in df.columns:
+        df[col]=df[col].astype(str)
+
+    related_product = []
+    brand_grouped = df.groupby(by='产品品牌')
+    for brand in brand_grouped:
+        if brand[0]=='无参数，需补充':
+            continue
+        tempo_dict = {}
+        # 每个品牌提取产品型号关键字，放入tempo_dict
+        for i in brand[1].index:
+            k = brand[1].loc[i, '产品型号']
+            if k=='无参数，需补充':
+                continue
+            pre_num = re.search(r'([A-Za-z]{0,4})\W?\d+', k)  # pre_num为数字前的关键字
+            if pre_num:
+                pre_num = [pre_num.group(1)]
+            if not pre_num:
+                pre_num = []
+
+            num = re.search(r'(\d+)', k)  # num为数字关键字
+            if num:
+                num = [num.group(1)]  # 如果连续数字超过1处，我们只提取第一处，这个可能有点问题但目前还不知道怎么办
+            if not num:  # 如果没有数字，就比较英文单词
+                num = re.findall(r'[A-Za-z]+', k)
+
+            pos_num = re.findall(
+                r'\d+([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?',
+                k)  # pos_num为数字后的关键字
+            if pos_num:
+                pos_num = list(pos_num[0])
+                if '升级版' in k:
+                    pos_num += '升级版'
+                if '专业版' in k:
+                    pos_num += '专业版'
+                if '教育版' in k:
+                    pos_num += '教育版'
+                if '+' in k:
+                    pos_num += '+'
+
+            combined = pre_num + num + pos_num  # 将关键字列表合并
+            # pre_num = re.findall(r'^([A-Za-z]{0,4})\W?\d+', k)  # pre_num为数字前的关键字
+            # if not pre_num:
+            #     pre_num = re.findall(r'[\u4e00-\u9fa5]+([A-Za-z]{0,4})\W?\d+', k)
+            # if not pre_num:
+            #     pre_num = re.findall(r'\b([A-Za-z]{0,4})\W?\d+', k)
+            # if pre_num:
+            #     pre_num = [pre_num[0]]
+            #
+            # num = re.findall(r'\d+', k)  # num为数字关键字
+            # if num:
+            #     num = [num[0]]
+            # pos_num = re.findall(r'\d+([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?',
+            #                      k)  # pos_num为数字后的关键字
+            # if pos_num:
+            #     pos_num = list(pos_num[0])
+            #
+            # combined = pre_num + num + pos_num  # 将关键字列表合并
+
+            # 提取品牌名关键字
+            temp_list1 = re.findall(r'([\u4e00-\u9fa5]+)', brand[0].upper())  # 提取汉字
+            while '新建品牌' in temp_list1:
+                temp_list1.remove('新建品牌')  # 去除‘新建品牌’
+            temp_list2 = re.findall(r'[A-Za-z]+\W?[A-Za-z]+', brand[0].upper())  # 提取英文单词
+            brand_combined = temp_list1+temp_list2
+
+            while '' in combined:
+                combined.remove('')
+
+            tempo_dict[i] = [set(brand_combined), set(combined)]
+
+        # 对比产品型号关键字，相同则放入related_product
+        tested_product=[]
+        for i in tempo_dict:
+            for j in tempo_dict:
+                if i != j and set([i,j]) not in tested_product:
+                    if tempo_dict[i][0]==tempo_dict[j][0]:
+                        accuracy_i=0
+                        accuracy_j=0
+                        for word_i in tempo_dict[i][1]:
+                            if word_i in tempo_dict[j][1]:
+                                accuracy_i += 1/(len(tempo_dict[i][1]) + a/len(tempo_dict[i][1]))
+                                accuracy_j += 1/(len(tempo_dict[j][1]) + a/len(tempo_dict[j][1]))
+                        if accuracy_i >= thre or accuracy_j >= thre:
+                            if not (df.loc[i,'产品型号'].endswith('+') ^ df.loc[j,'产品型号'].endswith('+')):
+                                related_product.append(set([i,j]))
+                    tested_product.append(set([i,j]))
+
+            # a = set([i])
+            # for j in tempo_dict:
+            #     if tempo_dict[i] == tempo_dict[j]:
+            #         a.add(j)
+            # if len(a) > 1:
+            #     related_product.append(a)
+        # for i in tempo_dict:
+        #     for j in tempo_dict:
+        #         if i != j:
+        #             if tempo_dict[i] == tempo_dict[j]:
+        #                 related_product.append(set([i, j]))
+
+    # # 这一步为去重
+    # product_unique=[]
+    # for item in related_product:
+    #     if item not in product_unique:
+    #         product_unique.append(item)
+
+    # 将数据导出到excel表格。重复的产品数据两两并列 和其他的重复数据组中间空一行
+    workbook = xlsxwriter.Workbook('./product_filter.xlsx')
+    bold_format = workbook.add_format({'bold': True})
+    worksheet = workbook.add_worksheet()
+    col = 0
+    row = 0
+    worksheet.write_string(row, col, 'Index', bold_format)
+    col += 1
+    for column in df.columns:
+        worksheet.write_string(row, col, column, bold_format)
+        col += 1
+    worksheet.write_string(row, col, '正确产品编号', bold_format)
+    row = 1
+    col = 0
+    for item in related_product:
+        for inner_item in item:
+            worksheet.write_string(row, col, str(inner_item))
+            for value in df.loc[inner_item].values:
+                col += 1
+                worksheet.write_string(row, col, value)
+            col = 0
+            row += 1
+        row += 1
+
+    workbook.close()
+
+
+if __name__ == '__main__':
+    filepath = "E:\\ZDZC\\激光打印机参数确认.xlsx"
+    product_washing(filepath)
--- a/公共代码/错误品牌分析.py
+++ b/公共代码/错误品牌分析.py
--- a/公共代码/错误类别分析.py
+++ b/公共代码/错误类别分析.py
+#!/usr/bin/env python
+# -*- coding:utf-8 -*-
+"""
+@author: dell
+@file: class_washing.py
+@time: 2021/03/26
+@desc:
+"""
+import pandas as pd
+import re
+import numpy as np
+
+# category为产品类型
+# a是同一品牌或父品牌下产品数量占产品总数量的百分比，作为阈值，a越大，有异常的产品越多；b和a一样，只是用于产品数据类型和参数
+# c_list是产品参数中，数据类型较为统一的参数 在excel列名中的位置，从0开始，必须是一个list
+
+
+def class_washing(category, filepath, c_list,a=0.02, b=0.01):
+    df = pd.read_excel(filepath,converters = {'产品编码':str})
+    df.drop(columns='Unnamed: 0', axis=1, inplace=True)
+    # 检测每列数据中 同一数据类型的产品数量少于产品总数量的b 的产品index.(可优化成字典形式存储读取)
+    dtype_minor = []
+    for col in df.columns:
+        type_list = {}
+        valid_index = df.index[(df[col] != '暂无数据') == ((df[col] != '无参数，需补充') == (df[col].notnull()))]
+        for i in valid_index:
+            data_type = type(df.loc[i, col])
+            if data_type not in type_list:
+                type_list[data_type] = 1
+            elif data_type in type_list:
+                type_list[data_type] += 1
+        for data_type_i in type_list:
+            if type_list[data_type_i] < len(valid_index) * b:
+                for i in valid_index:
+                    if type(df.loc[i][col]) == data_type_i:
+                        dtype_minor.append(i)
+
+    # 在检测完产品数据类型后，将所有数据类型转换为string
+    for col in df.columns:
+        df[col] = df[col].astype(str)
+
+    # 检测产品类型错误的产品，和产品名称中不带有产品类型的产品。由于代码简单，就放在一起了
+    wrong_class = []
+    not_in_name = []
+    for i in df.index:
+        if df.loc[i, '产品类别'] != category:
+            wrong_class.append(i)
+        if category not in df.loc[i, '产品名称']:
+            not_in_name.append(i)
+
+    # 检测产品父品牌中品牌出现次数小于产品总数的a的产品
+    father_brand_minor = []
+    father_brand_list = []
+    col='产品父品牌'
+    valid_df=df.loc[df.index[(df[col] != '暂无数据') == ((df[col] != '无参数，需补充') == (df[col].notnull()))]]
+    father_brand_num = valid_df.groupby(by='产品父品牌')['产品编码'].count()  # 之所以用产品编码来计数是因为产品编码肯定不会有缺失值
+    father_num_list = [x for x in father_brand_num.unique() if x < len(valid_df.index)*a]
+    for i in father_brand_num.index:  # i 就是产品父品牌
+        if father_brand_num.loc[i] in father_num_list:  # father_brand_num.loc[i] 就是该父品牌出现次数
+            father_brand_list.append(i)
+
+    for i in valid_df.index:
+        if valid_df.loc[i, '产品父品牌'] in father_brand_list:
+            father_brand_minor.append(i)
+
+    # 检测产品品牌中品牌出现次数少的产品
+    brand_minor = []
+    brand_list = []
+    col = '产品品牌'
+    valid_df = df.loc[df.index[(df[col] != '暂无数据') == ((df[col] != '无参数，需补充') == (df[col].notnull()))]]
+    brand_num = valid_df.groupby(by='产品品牌')['产品编码'].count()  # 同上
+    num_list = [x for x in brand_num.unique() if x < len(valid_df.index)*a]
+    for i in brand_num.index:
+        if brand_num.loc[i] in num_list:
+            brand_list.append(i)
+
+    for i in valid_df.index:
+        if valid_df.loc[i, '产品品牌'] in brand_list:
+            brand_minor.append(i)
+
+    # 检测产品参数列中数据长度离该列平均数据长度2*std之外的产品，7是第一个产品参数列，-2是质保时间，-1是产品型号
+    length_minor = []
+    for col in df.columns[7:-2]:
+        col_length = []
+        valid_index = df.index[(df[col] != '暂无数据') == ((df[col] != '无参数，需补充') == (df[col].notnull()))]
+        for i in valid_index:
+            col_length.append(len(df.loc[i, col]))
+        std = np.array(col_length).std()
+        mean = np.array(col_length).mean()
+        for counter, length in enumerate(col_length):
+            if length < mean - 2 * std or length > mean + 2 * std:
+                length_minor.append(valid_index[counter])
+
+    # 检测产品参数列数据格式小于总数量的b的产品
+    format_minor = []
+    for col in df.columns[7:-2]:
+        counter_dict = {}
+        valid_index = df.index[(df[col] != '暂无数据') == ((df[col] != '无参数，需补充') == (df[col].notnull()))]
+        for i in valid_index:
+            counter_list = []
+            k = df.loc[i, col]
+            is_str = re.findall(r'[A-Za-z]+', k)
+            is_dig = re.findall(r'[0-9]+', k)
+            is_special = re.findall(r'\W+', k)
+            is_chinese = re.findall(r'[\u4e00-\u9fa5]+', k)
+            if is_str:
+                counter_list.append('str')
+            if is_dig:
+                counter_list.append('dig')
+            if is_special:
+                counter_list.append('special')
+            if is_chinese:
+                counter_list.append('chinese')
+            combined = ''.join(counter_list)
+            if combined not in counter_dict:
+                counter_dict[combined] = [i]
+            elif combined in counter_dict:
+                counter_dict[combined].append(i)
+
+        for keys in counter_dict:
+            if len(counter_dict[keys]) < len(valid_index)*b:
+                format_minor.extend(counter_dict[keys])
+        # length_record = []
+        # for keys in counter_dict:
+        #     if not length_record:
+        #         length_record.append([len(counter_dict[keys]), counter_dict[keys]])
+        #     elif len(counter_dict[keys]) < length_record[0][0]:
+        #         length_record[0] = [len(counter_dict[keys]), counter_dict[keys]]
+        #
+        # format_minor += length_record[0][1]
+
+    # 接下来是针对扫描仪的部分.对于特定产品，还可以从每个产品参数中选出少数派。如果知道易混淆的产品类型，还要特意加上挑取易混淆产品类型的代码
+    # 对于产品名称中没有“扫描仪”的，如果没有“高拍仪”就挑出来
+    not_in_name2 = []
+    for i in not_in_name:
+        if '高拍仪' not in df.loc[i,'产品名称']:
+            not_in_name2.append(i)
+
+    # 对于产品参数中，数据类型较少的参数，其中如果有数量小于产品总数量的b的，挑出来
+    character_minor_dict = {}
+    for col_i in df.columns[c_list]:
+        tempo_list = []
+        tempo_list2 = []
+        valid_df=df.loc[df.index[(df[col_i] != '暂无数据') == ((df[col_i] != '无参数，需补充') == (df[col_i].notnull()))]]
+        cha_num = valid_df.groupby(by=col_i)['产品编码'].count()
+        num_list = [x for x in cha_num.unique() if x < len(valid_df.index)*b]
+        for i in cha_num.index:
+            if cha_num.loc[i] in num_list:
+                tempo_list.append(i)
+        for i in valid_df.index:
+            if valid_df.loc[i, col_i] in tempo_list:
+                tempo_list2.append(i)
+        character_minor_dict[col_i] = tempo_list2
+
+    character_minor = []
+    for keys_i in character_minor_dict:
+        character_minor.extend(character_minor_dict[keys_i])
+
+    # 将挑出的可疑数据整合到一起 (wrong_class没加，因为里面的肯定不对）
+    index_minor = []
+    index_minor.extend(wrong_class)
+    index_minor.extend(format_minor)
+    index_minor.extend(length_minor)
+    index_minor.extend(brand_minor)
+    index_minor.extend(father_brand_minor)
+    index_minor.extend(not_in_name2)
+    index_minor.extend(dtype_minor)
+    index_minor.extend(character_minor)
+    index_minor = set(index_minor)
+
+    final_df = pd.DataFrame(np.zeros((len(index_minor), 8)), index=list(index_minor),
+                            columns=['计数', '产品类型异常', '产品名称异常', '父品牌异常', '品牌异常', '数据类型异常', '数据格式异常', '数据长度异常'])
+    for i in index_minor:
+        count = 0
+        if i in wrong_class:
+            count += 1
+            final_df.loc[i, '产品类型异常'] = 1
+        if i in format_minor:
+            count += 1
+            final_df.loc[i, '数据格式异常'] = 1
+        if i in length_minor:
+            count += 1
+            final_df.loc[i, '数据长度异常'] = 1
+        if i in brand_minor:
+            count += 1
+            final_df.loc[i, '品牌异常'] = 1
+        if i in father_brand_minor:
+            count += 1
+            final_df.loc[i, '父品牌异常'] = 1
+        if i in not_in_name2:
+            count += 1
+            final_df.loc[i, '产品名称异常'] = 1
+        if i in dtype_minor:
+            count += 1
+            final_df.loc[i, '数据类型异常'] = 1
+        for keys_i in character_minor_dict:
+            if i in character_minor_dict[keys_i]:
+                final_df.loc[i, keys_i + '异常'] = 1
+                count += 1
+            else:
+                final_df.loc[i, keys_i + '异常'] = 0
+        final_df.loc[i, '计数'] = count
+
+    final_df = pd.merge(final_df, df, how='left', left_index=True, right_index=True)
+    final_df = final_df.sort_values(by='计数', ascending=False)
+    final_df.to_excel("./class_filter.xlsx")
+
+
+if __name__ == '__main__':
+    category='激光打印机'
+    filepath="E:\\ZDZC\\激光打印机参数确认.xlsx"
+    c_list=[6,7,-4,-3]
+    class_washing(category, filepath, c_list)