代码修改

d54af336 · Jialin · a5316846
Commit d54af336 authored Apr 12, 2021 by Jialin
5 changed files
--- a/公共代码/产品品牌分析.py
+++ b/公共代码/产品品牌分析.py
@@ -3,7 +3,6 @@


 import pandas as pd
-import numpy as np
 import re
 import xlsxwriter

@@ -11,10 +10,14 @@ import xlsxwriter
 def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
    # filepath:文件路径，thre为两个品牌下型号重合率阈值，inner_thre为两个品牌下某条型号内关键词重合率阈值，a为权重调整，sheet_name为表单名
    df = pd.read_excel(filepath, sheet_name=sheet_name, converters = {'产品编码':str})
+    df_null = pd.read_excel(".\\异常数据表格.xlsx")
+    invalid_list = df_null['异常数据名称'].values
    # 处理缺失值
-    col1=(df['产品品牌'] != '暂无数据') == ((df['产品品牌'] != '无参数，需补充') == (df['产品品牌'].notnull()))
-    col2=(df['产品型号'] != '暂无数据') == ((df['产品型号'] != '无参数，需补充') == (df['产品型号'].notnull()))
-    result = df.loc[df.index[col1==col2]]
+    valid_index=[]
+    for i in df.index:
+        if df.loc[i,'产品品牌'] not in invalid_list and df.loc[i,'产品型号'] not in invalid_list:
+            valid_index.append(i)
+    result = df.loc[valid_index]
    # 将df数据格式转为字符串
    for i in result.columns:
        result[i] = result[i].astype(str)
@@ -152,7 +155,7 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
                tempo_list.extend(word_list)
                related_brand3.append(tempo_list)

-    # 写入excel
+    #写入excel
    workbook = xlsxwriter.Workbook('./brand_filter.xlsx')
    bold_format = workbook.add_format({'bold': True})

@@ -162,11 +165,6 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
    worksheet.write('B1', '品牌B', bold_format)
    worksheet.write('C1', '正确品牌', bold_format)
    worksheet.write('D1', '方法', bold_format)
-    # worksheet.write('E1', '品牌B-2', bold_format)
-    # worksheet.write('F1', '品牌-2', bold_format)
-    # worksheet.write('G1', '品牌A-3', bold_format)
-    # worksheet.write('H1', '品牌B-3', bold_format)
-    # worksheet.write('I1', '品牌-3', bold_format)
    col = 0
    row = 1
    for list_i in related_brand1:
@@ -238,9 +236,6 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
            row -= 2
            col += 1
        row += 3
-
-
-
    workbook.close()

 if __name__ == '__main__':

--- a/公共代码/产品类别分析.py
+++ b/公共代码/产品类别分析.py
@@ -16,14 +16,19 @@ import numpy as np


 def class_washing(category, filepath, c_list,a=0.02, b=0.01):
+    df_null=pd.read_excel(".\\异常数据表格.xlsx")
+    invalid_list = df_null['异常数据名称'].values
    df = pd.read_excel(filepath,converters = {'产品编码':str})
    df.drop(columns='Unnamed: 0', axis=1, inplace=True)
    # 检测每列数据中 同一数据类型的产品数量少于产品总数量的b 的产品index.(可优化成字典形式存储读取)
    dtype_minor_dict = {}
    for col in df.columns:
        type_list = {}
-        valid_index = df.index[(df[col] != '暂无数据') == ((df[col] != '无参数，需补充') == (df[col].notnull()))]
-        for i in valid_index:
+        valid_index = []
+        for i in df.index:
+            if df.loc[i,col] in invalid_list:
+                continue
+            valid_index.append(i)
            data_type = type(df.loc[i, col])
            if data_type not in type_list:
                type_list[data_type] = 1
@@ -58,7 +63,12 @@ def class_washing(category, filepath, c_list,a=0.02, b=0.01):
    father_brand_minor = []
    father_brand_list = []
    col='产品父品牌'
-    valid_df=df.loc[df.index[(df[col] != '暂无数据') == ((df[col] != '无参数，需补充') == (df[col].notnull()))]]
+    valid_index=[]
+    for i in df.index:
+        if df.loc[i, col] in invalid_list:
+            continue
+        valid_index.append(i)
+    valid_df=df.loc[valid_index]
    father_brand_num = valid_df.groupby(by='产品父品牌')['产品编码'].count()  # 之所以用产品编码来计数是因为产品编码肯定不会有缺失值
    father_num_list = [x for x in father_brand_num.unique() if x < len(valid_df.index)*a]
    for i in father_brand_num.index:  # i 就是产品父品牌
@@ -73,7 +83,12 @@ def class_washing(category, filepath, c_list,a=0.02, b=0.01):
    brand_minor = []
    brand_list = []
    col = '产品品牌'
-    valid_df = df.loc[df.index[(df[col] != '暂无数据') == ((df[col] != '无参数，需补充') == (df[col].notnull()))]]
+    valid_index = []
+    for i in df.index:
+        if df.loc[i, col] in invalid_list:
+            continue
+        valid_index.append(i)
+    valid_df=df.loc[valid_index]
    brand_num = valid_df.groupby(by='产品品牌')['产品编码'].count()  # 同上
    num_list = [x for x in brand_num.unique() if x < len(valid_df.index)*a]
    for i in brand_num.index:
@@ -88,9 +103,13 @@ def class_washing(category, filepath, c_list,a=0.02, b=0.01):
    length_minor_dict = {}
    for col in df.columns[7:-2]:
        col_length = []
-        valid_index = df.index[(df[col] != '暂无数据') == ((df[col] != '无参数，需补充') == (df[col].notnull()))]
-        for i in valid_index:
+        valid_index=[]
+        for i in df.index:
+            if df.loc[i, col] in invalid_list:
+                continue
+            valid_index.append(i)
            col_length.append(len(df.loc[i, col]))
+        if col_length:
            std = np.array(col_length).std()
            mean = np.array(col_length).mean()
            for counter, length in enumerate(col_length):
@@ -107,7 +126,11 @@ def class_washing(category, filepath, c_list,a=0.02, b=0.01):
    format_minor_dict = {}
    for col in df.columns[7:-2]:
        counter_dict = {}
-        valid_index = df.index[(df[col] != '暂无数据') == ((df[col] != '无参数，需补充') == (df[col].notnull()))]
+        valid_index = []
+        for i in df.index:
+            if df.loc[i, col] in invalid_list:
+                continue
+            valid_index.append(i)
        for i in valid_index:
            counter_list = []
            k = df.loc[i, col]
@@ -161,7 +184,12 @@ def class_washing(category, filepath, c_list,a=0.02, b=0.01):
    for col_i in df.columns[c_list]:
        tempo_list = []
        tempo_list2 = []
-        valid_df=df.loc[df.index[(df[col_i] != '暂无数据') == ((df[col_i] != '无参数，需补充') == (df[col_i].notnull()))]]
+        valid_index = []
+        for i in df.index:
+            if df.loc[i, col_i] in invalid_list:
+                continue
+            valid_index.append(i)
+        valid_df = df.loc[valid_index]
        cha_num = valid_df.groupby(by=col_i)['产品编码'].count()
        num_list = [x for x in cha_num.unique() if x < len(valid_df.index)*b]
        for i in cha_num.index:

--- a/公共代码/产品重复型号分析.py
+++ b/公共代码/产品重复型号分析.py
@@ -13,6 +13,8 @@ import xlsxwriter


 def product_washing(filepath, thre=1, a=0):
+    df_null = pd.read_excel(".\\异常数据表格.xlsx")
+    invalid_list = df_null['异常数据名称'].values
    df=pd.read_excel(filepath, converters={'产品编码':str})
    df.drop(columns='Unnamed: 0', axis=1, inplace=True)
    for col in df.columns:
@@ -21,13 +23,13 @@ def product_washing(filepath, thre=1, a=0):
    related_product = []
    brand_grouped = df.groupby(by='产品品牌')
    for brand in brand_grouped:
-        if brand[0]=='无参数，需补充':
+        if brand[0] in invalid_list:
            continue
        tempo_dict = {}
        # 每个品牌提取产品型号关键字，放入tempo_dict
        for i in brand[1].index:
            k = brand[1].loc[i, '产品型号']
-            if k=='无参数，需补充':
+            if k in invalid_list:
                continue
            pre_num = re.search(r'([A-Za-z]{0,4})\W?\d+', k)  # pre_num为数字前的关键字
            if pre_num:
@@ -56,6 +58,8 @@ def product_washing(filepath, thre=1, a=0):
                    pos_num += '+'

            combined = pre_num + num + pos_num  # 将关键字列表合并
+            while '' in combined:
+                combined.remove('')
            # pre_num = re.findall(r'^([A-Za-z]{0,4})\W?\d+', k)  # pre_num为数字前的关键字
            # if not pre_num:
            #     pre_num = re.findall(r'[\u4e00-\u9fa5]+([A-Za-z]{0,4})\W?\d+', k)
@@ -74,30 +78,29 @@ def product_washing(filepath, thre=1, a=0):
            #
            # combined = pre_num + num + pos_num  # 将关键字列表合并

-            # 提取品牌名关键字
-            temp_list1 = re.findall(r'([\u4e00-\u9fa5]+)', brand[0].upper())  # 提取汉字
-            while '新建品牌' in temp_list1:
-                temp_list1.remove('新建品牌')  # 去除‘新建品牌’
-            temp_list2 = re.findall(r'[A-Za-z]+\W?[A-Za-z]+', brand[0].upper())  # 提取英文单词
-            brand_combined = temp_list1+temp_list2
+            # # 提取品牌名关键字
+            # temp_list1 = re.findall(r'([\u4e00-\u9fa5]+)', brand[0].upper())  # 提取汉字
+            # while '新建品牌' in temp_list1:
+            #     temp_list1.remove('新建品牌')  # 去除‘新建品牌’
+            # temp_list2 = re.findall(r'[A-Za-z]+\W?[A-Za-z]+', brand[0].upper())  # 提取英文单词
+            # brand_combined = temp_list1+temp_list2
+

-            while '' in combined:
-                combined.remove('')

-            tempo_dict[i] = [set(brand_combined), set(combined)]
+            tempo_dict[i] = [set(combined), brand[1].loc[i, '*质保时间'], brand[1].loc[i, '标配外服务及配件']]

        # 对比产品型号关键字，相同则放入related_product
        tested_product=[]
        for i in tempo_dict:
            for j in tempo_dict:
                if i != j and set([i,j]) not in tested_product:
-                    if tempo_dict[i][0]==tempo_dict[j][0]:
+                    if tempo_dict[i][1:]==tempo_dict[j][1:]:
                        accuracy_i=0
                        accuracy_j=0
-                        for word_i in tempo_dict[i][1]:
-                            if word_i in tempo_dict[j][1]:
-                                accuracy_i += 1/(len(tempo_dict[i][1]) + a/len(tempo_dict[i][1]))
-                                accuracy_j += 1/(len(tempo_dict[j][1]) + a/len(tempo_dict[j][1]))
+                        for word_i in tempo_dict[i][0]:
+                            if word_i in tempo_dict[j][0]:
+                                accuracy_i += 1/(len(tempo_dict[i][0]) + a/len(tempo_dict[i][0]))
+                                accuracy_j += 1/(len(tempo_dict[j][0]) + a/len(tempo_dict[j][0]))
                        if accuracy_i >= thre or accuracy_j >= thre:
                            if not (df.loc[i,'产品型号'].endswith('+') ^ df.loc[j,'产品型号'].endswith('+')):
                                related_product.append(set([i,j]))

--- a/公共代码/扫描仪产品品牌分析.xlsx
+++ b/公共代码/扫描仪产品品牌分析.xlsx
--- a/公共代码/爬虫信息分析.py
+++ b/公共代码/爬虫信息分析.py
@@ -33,8 +33,8 @@ def pachong_washing(filepath):
        if num:
            num = num.group(1)  # 如果连续数字超过1处，我们只提取第一处，这个可能有点问题但目前还不知道怎么办
        if not num:  # 如果没有数字，就比较英文单词
-            num=re.findall(r'[A-Za-z]+', k)
-            num=''.join(num)
+            alpha=re.findall(r'[A-Za-z]+', k)
+            alpha=''.join(alpha)

        pos_num = re.findall(r'\d+([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?',
                             k)  # pos_num为数字后的关键字
@@ -55,8 +55,8 @@ def pachong_washing(filepath):
        comparing_df.loc[i, '爬取数据pre_num'] = pre_num
        if num:
            comparing_df.loc[i, '爬取数据num'] = num
-        # if not num:
-        #     comparing_df.loc[i, '爬取数据alpha'] = alpha
+        if not num:
+            comparing_df.loc[i, '爬取数据alpha'] = alpha
        comparing_df.loc[i, '爬取数据pos_num'] = pos_num

    for i in df.index:
@@ -73,8 +73,8 @@ def pachong_washing(filepath):
        if num:
            num = num.group(1)  # 如果连续数字超过1处，我们只提取第一处，这个可能有点问题但目前还不知道怎么办
        if not num:  # 如果没有数字，就比较英文单词
-            num = re.findall(r'[A-Za-z]+', k)
-            num = ''.join(num)
+            alpha = re.findall(r'[A-Za-z]+', k)
+            alpha = ''.join(alpha)

        pos_num = re.findall(
            r'\d+([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?',
@@ -106,10 +106,17 @@ def pachong_washing(filepath):
                    df.loc[i,col] = '暂无数据'
                continue

-        if comparing_df.loc[i,'补充型号num'] != comparing_df.loc[i,'爬取数据num']:
+        if comparing_df.loc[i,'补充型号num'] != comparing_df.loc[i,'爬取数据num']:  # 如果没有num，此处为nan, nan!=nan所以没问题
+            if type(comparing_df.loc[i,'补充型号num']) != float:
                    for col in df.columns[15:21]:
                        df.loc[i, col] = '暂无数据'
                    continue
+            else:
+                if comparing_df.loc[i, '补充型号alpha'] not in comparing_df.loc[i, '爬取数据alpha']:
+                    for col in df.columns[15:21]:
+                        df.loc[i, col] = '暂无数据'
+                    continue
+


        if comparing_df.loc[i,'补充型号pos_num'] != comparing_df.loc[i,'爬取数据pos_num']:
@@ -117,7 +124,7 @@ def pachong_washing(filepath):
                df.loc[i, col] = '暂无数据'
            continue

-    df.to_excel('./after_lijie.xlsx')
+    df.to_excel('./爬取数据分析.xlsx')


 if __name__ == '__main__':