激光打印机型号提取

9baca2a8 · LIANGZEYAN · 2a685cbb · 9baca2a8
Commit 9baca2a8 authored May 25, 2021 by LIANGZEYAN
Hide whitespace changes
Inline Side-by-side

Showing with 221 additions and 0 deletions

激光打印机型号提取.py 公共代码/激光打印机型号提取.py +221 -0

No files found.
--- a/公共代码/激光打印机型号提取.py
+++ b/公共代码/激光打印机型号提取.py
+# coding:utf-8
+import re
+import pandas as pd
+"""
+Created on Tue May 25 14:56:22 2020
+@author: SoreLemon
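+@Target: Extract the model number of laser printers from product names and parameters.
+@Input: Path of the source data file (usually an Excel .xlsx workbook), set in the pd.read_excel call near the end of this script.
+@Output: model_extract_list, a list containing the extracted models. For easier inspection, find_nonmatch_model_id can also write two Excel files (one with the rows whose model was extracted, one with the rows whose model was not).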
+"""
+# Placeholder returned whenever no usable model number can be extracted.
+_NO_MODEL = '暂无客户数据'
+
+# Four-digit numbers treated as years rather than model numbers.
+_YEAR_LIKE = frozenset([
+    '2010', '2011', '2012', '2013', '2014', '2015',
+    '2016', '2017', '2018', '2019', '2020', '2021',
+])
+
+# Last-resort patterns tried (in order) on the product name when nothing
+# else produced a model.
+_FALLBACK_PATTERNS = (
+    r'[a-z|A-Z]+\ [0-9]+\+',
+    r'[A-Z]+\ [0-9]+[0-9]+',
+    r'[0-9]+[0-9]+[0-9]+[A-Z|a-z]+',
+    r'[0-9]+\+',
+    r'[0-9]+[0-9]+[0-9]+\ [A-Z|a-z]+',
+    r'[0-9]+[0-9]+[0-9]+\  [A-Z|a-z]+',
+    r'[a-z|A-Z]+[0-9]+-[0-9]+',
+)
+
+# Patterns tried (in order) on the product name when the extracted model is
+# longer than 11 characters and therefore probably not a bare model number.
+_SHORTEN_PATTERNS = (
+    r'[a-z|A-Z]+-[0-9]+[a-z|A-Z]+',
+    r'[a-z|A-Z]+[0-9]+-[a-z|A-Z]+-[a-z|A-Z]+',
+    r'[a-z|A-Z]+-[0-9]+',
+    r'[A-Z][0-9]+[A-Z|a-z]+',
+    r'[a-z|A-Z]+[0-9]+[a-z|A-Z]+',
+    r'[0-9]+[0-9]+[0-9]+[A-Z|a-z]+',
+    r'[A-Z|a-z]+[0-9]+[0-9]+[0-9]+',
+    r'[0-9]+[0-9]+[0-9]+\ [A-Z|a-z]+',
+    r'[A-Z|a-z]+\ [0-9]+[0-9]+[0-9]+',
+    r'[A-Z|a-z]+-[0-9]+[0-9]+[0-9]+',
+)
+
+
+def _first_match(pattern, text):
+    """Return the first ``re.findall`` result of *pattern* in *text*, or None."""
+    found = re.findall(pattern, text)
+    return found[0] if found else None
+
+
+def _first_of(patterns, text):
+    """Return the first hit of the first pattern in *patterns* that matches, or None."""
+    for pattern in patterns:
+        hit = _first_match(pattern, text)
+        if hit is not None:
+            return hit
+    return None
+
+
+def _primary_candidate(row, row1):
+    """Run the main rule chain; return ``(model, row)``.
+
+    *row* is the cleaned product name and *row1* the cleaned parameter text.
+    *row* is returned as well because one rule strips parenthesised text from
+    it and every later stage must see that stripped version.
+    """
+    # Rule 1: an explicit "产品型号:" entry in the parameters wins.
+    if re.search(r"产品型号:[a-z|A-Z|0-9|-]+,", row1):
+        declared = _first_match(r"产品型号:(.+?),", row1)
+        if declared != "-":
+            return declared, row
+        # "-" is a placeholder value: fall back to the first alphanumeric run
+        # of the name with parenthesised text removed.
+        row = re.sub(r'\(.*?\)', '', row)
+        hit = _first_match(r'[a-z|A-Z|0-9|-]+', row)
+        # The original indexed [0] unconditionally and raised IndexError when
+        # the name held no such run; an empty candidate is returned instead.
+        return (hit if hit is not None else ""), row
+
+    # Rules 2-3: hyphenated shapes found directly in the product name.
+    hit = _first_of(
+        (
+            r'[a-z|A-Z]+[0-9]+-[a-z|A-Z]+-[a-z|A-Z]+',
+            r'[a-z|A-Z]+[0-9]+-[0-9]+[a-z|A-Z|]+',
+        ),
+        row,
+    )
+    if hit is not None:
+        return hit, row
+
+    # Rule 4: a "型号:" parameter of the form LETTERS-ALNUM.
+    hit = _first_match(r"型号:([a-z|A-Z]{1,7}-[a-z|A-Z|0-9|+]+),", row1)
+    if hit is not None:
+        return hit, row
+
+    # Rule 5: any "型号:" parameter made of model characters is present; the
+    # value taken is the first "型号:" entry up to the next comma.
+    if re.search(r"型号:[a-z|A-Z|0-9|-|+]+,", row1):
+        return _first_match(r"型号:(.+?),", row1), row
+
+    # Rules 6-7: looser shapes in the product name.
+    hit = _first_of(
+        (
+            r'[a-z|A-Z]{1,7}-[a-z|A-Z|0-9|+]+',
+            r'[a-z|A-Z]+[0-9]+[0-9|a-z|A-Z|+]+',
+        ),
+        row,
+    )
+    if hit is not None:
+        return hit, row
+
+    # Rule 8: the name contains a letters+digits token.
+    # NOTE(review): the original tested `model == 'A3' or 'A4'`, which is
+    # always true, so the letters+digits token itself was never used; only a
+    # digits+letters token is accepted here. That effective behaviour is kept.
+    if re.search(r"[a-z]+[0-9]+|[A-Z]+[0-9]+", row):
+        hit = _first_match(r"[0-9]+[a-z]+|[0-9]+[A-Z]+", row)
+        return (hit if hit is not None else _NO_MODEL), row
+
+    # Rules 9-11: remaining attempts on the parameter text.
+    hit = _first_of((r"[A-Z]+[0-9]+[A-Z]+", r"型号:(.+?),"), row1)
+    if hit is not None:
+        return hit, row
+    hit = _first_match(r"型号:(.+?)}", row1.replace("：", ""))
+    if hit is not None:
+        return hit, row
+
+    # Rules 12-13: remaining attempts on the product name.
+    hit = _first_of((r"[A-Z|a-z| ]+", r"[0-9]+[A-Z]+"), row)
+    if hit is not None:
+        return hit, row
+    return _NO_MODEL, row
+
+
+def _fallback_candidate(row):
+    """Try the last-resort patterns on *row*; return a model or the placeholder."""
+    hit = _first_of(_FALLBACK_PATTERNS, row)
+    if hit is not None:
+        return hit
+    # A bare run of four or more digits is accepted unless it looks like a year.
+    hit = _first_match(r'[0-9]+[0-9]+[0-9]+[0-9]+', row)
+    if hit is None or hit in _YEAR_LIKE:
+        return _NO_MODEL
+    return hit
+
+
+def laserprinter_model_extract(productName, productParams, brand):
+    """Extract a laser-printer model number from a product record.
+
+    Args:
+        productName: Product name. Converted with ``str()`` so non-string
+            cells (e.g. NaN) no longer raise AttributeError.
+        productParams: Product parameters; converted with ``str()`` and
+            searched for "产品型号:" / "型号:" entries.
+        brand: Unused; kept for interface compatibility.
+
+    Returns:
+        The extracted model with all spaces removed, or '暂无客户数据' when no
+        model could be determined.
+    """
+    # Normalise full-width parentheses and drop paper sizes, which would
+    # otherwise be mistaken for model tokens.
+    row = str(productName).replace('（', '(').replace('）', ')')
+    row = row.replace("A3", "").replace("A4", "")
+    row1 = str(productParams).replace("'", "").replace(" ", "").replace("\n", "")
+
+    model, row = _primary_candidate(row, row1)
+
+    if model.isalpha():
+        # Purely alphabetic candidates (this includes all-Chinese text and the
+        # placeholder itself) are not models; look for a digits+letters token.
+        hit = _first_match(r"[0-9]+[a-z]+|[0-9]+[A-Z]+", row)
+        model = hit if hit is not None else _NO_MODEL
+    elif model and '\u4e00' <= model[0] <= '\u9fa5':
+        # A mixed candidate that starts with a CJK character is discarded.
+        model = _NO_MODEL
+
+    model = model.strip()
+    if not model or model == _NO_MODEL:
+        model = _fallback_candidate(row)
+    elif len(model) > 11:
+        # Too long to be a bare model number: re-extract from the name.
+        hit = _first_of(_SHORTEN_PATTERNS, row)
+        model = hit if hit is not None else _NO_MODEL
+    return model.replace(" ", "")
+def find_nonmatch_model_id(model_extract_list,id_list,sup_p_name_list):
+    """Split products by whether a model was extracted, to help tune the regexes.
+
+    Writes two Excel files into the current working directory (one for rows
+    whose model was found, one for rows whose model is '暂无客户数据') and
+    prints the size of each group.
+
+    Args:
+        model_extract_list: Extracted model per product.
+        id_list: Product ids; one row is reported per entry.
+        sup_p_name_list: Product name per product.
+
+    Raises:
+        IndexError: If either value list is shorter than ``id_list``.
+    """
+    row_count = len(id_list)
+    if len(model_extract_list) < row_count or len(sup_p_name_list) < row_count:
+        raise IndexError("model_extract_list and sup_p_name_list must cover every id")
+
+    found_names, found_models = [], []
+    missing_names, missing_models = [], []
+    # Rows are paired positionally; the original went through id-keyed dicts,
+    # which made duplicate ids report the values of their last occurrence.
+    for _, model, name in zip(id_list, model_extract_list, sup_p_name_list):
+        if '暂无客户数据' in model:
+            missing_names.append(name)
+            missing_models.append(model)
+        else:
+            found_names.append(name)
+            found_models.append(model)
+
+    # The original filled the "product name" column with models and the
+    # "product model" column with names; each column now holds what its
+    # header says.
+    notfind_columns = ['未找到产品型号的产品名称', '未找到产品型号的产品型号']
+    find_columns = ['找到产品型号的产品名称', '找到产品型号的产品型号']
+    df_notfind = pd.DataFrame(
+        {notfind_columns[0]: missing_names, notfind_columns[1]: missing_models},
+        columns=notfind_columns,
+    )
+    df_find = pd.DataFrame(
+        {find_columns[0]: found_names, find_columns[1]: found_models},
+        columns=find_columns,
+    )
+    df_notfind.to_excel(r'激光打印机客户数据0511_妹找到型号.xlsx', index = False)
+    df_find.to_excel(r'激光打印机客户数据0511_找到了型号.xlsx', index = False)
+
+    print("找到型号的数量为")
+    print(len(found_models))
+    print("妹找到型号的数量为")
+    print(len(missing_models))
+# Load the first sheet of the source workbook, reading the listed columns as strings.
+df = pd.read_excel(
+    '激光打印机客户数据0511.xlsx',
+    sheet_name=0,
+    converters={
+        'ID': str,
+        'SUP_P_NAME': str,
+        'SUP_P_PARAMS': str,
+        'ZD_P_LASTCATEGORY_NAME': str,
+        'ZD_P_BRAND_NAME': str,
+    },
+)
+# Alternative input (scanner data), kept disabled:
+#df = pd.read_excel('扫描仪数据_20210513.xlsx',sheet_name = 0,converters={'ID':str,'SUP_P_NAME':str,'SUP_P_PARAMS':str,'ZD_P_LASTCATEGORY_NAME':str,'ZD_P_BRAND_NAME':str})
+
+# Pull each column out as a plain Python list.
+id_list = df['ID'].tolist()
+sup_p_name_list = df['SUP_P_NAME'].tolist()
+sup_p_params_list = df['SUP_P_PARAMS'].tolist()
+zd_p_brand_name_list = df['ZD_P_BRAND_NAME'].tolist()
+zd_p_lastcategory_name_list = df['ZD_P_LASTCATEGORY_NAME'].tolist()
+
+# Extract one model per row from (name, params, brand).
+model_extract_list = [
+    laserprinter_model_extract(name, params, brand_name)
+    for name, params, brand_name in zip(
+        sup_p_name_list, sup_p_params_list, zd_p_brand_name_list
+    )
+]
+# Enable to write the found / not-found Excel reports:
+#find_nonmatch_model_id(model_extract_list,id_list,sup_p_name_list)
\ No newline at end of file