Commit 9baca2a8 authored by LIANGZEYAN's avatar LIANGZEYAN

激光打印机型号提取

parent 2a685cbb
# coding:utf-8
import re
import pandas as pd
"""
Created on Tue May 25 14:56:22 2020
@author: SoreLemon
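@Target: Extract model numbers for laser printers.
@Input: Path of the source data file (usually an Excel .xlsx workbook), set in the pd.read_excel call near the bottom of this file.
@Output: A list of the extracted models (model_extract_list). For easier inspection, find_nonmatch_model_id can also write two Excel files: one with the rows where a model was found and one with the rows where no model was found.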
"""
# Placeholder returned whenever no usable model can be extracted.
_NO_MODEL = '暂无客户数据'

# Four-digit strings (2010-2021) rejected by the last-resort digit scan.
_REJECTED_YEARS = frozenset(str(year) for year in range(2010, 2022))

# NOTE: inside a character class '|' is a literal pipe, so classes such as
# [a-z|A-Z] also accept '|'. The patterns are kept exactly as they were.

# Product-name patterns tried in order once the main pass produced nothing.
_FALLBACK_PATTERNS = (
    r'[a-z|A-Z]+\ [0-9]+\+',
    r'[A-Z]+\ [0-9]+[0-9]+',
    r'[0-9]+[0-9]+[0-9]+[A-Z|a-z]+',
    r'[0-9]+\+',
    r'[0-9]+[0-9]+[0-9]+\ [A-Z|a-z]+',
    r'[a-z|A-Z]+[0-9]+-[0-9]+',
)

# Product-name patterns tried in order when the extracted model is longer
# than 11 characters.
_LONG_MODEL_PATTERNS = (
    r'[a-z|A-Z]+-[0-9]+[a-z|A-Z]+',
    r'[a-z|A-Z]+[0-9]+-[a-z|A-Z]+-[a-z|A-Z]+',
    r'[a-z|A-Z]+-[0-9]+',
    r'[A-Z][0-9]+[A-Z|a-z]+',
    r'[a-z|A-Z]+[0-9]+[a-z|A-Z]+',
    r'[0-9]+[0-9]+[0-9]+[A-Z|a-z]+',
    r'[A-Z|a-z]+[0-9]+[0-9]+[0-9]+',
    r'[0-9]+[0-9]+[0-9]+\ [A-Z|a-z]+',
    r'[A-Z|a-z]+\ [0-9]+[0-9]+[0-9]+',
    r'[A-Z|a-z]+-[0-9]+[0-9]+[0-9]+',
)


def _first_match(pattern, text):
    """Return the leftmost match of pattern in text (its group 1 if the
    pattern has a group), or None when there is no match."""
    found = re.search(pattern, text)
    if found is None:
        return None
    return found.group(1) if found.re.groups else found.group(0)


def _first_of(patterns, text):
    """Return the result of the first pattern in patterns that matches text,
    or None when none of them match."""
    for pattern in patterns:
        hit = _first_match(pattern, text)
        if hit is not None:
            return hit
    return None


def _scan_name_and_params(row, row1):
    """Run the ordered name/params rules and return the first candidate model.

    row is the cleaned product name, row1 the cleaned parameter text.
    Returns the placeholder when no rule matches.
    """
    hit = _first_of(
        (
            r'[a-z|A-Z]+[0-9]+-[a-z|A-Z]+-[a-z|A-Z]+',
            r'[a-z|A-Z]+[0-9]+-[0-9]+[a-z|A-Z|]+',
        ),
        row,
    )
    if hit is not None:
        return hit

    hit = _first_match(r"型号:([a-z|A-Z]{1,7}-[a-z|A-Z|0-9|+]+),", row1)
    if hit is not None:
        return hit

    # The guard is stricter than the extraction: the value is only taken when
    # some "型号:" entry consists of the restricted character set.
    if re.search(r"型号:[a-z|A-Z|0-9|-|+]+,", row1):
        return _first_match(r"型号:(.+?),", row1)

    hit = _first_of(
        (
            r'[a-z|A-Z]{1,7}-[a-z|A-Z|0-9|+]+',
            r'[a-z|A-Z]+[0-9]+[0-9|a-z|A-Z|+]+',
        ),
        row,
    )
    if hit is not None:
        return hit

    hit = _first_match(r"[a-z]+[0-9]+|[A-Z]+[0-9]+", row)
    if hit is not None:
        # The original tested `model == 'A3' or 'A4'`, which is always true,
        # so the letters+digits match itself was never used. Only a paper
        # size is rejected now.
        if hit in ('A3', 'A4'):
            alternative = _first_match(r"[0-9]+[a-z]+|[0-9]+[A-Z]+", row)
            return _NO_MODEL if alternative is None else alternative
        return hit

    hit = _first_match(r"[A-Z]+[0-9]+[A-Z]+", row1)
    if hit is not None:
        return hit

    hit = _first_match(r"型号:(.+?),", row1)
    if hit is not None:
        return hit

    # "型号:" as the last entry before the closing brace. The original removed
    # every ':' from the text before this search, so the pattern (which needs
    # ':') could never match; the text is searched as-is now.
    hit = _first_match(r"型号:(.+?)}", row1)
    if hit is not None:
        return hit

    hit = _first_of((r"[A-Z|a-z| ]+", r"[0-9]+[A-Z]+"), row)
    return _NO_MODEL if hit is None else hit


def laserprinter_model_extract(productName, productParams, brand):
    """Extract a laser-printer model string from a product name and its parameters.

    The parameter text is searched for a "产品型号:" / "型号:" entry and the
    product name is scanned with an ordered series of regular expressions;
    the first rule that matches wins. The candidate is then validated:
    purely alphabetic, all-Chinese or empty candidates are rejected, and
    candidates longer than 11 characters are re-extracted from the name.
    Spaces are removed from the final result.

    Args:
        productName: Product name. None / NaN is treated as an empty name.
        productParams: Product parameters; converted with str() before
            searching.
        brand: Not used by the extraction; kept so existing callers still work.

    Returns:
        The extracted model, or '暂无客户数据' when nothing usable is found.
    """
    # Missing spreadsheet cells show up as None or NaN (NaN != NaN).
    if not isinstance(productName, str):
        is_missing = productName is None or productName != productName
        productName = "" if is_missing else str(productName)

    # Normalise full-width brackets so the bracket-stripping regex below sees
    # them (the original call replaced '(' with '(' and so did nothing).
    row = productName.replace('\uff08', '(').replace('\uff09', ')')
    # Paper sizes are not models.
    row = row.replace("A3", "").replace("A4", "")
    row1 = str(productParams).replace("'", "").replace(" ", "").replace("\n", "")

    if re.search(r"产品型号:[a-z|A-Z|0-9|-]+,", row1):
        declared = _first_match(r"产品型号:(.+?),", row1)
        if declared != "-":
            model = declared
        else:
            # "-" means the field is blank: drop bracketed text from the name
            # and take its first ASCII token. The stripped name is also what
            # the later validation steps search.
            row = re.sub(r'\(.*?\)', '', row)
            token = _first_match(r'[a-z|A-Z|0-9|-]+', row)
            # The original raised IndexError when the name had no such token.
            model = _NO_MODEL if token is None else token
    else:
        model = _scan_name_and_params(row, row1)

    if model.isalpha():
        # No digits at all (str.isalpha is also true for Chinese text, which
        # includes the placeholder): look for a digits+letters token instead.
        model = _first_match(r"[0-9]+[a-z]+|[0-9]+[A-Z]+", row)
        if model is None:
            model = _NO_MODEL
    elif all('\u4e00' <= char <= '\u9fa5' for char in model):
        # A candidate made only of Chinese characters (or empty) is invalid.
        model = _NO_MODEL

    model = model.strip()
    if not model:
        model = _NO_MODEL

    if model == _NO_MODEL:
        model = _first_of(_FALLBACK_PATTERNS, row)
        if model is None:
            # Last resort: a run of four or more digits, unless it looks like
            # a year.
            digits = _first_match(r'[0-9]+[0-9]+[0-9]+[0-9]+', row)
            if digits is None or digits in _REJECTED_YEARS:
                model = _NO_MODEL
            else:
                model = digits
    elif len(model) > 11:
        # NOTE(review): the length check is applied only to models that did
        # not come from the fallback scan above - confirm this is intended.
        model = _first_of(_LONG_MODEL_PATTERNS, row)
        if model is None:
            model = _NO_MODEL

    return model.replace(" ", "")
# Regex-tuning aid: report which products did and did not get a model.
def find_nonmatch_model_id(model_extract_list,id_list,sup_p_name_list):
    """Split the rows into "model found" / "model not found" and export both.

    A row counts as "not found" when its extracted model contains the
    placeholder '暂无客户数据'. Two Excel files are written to the current
    working directory (one per group), and the size of each group is printed.

    Args:
        model_extract_list: Extracted model for each row.
        id_list: Row IDs; only its length is used, to decide how many rows
            are reported.
        sup_p_name_list: Product name for each row.

    Raises:
        IndexError: If either the model list or the name list is shorter
            than id_list.
    """
    row_count = len(id_list)
    if len(model_extract_list) < row_count or len(sup_p_name_list) < row_count:
        raise IndexError("model/name lists are shorter than id_list")

    found_names = []
    found_models = []
    missing_names = []
    missing_models = []
    # Rows are paired by position. The original went through dicts keyed by
    # ID, so rows sharing an ID were all reported with the last row's values.
    for _, model, name in zip(id_list, model_extract_list, sup_p_name_list):
        if '暂无客户数据' in model:
            missing_names.append(name)
            missing_models.append(model)
        else:
            found_names.append(name)
            found_models.append(model)

    # The "产品名称" columns hold names and the "产品型号" columns hold models;
    # the original filled them the other way round.
    dict_notfind_model = {
        '未找到产品型号的产品名称': missing_names,
        '未找到产品型号的产品型号': missing_models,
    }
    dict_find_model = {
        '找到产品型号的产品名称': found_names,
        '找到产品型号的产品型号': found_models,
    }
    df_notfind = pd.DataFrame(dict_notfind_model, columns = ['未找到产品型号的产品名称', '未找到产品型号的产品型号'])
    df_find = pd.DataFrame(dict_find_model, columns = ['找到产品型号的产品名称', '找到产品型号的产品型号'])
    df_notfind.to_excel(r'激光打印机客户数据0511_妹找到型号.xlsx', index = False)
    df_find.to_excel(r'激光打印机客户数据0511_找到了型号.xlsx', index = False)
    print("找到型号的数量为")
    print(len(found_models))
    print("妹找到型号的数量为")
    print(len(missing_models))
# Read the first sheet of the source workbook; every column used below is
# read as text.
df = pd.read_excel(
    '激光打印机客户数据0511.xlsx',
    sheet_name=0,
    converters={
        column: str
        for column in ('ID', 'SUP_P_NAME', 'SUP_P_PARAMS', 'ZD_P_LASTCATEGORY_NAME', 'ZD_P_BRAND_NAME')
    },
)
# Alternative input (scanner data), kept for reference:
#df = pd.read_excel('扫描仪数据_20210513.xlsx',sheet_name = 0,converters={'ID':str,'SUP_P_NAME':str,'SUP_P_PARAMS':str,'ZD_P_LASTCATEGORY_NAME':str,'ZD_P_BRAND_NAME':str})

# One plain Python list per column.
(
    id_list,
    sup_p_name_list,
    sup_p_params_list,
    zd_p_brand_name_list,
    zd_p_lastcategory_name_list,
) = (
    df[column].tolist()
    for column in ('ID', 'SUP_P_NAME', 'SUP_P_PARAMS', 'ZD_P_BRAND_NAME', 'ZD_P_LASTCATEGORY_NAME')
)

# Extract one model per row from its name, params and brand.
model_extract_list = [
    laserprinter_model_extract(name, params, brand_name)
    for name, params, brand_name in zip(sup_p_name_list, sup_p_params_list, zd_p_brand_name_list)
]
# Uncomment to export the found / not-found reports:
#find_nonmatch_model_id(model_extract_list,id_list,sup_p_name_list)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment