Commit d0dfca0f authored by LIANGZEYAN's avatar LIANGZEYAN

激光打印机参数提取+修改命名简称获取方式

parent 682d6568
......@@ -10,12 +10,31 @@ import pymssql
import pandas as pd
from public import Index
import uuid
import ast
def get_simple_value():
    """Return the distinct entries found in the ``tip`` column of ``skuname_named_rule``.

    Every ``tip`` cell is parsed with ``ast.literal_eval`` and the parsed
    sequences are merged into one de-duplicated list.

    Returns:
        list: the merged entries, without duplicates and in no particular order.

    Raises:
        ValueError, SyntaxError: if a ``tip`` cell is not a valid Python literal.
        Database errors raised by ``pymssql`` are propagated unchanged.
    """
    # NOTE(review): host and credentials are hard-coded here; consider moving
    # them to configuration or environment variables.
    conn_zi_new = pymssql.connect(host='123.56.115.207', user='zgcprice3311', password='zgcprice20200628',
                                  database='ZI_NEW', autocommit=True)
    try:
        cursor = conn_zi_new.cursor()
        cursor.execute("select tip from skuname_named_rule")
        merged = set()
        # The query selects a single column, so each row is a 1-tuple.
        for (tip,) in cursor.fetchall():
            # presumably each cell holds the literal of a list of names - verify against the table
            merged.update(ast.literal_eval(tip))
    finally:
        # Release the connection even when the query or the parsing fails.
        conn_zi_new.close()
    return list(merged)
def transform_simplevalue(cursor_zi_new,shujuzidiandf,categoryname,subtitle,stdvalue):
stdvalue = stdvalue.strip()
simple_subtitle_list = ['CPU型号','显存容量','操作系统','双面器','双面输稿器','网络打印','标配外服务及配件','标配外耗材','镜头描述','碎纸效果','产品尺寸','分辨率','是否含壁挂架','是否含底座','CPU','内存','硬盘','尺寸','容量','最大读取速度','颜色','最大容积L','总容积L','类别','内存容量','硬盘容量','操作系统','网络连接','屏幕尺寸','硬盘尺寸','容量','类型']
#simple_subtitle_list = ['CPU型号','显存容量','操作系统','双面器','双面输稿器','网络打印','标配外服务及配件','标配外耗材','镜头描述','碎纸效果','产品尺寸','分辨率','是否含壁挂架','是否含底座','CPU','内存','硬盘','尺寸','容量','最大读取速度','颜色','最大容积L','总容积L','类别','内存容量','硬盘容量','操作系统','网络连接','屏幕尺寸','硬盘尺寸','容量','类型']
simple_subtitle_list = get_simple_value()
if subtitle not in simple_subtitle_list:
return stdvalue
......
# coding:utf-8
import re
import time
import pandas as pd
from public import Index
import pymssql
"""
Created on Tue May 25 14:56:22 2020
@author: SoreLemon
@Target: 适用于激光打印机型号提取.
@Input: <Line 211> This is the file path(usually be EXCEL file <xlsx>).
@Output: <Line 219> A list contains extracted model./<Line 220> 方便查看,也可生成两个excel(一个装提取到的型号和原数据,一个装没提取到型号的原数据).
@author: Jialin.Li, Zeyan.Liang
@Target: 适用于激光打印机型号和参数提取.
@Input: <Line 212> This is the file path(usually be EXCEL file <xlsx>).
@Output: <Line 221> A list contains extracted model./<Line 220> 方便查看,也可生成两个excel(一个装提取到的型号和原数据,一个装没提取到型号的原数据).
"""
def laserprinter_model_extract(productName, productParams, brand):
......@@ -200,21 +203,334 @@ def find_nonmatch_model_id(model_extract_list,id_list,sup_p_name_list):
}
df_notfind = pd.DataFrame(dict_notfind_model, columns = ['未找到产品型号的产品名称', '未找到产品型号的产品型号'])
df_find = pd.DataFrame(dict_find_model, columns = ['找到产品型号的产品名称', '找到产品型号的产品型号'])
df_notfind.to_excel(r'激光打印机客户数据0511_妹找到型号.xlsx', index = False)
df_find.to_excel(r'激光打印机客户数据0511_找到了型号.xlsx', index = False)
writer = pd.ExcelWriter(f"激光打印机客户数据0511_参数提取.xlsx")
df_notfind.to_excel(writer,f'未找到型号')
df_find.to_excel(writer,f'找到了型号')
writer.save()
writer.close()
print("找到型号的数量为")
print(model_find_number)
print("找到型号的数量为")
print("找到型号的数量为")
print(model_notfind_number)
# Maps Chinese numerals to digits when normalising warranty lengths ("两" also means 2).
_CN_DIGITS = str.maketrans('一二两三四五六七八九', '1223456789')

# "<number>页" that is not a print speed and is not directly followed by a closing
# bracket (ASCII or full-width), "多用途" or "出纸盒".
_PAGE_COUNT_RE = re.compile(
    r'([0-9,]+页)(?!\W?/分钟|\W?每分钟|\W?速度|\W?/分|\W?每分|\W?/秒|\W?每秒|\W?/秒钟|\W?每秒钟'
    r'|[)\uff09]|多用途|出纸盒)')

# "<digit>年" not preceded by three digits (skips calendar years such as 2020年), or a Chinese numeral + "年".
_YEAR_RE = re.compile(r'(?<![0-9]{3})[1-9]年|[一二三四五六七八九两]年')


def _is_blank(raw):
    """Return True when a captured value is empty once spaces and commas are stripped."""
    return raw.strip().strip(',') == ''


def _labelled_value(raw):
    """Return the cleaned captured value, or the "missing source data" marker when it is blank."""
    return '原始数据缺失' if _is_blank(raw) else raw.replace('"', '').strip()


def _page_counts(text):
    """Return every plausible tray capacity (100..10000 pages) in ``text`` as normalised ``'<n>页'`` strings."""
    counts = []
    for item in _PAGE_COUNT_RE.findall(text):
        digits = item.replace(',', '').strip('页')
        if not digits:
            # The match consisted of commas only (e.g. ",页"); nothing to convert.
            continue
        number = int(digits)
        if 100 <= number <= 10000:
            counts.append(str(number) + '页')
    return counts


def _warranty_years(text):
    """Return every "<n>年" occurrence in ``text`` with Chinese numerals converted to digits."""
    return [item.translate(_CN_DIGITS) for item in _YEAR_RE.findall(text)]


def _guess_product_type(info):
    """Return '彩色' or '黑白' when the word occurs outside "...打印速度", else ''."""
    if re.search(r'彩色(?!打印速度)', info):
        return '彩色'
    if re.search(r'黑白(?!打印速度)', info):
        return '黑白'
    return ''


def _guess_paper_size(info):
    """Return 'A3' if mentioned, otherwise 'A4' if mentioned, otherwise ''."""
    if re.search(r'[Aa]3', info):
        return 'A3'
    if re.search(r'[Aa]4', info):
        return 'A4'
    return ''


def _guess_speed(info, colour):
    """Return the print speed for ``colour`` ('彩色' or '黑白'): labelled value first, free text second."""
    # The literal brackets around 黑白/彩色 are matched as characters (ASCII or
    # full-width) so the pattern keeps a single capture group and findall
    # returns plain strings.
    labelled = re.findall(
        r'(?:%s打印速度\(页/分钟ppm\)|打印速度|打印速度[(\uff08]黑白/彩色[)\uff09]\(页/分钟\))'
        r'\W?\W?:\W?\W?"(.*?)"' % colour, info)
    if labelled:
        return '原始数据缺失' if _is_blank(labelled[0]) else labelled[0]
    loose = re.findall(
        r'(%s打印速度|%s(?!激光打印机)|打印速度)(?:.*?)(?(1)'
        r'([0-9.]+\W?\W?\W?\W?\W?(?:ppm|PPM|(?:页|面|张)?(?:/|每|一)(?:分钟|分|秒钟|秒))|'
        r'(?:ppm|PPM|(?:页|面|张)?(?:/|每|一)(?:分钟|分|秒钟|秒))\W?\W?\W?\W?\W?[0-9.]+))' % (colour, colour),
        info)
    # Each match is (keyword, speed); only the speed is wanted.
    return loose[0][1] if loose else ''


def _guess_resolution(info):
    """Return the print resolution from a labelled field, split axes, or a free-text "NxN dpi"."""
    # The leading quote is required so that e.g. a scan-resolution key is not picked up.
    labelled = re.findall(r'"分辨率.?"\W?\W?:\W?\W?"(.*?)"', info)
    if labelled and len(set(labelled)) == 1:
        return _labelled_value(labelled[0])
    # Separate vertical / horizontal fields are joined with '*'.
    axes = re.findall(
        r'(?:打印分辨率[(\uff08]垂直[)\uff09]|打印分辨率[(\uff08]水平[)\uff09]).?"\W?\W?:\W?\W?"(.*?)"', info)
    joined = '*'.join(axes)
    if joined:
        return joined
    dims = re.findall(r'([0-9.]+)\W?\W?\W?(?:dpi|DPI)?\W?([×xX*])\W?([0-9.]+)\W?\W?\W?(dpi|DPI)', info)
    if dims and len(set(dims)) == 1:
        return ''.join(dims[0])
    return ''


def _guess_tray_capacity(info):
    """Return the input-tray capacity: labelled field, then text near "进纸盒", then the whole text."""
    labelled = re.findall(r'(?:纸张容量|供纸盒容量|标配进纸盒)\W?\W?:\W?\W?"(.*?)"', info)
    if labelled:
        return _labelled_value(labelled[0])
    contexts = re.findall(r'(?:.{0,15})?进纸盒(?:.{0,15})?', info)
    if contexts:
        nearby = ''.join(contexts)
        counts = _page_counts(nearby)
        if counts and len(set(counts)) == 1:
            return counts[0]
        if counts:
            # Several candidates: prefer the count written directly before "进纸盒".
            prefixed = re.findall(r'([0-9,]+页)进纸盒', nearby)
            if prefixed:
                return prefixed[0]
    counts = _page_counts(info)
    if counts and len(set(counts)) == 1:
        return counts[0]
    return ''


def _guess_network_printing(info):
    """Return the network-printing description from a labelled field or from free text."""
    labelled = re.findall(r'(?:无线功能|打印方式).?\W?\W?\W?:\W?\W?"(.*?)"', info)
    if labelled:
        return _labelled_value(labelled[0])
    found = re.search(
        r'(无线/有线|有线/无线|有线|无线|支持|不支持)?\W?\W?\W?\W?(网络打印)'
        r'\W?\W?\W?\W?(无线/有线|有线/无线|有线|无线|支持|不支持)?', info)
    if not found:
        return ''
    if found.group(1):
        return found.group(1) + '网络打印'
    if found.group(3):
        return '网络打印' + found.group(3)
    return '支持网络打印'


def _guess_duplex(info):
    """Return the duplex-printing description found in free text, or ''."""
    found = re.search(r'(自动|手动|不支持|支持)?\W?\W?\W?\W?\W?(双面打印|双面)', info)
    if not found:
        return ''
    if found.group(1):
        return found.group(1) + '双面打印'
    return '支持双面打印'


def _guess_energy_certificate(info):
    """Return the CQC certificate number; with several distinct numbers only a "节能编号"-labelled one counts."""
    certs = re.findall(r'(?:CQC|cqc)\W?[0-9]{11}', info)
    if not certs:
        return ''
    if len(set(certs)) == 1:
        return certs[0]
    labelled = re.findall(r'节能编号\W?\W?\W?\W?((?:CQC|cqc)\W?[0-9]{11})', info)
    return labelled[0] if labelled else ''


def _guess_warranty(info):
    """Return the warranty length ('<n>年'): text near "质保"/"年保" first, whole text second."""
    contexts = re.findall(r'(?:.{0,15})?(?:质保|年保)(?:.{0,15})?', info)
    if contexts:
        nearby = ''.join(contexts).replace("'", "").replace('"', '')
        years = _warranty_years(nearby)
        if len(set(years)) == 1:
            return years[0]
    years = _warranty_years(info)
    return years[0] if len(set(years)) == 1 else ''


def 激光打印机参数提取(id, brand_Name, productName, productParams, requier_param_list):
    """Extract laser-printer parameters from a product's parameter text and name.

    Each name in ``requier_param_list`` is first looked up as a labelled field
    (``name: "value"``) in ``productParams``.  Ten standard parameters (产品类型,
    最大打印幅面, 彩色打印速度, 黑白打印速度, 打印分辨率, 进纸盒容量, 网络打印,
    双面打印, 节能证书编号, 质保时间) that are still empty afterwards are then
    guessed from ``productParams + productName`` with parameter-specific
    patterns; a parameter that cannot be determined is set to ``''`` and a
    labelled field whose value is blank is set to ``'原始数据缺失'``.

    Args:
        id: key under which the extracted parameters are returned.
        brand_Name: not used by the extraction.
        productName: product title (str).
        productParams: raw parameter text (str), typically a quoted key/value dump.
        requier_param_list: parameter names to look up as labelled fields.

    Returns:
        tuple: ``(msg, {id: params})`` on success, where ``params`` always
        contains the ten standard parameters; ``(msg, {})`` when any exception
        occurs, with ``msg`` carrying the failure text and the error.
    """
    try:
        # NOTE(review): '/n' (slash + n) is removed literally; possibly '\n' was meant - confirm.
        # Single quotes become double quotes so every pattern can rely on '"'.
        productParams = (productParams.replace('/n', '').replace("'", '"')
                         .replace(r"\t", ' ').replace(r"\n", ' '))
        temp_dict = {}
        for param in requier_param_list:
            # re.escape keeps a parameter name containing regex metacharacters literal.
            result = re.findall(r'%s.?\W?\W?:\W?"(.*?)"' % re.escape(param), productParams)
            if result:
                if result[0].strip().strip(',').strip('-').strip('·') == '':
                    temp_dict[param] = '原始数据缺失'
                else:
                    temp_dict[param] = result[0].replace('"', '').strip()
        info = productParams + productName
        fallbacks = (
            ('产品类型', _guess_product_type),
            ('最大打印幅面', _guess_paper_size),
            ('彩色打印速度', lambda text: _guess_speed(text, '彩色')),
            ('黑白打印速度', lambda text: _guess_speed(text, '黑白')),
            ('打印分辨率', _guess_resolution),
            ('进纸盒容量', _guess_tray_capacity),
            ('网络打印', _guess_network_printing),
            ('双面打印', _guess_duplex),
            ('节能证书编号', _guess_energy_certificate),
            ('质保时间', _guess_warranty),
        )
        for key, guess in fallbacks:
            # Only fill what the labelled lookup left empty; every key ends up present.
            if not temp_dict.get(key):
                temp_dict[key] = guess(info)
        msg = '激光打印机参数提取成功'
        my_dict = {id: temp_dict}
        return msg, my_dict
    except Exception as e:
        # str() is required: concatenating the exception object itself raises TypeError.
        msg = '激光打印机参数提取失败' + str(e)
        return msg, {}
# Module-level load: runs whenever this file is imported or executed.
# Reads the first sheet of the customer workbook, forcing the five listed
# columns to str.
df = pd.read_excel('激光打印机客户数据0511.xlsx',sheet_name = 0,converters={'ID':str,'SUP_P_NAME':str,'SUP_P_PARAMS':str,'ZD_P_LASTCATEGORY_NAME':str,'ZD_P_BRAND_NAME':str})
# Alternative input workbook, currently disabled:
#df = pd.read_excel('扫描仪数据_20210513.xlsx',sheet_name = 0,converters={'ID':str,'SUP_P_NAME':str,'SUP_P_PARAMS':str,'ZD_P_LASTCATEGORY_NAME':str,'ZD_P_BRAND_NAME':str})
# One plain Python list per column of the loaded sheet.
id_list = df['ID'].tolist()
sup_p_name_list = df['SUP_P_NAME'].tolist()
sup_p_params_list = df['SUP_P_PARAMS'].tolist()
zd_p_brand_name_list = df['ZD_P_BRAND_NAME'].tolist()
zd_p_lastcategory_name_list = df['ZD_P_LASTCATEGORY_NAME'].tolist()
# The statement below is a bare triple-quoted string, so the timed
# model-extraction run it contains is never executed.
'''start_time = time.time()
model_extract_list = list(map(lambda x,y,m:laserprinter_model_extract(x,y,m),sup_p_name_list,sup_p_params_list,zd_p_brand_name_list))
#find_nonmatch_model_id(model_extract_list,id_list,sup_p_name_list)
\ No newline at end of file
find_nonmatch_model_id(model_extract_list,id_list,sup_p_name_list)
end_time = time.time()
print(f'The runing time is {end_time - start_time} s')'''
if __name__ == '__main__':
    # Script entry point: pull every laser-printer product from the warehouse
    # table, extract its parameters and write the result to an Excel workbook.
    # NOTE(review): host and credentials are hard-coded here; consider moving
    # them to configuration or environment variables.
    conn_zi_new = pymssql.connect(host='39.107.254.235', user='sa', password='1qaz@WSX', database='ZD_DW_dev',
                                  autocommit=True)
    try:
        cursor = conn_zi_new.cursor()
        cursor.execute("select ID, SUP_P_NAME, SUP_P_PARAMS from DW_PRODUCT_ALL where SUP_P_LASTCATEGORY_NAME = '激光打印机'")
        data_df = pd.DataFrame(cursor.fetchall(), columns=[column[0] for column in cursor.description])
    finally:
        # Release the connection even when the query fails.
        conn_zi_new.close()
    id_list = data_df['ID'].tolist()
    name_list = data_df['SUP_P_NAME'].tolist()
    param_list = data_df['SUP_P_PARAMS'].tolist()
    brand_Name = '随便'
    requier_param_list = ['产品类型', '最大打印幅面', '彩色打印速度', '黑白打印速度', '打印分辨率', '进纸盒容量', '网络打印', '双面打印', '节能证书编号', '质保时间']
    # One output column per parameter.  A dict of lists replaces the former
    # exec()-generated variables, which broke (and could execute arbitrary
    # code) whenever an extracted value contained a quote or a backslash.
    extracted = {column: [] for column in requier_param_list}
    t1 = time.time()
    for product_id, name, param in zip(id_list, name_list, param_list):
        _, temp_dict2 = 激光打印机参数提取(product_id, brand_Name, name, param, requier_param_list)
        # A failed extraction returns {}; such rows get empty cells instead of a KeyError.
        row = temp_dict2.get(product_id, {})
        for column in requier_param_list:
            extracted[column].append(row.get(column, ''))
    t2 = time.time()
    df_output = pd.DataFrame(id_list)
    for column in requier_param_list:
        df_output[column] = extracted[column]
    df_output['sup_p_name'] = name_list
    df_output['sup_p_param'] = param_list
    # The context manager saves and closes the workbook; ExcelWriter.save() no
    # longer exists in pandas 2.x.
    with pd.ExcelWriter("激光打印机客户数据0511_参数提取.xlsx") as writer:
        df_output.to_excel(writer, sheet_name='参数提取')
    print(t2 - t1)
#标配外服务及配件,标配外耗材,
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment