Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
重
重点类信息提取
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ZGC_INDEX
重点类信息提取
Commits
d0dfca0f
Commit
d0dfca0f
authored
Jun 03, 2021
by
LIANGZEYAN
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
激光打印机参数提取+修改命名简称获取方式
parent
682d6568
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
346 additions
and
11 deletions
+346
-11
更新库内产品数据.py
公共代码/更新库内产品数据.py
+20
-1
激光打印机型号提取.py
公共代码/激光打印机型号提取.py
+326
-10
No files found.
公共代码/更新库内产品数据.py
View file @
d0dfca0f
...
...
@@ -10,12 +10,31 @@ import pymssql
import
pandas
as
pd
from
public
import
Index
import
uuid
import
ast
def get_simple_value():
    """Return the distinct entries of every ``tip`` list in ``skuname_named_rule``.

    Each ``tip`` cell is parsed with ``ast.literal_eval`` and its items are
    merged into one de-duplicated collection.

    Returns:
        list: the merged items without duplicates; the order is unspecified
        because the items pass through a ``set``.
    """
    # NOTE(review): the connection credentials are hard-coded in source;
    # they should be moved to configuration / environment variables.
    conn_zi_new = pymssql.connect(host='123.56.115.207', user='zgcprice3311',
                                  password='zgcprice20200628', database='ZI_NEW',
                                  autocommit=True)
    try:
        cursor = conn_zi_new.cursor()
        cursor.execute("select tip from skuname_named_rule")
        rows = cursor.fetchall()
    finally:
        # Release the connection even when the query fails.
        conn_zi_new.close()

    merged = set()
    for row in rows:
        # The query selects a single column, so the cell is row[0].
        # literal_eval only accepts Python literals, so the stored text is
        # parsed without being executed.
        merged.update(ast.literal_eval(row[0]))
    return list(merged)
def
transform_simplevalue
(
cursor_zi_new
,
shujuzidiandf
,
categoryname
,
subtitle
,
stdvalue
):
stdvalue
=
stdvalue
.
strip
()
simple_subtitle_list
=
[
'CPU型号'
,
'显存容量'
,
'操作系统'
,
'双面器'
,
'双面输稿器'
,
'网络打印'
,
'标配外服务及配件'
,
'标配外耗材'
,
'镜头描述'
,
'碎纸效果'
,
'产品尺寸'
,
'分辨率'
,
'是否含壁挂架'
,
'是否含底座'
,
'CPU'
,
'内存'
,
'硬盘'
,
'尺寸'
,
'容量'
,
'最大读取速度'
,
'颜色'
,
'最大容积L'
,
'总容积L'
,
'类别'
,
'内存容量'
,
'硬盘容量'
,
'操作系统'
,
'网络连接'
,
'屏幕尺寸'
,
'硬盘尺寸'
,
'容量'
,
'类型'
]
#simple_subtitle_list = ['CPU型号','显存容量','操作系统','双面器','双面输稿器','网络打印','标配外服务及配件','标配外耗材','镜头描述','碎纸效果','产品尺寸','分辨率','是否含壁挂架','是否含底座','CPU','内存','硬盘','尺寸','容量','最大读取速度','颜色','最大容积L','总容积L','类别','内存容量','硬盘容量','操作系统','网络连接','屏幕尺寸','硬盘尺寸','容量','类型']
simple_subtitle_list
=
get_simple_value
()
if
subtitle
not
in
simple_subtitle_list
:
return
stdvalue
...
...
公共代码/激光打印机型号提取.py
View file @
d0dfca0f
# coding:utf-8
import
re
import
time
import
pandas
as
pd
from
public
import
Index
import
pymssql
"""
Created on Tue May 25 14:56:22 2020
@author: SoreLemon, Jialin.Li, Zeyan.Liang
@Target: Extract laser-printer model names and parameters.
@Input: <Line 212> The input file path (usually an Excel .xlsx file).
@Output: <Line 221> A list containing the extracted models. / <Line 220> For easier inspection, two Excel files can also be generated (one with the extracted models and their source rows, one with the source rows for which no model was found).
"""
def
laserprinter_model_extract
(
productName
,
productParams
,
brand
):
...
...
@@ -200,21 +203,334 @@ def find_nonmatch_model_id(model_extract_list,id_list,sup_p_name_list):
}
df_notfind
=
pd
.
DataFrame
(
dict_notfind_model
,
columns
=
[
'未找到产品型号的产品名称'
,
'未找到产品型号的产品型号'
])
df_find
=
pd
.
DataFrame
(
dict_find_model
,
columns
=
[
'找到产品型号的产品名称'
,
'找到产品型号的产品型号'
])
df_notfind
.
to_excel
(
r'激光打印机客户数据0511_妹找到型号.xlsx'
,
index
=
False
)
df_find
.
to_excel
(
r'激光打印机客户数据0511_找到了型号.xlsx'
,
index
=
False
)
writer
=
pd
.
ExcelWriter
(
f
"激光打印机客户数据0511_参数提取.xlsx"
)
df_notfind
.
to_excel
(
writer
,
f
'未找到型号'
)
df_find
.
to_excel
(
writer
,
f
'找到了型号'
)
writer
.
save
()
writer
.
close
()
print
(
"找到型号的数量为"
)
print
(
model_find_number
)
print
(
"
妹
找到型号的数量为"
)
print
(
"
未
找到型号的数量为"
)
print
(
model_notfind_number
)
# Placeholder stored when a labelled value exists but is effectively empty.
_MISSING_VALUE = '原始数据缺失'

# A page count ("250页") that is not followed by a speed unit, a closing
# parenthesis (ASCII or full-width), or a multi-purpose/output tray label.
_PAGE_COUNT_RE = (r'([0-9,]+页)(?!\W?/分钟|\W?每分钟|\W?速度|\W?/分|\W?每分|\W?/秒|\W?每秒'
                  r'|\W?/秒钟|\W?每秒钟|[))]|多用途|出纸盒)')

# A single-digit year count written with an ASCII or a Chinese digit.
_YEAR_RE = r'(?<![0-9]{3})[1-9]年|[一二三四五六七八九两]年'
_CHINESE_DIGITS = str.maketrans('一二两三四五六七八九', '1223456789')


def _cleaned_or_missing(raw, strip_chars=','):
    """Return ``raw`` without quotes/outer whitespace, or the placeholder if it is blank."""
    probe = raw.strip()
    for ch in strip_chars:
        probe = probe.strip(ch)
    if probe == '':
        return _MISSING_VALUE
    return raw.replace('"', '').strip()


def _plausible_page_counts(text):
    """Return the page counts in ``text`` that lie between 100 and 10000, as 'N页'."""
    counts = []
    for item in re.findall(_PAGE_COUNT_RE, text):
        digits = item.replace(',', '').strip('页')
        if not digits.isdigit():
            # A comma-only match such as ",页" carries no number.
            continue
        number = int(digits)
        if 100 <= number <= 10000:
            counts.append(str(number) + '页')
    return counts


def _warranty_years(text):
    """Return every 'N年' found in ``text`` with Chinese digits converted to ASCII."""
    return [item.translate(_CHINESE_DIGITS) for item in re.findall(_YEAR_RE, text)]


def _print_speed(info, colour_word):
    """Return the print speed for ``colour_word`` ('彩色' or '黑白'), or '' if none is found."""
    # Labelled form: key":"value". The bracketed label accepts ASCII and
    # full-width parentheses and is written as a character class so that the
    # pattern keeps exactly one capture group.
    labelled = re.findall(
        r'(?:' + colour_word + r'打印速度\(页/分钟ppm\)|打印速度|打印速度[((]黑白/彩色[))]\(页/分钟\))'
        r'\W?\W?:\W?\W?"(.*?)"',
        info)
    if labelled:
        if labelled[0].strip().strip(',') == '':
            return _MISSING_VALUE
        return labelled[0]
    # Free-text form: a keyword followed later by "<number> <unit>" or "<unit> <number>".
    loose = re.findall(
        r'(' + colour_word + r'打印速度|' + colour_word + r'(?!激光打印机)|打印速度)(?:.*?)(?(1)'
        r'([0-9.]+\W?\W?\W?\W?\W?(?:ppm|PPM|(?:页|面|张)?(?:/|每|一)(?:分钟|分|秒钟|秒))|'
        r'(?:ppm|PPM|(?:页|面|张)?(?:/|每|一)(?:分钟|分|秒钟|秒))\W?\W?\W?\W?\W?[0-9.]+))',
        info)
    return loose[0][1] if loose else ''


def 激光打印机参数提取(id, brand_Name, productName, productParams, requier_param_list):
    """Extract laser-printer attributes from a product name and its parameter text.

    Every name in ``requier_param_list`` is first looked up as a ``name: "value"``
    pair in ``productParams``. Afterwards ten fixed attributes (产品类型,
    最大打印幅面, 彩色打印速度, 黑白打印速度, 打印分辨率, 进纸盒容量, 网络打印,
    双面打印, 节能证书编号, 质保时间) that are still missing or empty are searched
    with regular expressions in the parameter text followed by the product name.
    An attribute that cannot be determined is stored as ''; a labelled value
    that is blank is stored as '原始数据缺失'.

    Args:
        id: key under which the extracted attributes are returned.
        brand_Name: not used by this function.
        productName: product title; ``None`` is treated as ''.
        productParams: parameter text; ``None`` is treated as ''.
        requier_param_list: attribute names to look up directly in ``productParams``.

    Returns:
        tuple: ``(msg, {id: attributes})`` on success, or
        ``(msg, {})`` with the error text appended to ``msg`` on failure.
    """
    try:
        if productParams is None:
            productParams = ''
        if productName is None:
            productName = ''
        productParams = (productParams.replace('/n', '').replace("'", '"')
                         .replace(r"\t", ' ').replace(r"\n", ' '))
        temp_dict = {}
        for param in requier_param_list:
            # Escape the name so regex metacharacters in it are matched literally.
            result = re.findall(re.escape(str(param)) + r'.?\W?\W?:\W?"(.*?)"', productParams)
            if result:
                temp_dict[param] = _cleaned_or_missing(result[0], ',-·')
        info = productParams + productName

        # Product type (colour / monochrome).
        if not temp_dict.get('产品类型'):
            if re.search(r'彩色(?!打印速度)', info):
                temp_dict['产品类型'] = '彩色'
            elif re.search(r'黑白(?!打印速度)', info):
                temp_dict['产品类型'] = '黑白'
            else:
                temp_dict['产品类型'] = ''

        # Maximum paper size; A3 is checked before A4.
        if not temp_dict.get('最大打印幅面'):
            if re.search(r'[Aa]3', info):
                temp_dict['最大打印幅面'] = 'A3'
            elif re.search(r'[Aa]4', info):
                temp_dict['最大打印幅面'] = 'A4'
            else:
                temp_dict['最大打印幅面'] = ''

        # Colour and monochrome print speed.
        if not temp_dict.get('彩色打印速度'):
            temp_dict['彩色打印速度'] = _print_speed(info, '彩色')
        if not temp_dict.get('黑白打印速度'):
            temp_dict['黑白打印速度'] = _print_speed(info, '黑白')

        # Print resolution. The key must start right after a double quote so
        # that keys with a prefix before 分辨率 (e.g. 扫描分辨率) are not picked up.
        if not temp_dict.get('打印分辨率'):
            result = re.findall(r'"分辨率.?"\W?\W?:\W?\W?"(.*?)"', info)
            if len(set(result)) == 1:
                temp_dict['打印分辨率'] = _cleaned_or_missing(result[0])
        if not temp_dict.get('打印分辨率'):
            # Separate vertical/horizontal entries are joined with '*'.
            result = re.findall(
                r'打印分辨率[((](?:垂直|水平)[))].?"\W?\W?:\W?\W?"(.*?)"', info)
            if result:
                temp_dict['打印分辨率'] = '*'.join(result)
        if not temp_dict.get('打印分辨率'):
            result = re.findall(
                r'([0-9.]+)\W?\W?\W?(?:dpi|DPI)?\W?([×xX*])\W?([0-9.]+)\W?\W?\W?(dpi|DPI)', info)
            if len(set(result)) == 1:
                temp_dict['打印分辨率'] = ''.join(result[0])
            else:
                temp_dict['打印分辨率'] = ''

        # Input tray capacity.
        if not temp_dict.get('进纸盒容量'):
            result = re.findall(r'(?:纸张容量|供纸盒容量|标配进纸盒)\W?\W?:\W?\W?"(.*?)"', info)
            if result:
                temp_dict['进纸盒容量'] = _cleaned_or_missing(result[0])
        if not temp_dict.get('进纸盒容量'):
            # Look only at the text within 15 characters of "进纸盒".
            snippets = re.findall(r'(?:.{0,15})?进纸盒(?:.{0,15})?', info)
            if snippets:
                nearby = ''.join(snippets)
                counts = _plausible_page_counts(nearby)
                if len(set(counts)) == 1:
                    temp_dict['进纸盒容量'] = counts[0]
                elif counts:
                    # Several candidates: prefer the count written directly before "进纸盒".
                    prefixed = re.findall(r'([0-9,]+页)进纸盒', nearby)
                    if prefixed:
                        temp_dict['进纸盒容量'] = prefixed[0]
        if not temp_dict.get('进纸盒容量'):
            counts = _plausible_page_counts(info)
            if len(set(counts)) == 1:
                temp_dict['进纸盒容量'] = counts[0]
            else:
                temp_dict['进纸盒容量'] = ''

        # Network printing.
        if not temp_dict.get('网络打印'):
            result = re.findall(r'(?:无线功能|打印方式).?\W?\W?\W?:\W?\W?"(.*?)"', info)
            if result:
                temp_dict['网络打印'] = _cleaned_or_missing(result[0])
        if not temp_dict.get('网络打印'):
            match = re.search(
                r'(无线/有线|有线/无线|有线|无线|支持|不支持)?\W?\W?\W?\W?(网络打印)'
                r'\W?\W?\W?\W?(无线/有线|有线/无线|有线|无线|支持|不支持)?',
                info)
            if match is None:
                temp_dict['网络打印'] = ''
            elif match.group(1):
                temp_dict['网络打印'] = match.group(1) + '网络打印'
            elif match.group(3):
                temp_dict['网络打印'] = '网络打印' + match.group(3)
            else:
                temp_dict['网络打印'] = '支持网络打印'

        # Duplex printing.
        if not temp_dict.get('双面打印'):
            match = re.search(r'(自动|手动|不支持|支持)?\W?\W?\W?\W?\W?(双面打印|双面)', info)
            if match is None:
                temp_dict['双面打印'] = ''
            elif match.group(1):
                temp_dict['双面打印'] = match.group(1) + '双面打印'
            else:
                temp_dict['双面打印'] = '支持双面打印'

        # Energy-saving certificate number (CQC + 11 digits).
        if not temp_dict.get('节能证书编号'):
            codes = re.findall(r'(?:CQC|cqc)\W?[0-9]{11}', info)
            if len(set(codes)) == 1:
                temp_dict['节能证书编号'] = codes[0]
            else:
                # Several different codes: only accept one labelled "节能编号".
                labelled = []
                if codes:
                    labelled = re.findall(
                        r'节能编号\W?\W?\W?\W?((?:CQC|cqc)\W?[0-9]{11})', info)
                temp_dict['节能证书编号'] = labelled[0] if labelled else ''

        # Warranty period: prefer a year count near "质保"/"年保", otherwise
        # fall back to the whole text; always leave the key set.
        if not temp_dict.get('质保时间'):
            years = []
            snippets = re.findall(r'(?:.{0,15})?(?:质保|年保)(?:.{0,15})?', info)
            if snippets:
                nearby = ''.join(snippets).replace("'", "").replace('"', '')
                years = _warranty_years(nearby)
            if len(set(years)) != 1:
                years = _warranty_years(info)
            temp_dict['质保时间'] = years[0] if len(set(years)) == 1 else ''

        msg = '激光打印机参数提取成功'
        my_dict = {id: temp_dict}
        return msg, my_dict
    except Exception as e:
        # str(e): concatenating the exception object itself raises TypeError.
        msg = '激光打印机参数提取失败' + str(e)
        return msg, {}
# Load the first sheet of the customer workbook; the five columns used below
# are forced to text so that IDs and names are not converted to numbers.
df = pd.read_excel(
    '激光打印机客户数据0511.xlsx',
    sheet_name=0,
    converters=dict.fromkeys(
        ('ID', 'SUP_P_NAME', 'SUP_P_PARAMS', 'ZD_P_LASTCATEGORY_NAME', 'ZD_P_BRAND_NAME'),
        str,
    ),
)
#df = pd.read_excel('扫描仪数据_20210513.xlsx',sheet_name = 0,converters={'ID':str,'SUP_P_NAME':str,'SUP_P_PARAMS':str,'ZD_P_LASTCATEGORY_NAME':str,'ZD_P_BRAND_NAME':str})

# One plain Python list per column.
(id_list,
 sup_p_name_list,
 sup_p_params_list,
 zd_p_brand_name_list,
 zd_p_lastcategory_name_list) = (
    df[column].tolist()
    for column in ('ID', 'SUP_P_NAME', 'SUP_P_PARAMS',
                   'ZD_P_BRAND_NAME', 'ZD_P_LASTCATEGORY_NAME')
)
'''start_time = time.time()
model_extract_list = list(map(lambda x,y,m:laserprinter_model_extract(x,y,m),sup_p_name_list,sup_p_params_list,zd_p_brand_name_list))
#find_nonmatch_model_id(model_extract_list,id_list,sup_p_name_list)
\ No newline at end of file
find_nonmatch_model_id(model_extract_list,id_list,sup_p_name_list)
end_time = time.time()
print(f'The runing time is {end_time - start_time} s')'''
if __name__ == '__main__':
    # Script entry point: read every laser-printer row from the warehouse,
    # run the parameter extraction on each row and write the result to Excel.
    # NOTE(review): the connection credentials are hard-coded in source;
    # they should be moved to configuration / environment variables.
    conn_zi_new = pymssql.connect(host='39.107.254.235', user='sa', password='1qaz@WSX',
                                  database='ZD_DW_dev', autocommit=True)
    try:
        cursor = conn_zi_new.cursor()
        cursor.execute("select ID, SUP_P_NAME, SUP_P_PARAMS from DW_PRODUCT_ALL where SUP_P_LASTCATEGORY_NAME = '激光打印机'")
        data_df = pd.DataFrame(cursor.fetchall(),
                               columns=[column[0] for column in cursor.description])
    finally:
        # Close the connection even when the query fails.
        conn_zi_new.close()

    id_list = data_df['ID'].tolist()
    name_list = data_df['SUP_P_NAME'].tolist()
    param_list = data_df['SUP_P_PARAMS'].tolist()
    brand_Name = '随便'
    requier_param_list = ['产品类型', '最大打印幅面', '彩色打印速度', '黑白打印速度', '打印分辨率',
                          '进纸盒容量', '网络打印', '双面打印', '节能证书编号', '质保时间']

    # One output column per attribute. A dict of lists replaces the former
    # exec()-generated variables, which broke on values containing quotes and
    # executed database content as code.
    extracted_columns = {key: [] for key in requier_param_list}

    t1 = time.time()
    for row_id, name, param in zip(id_list, name_list, param_list):
        _, temp_dict2 = 激光打印机参数提取(row_id, brand_Name, name, param, requier_param_list)
        # A failed extraction returns {}, so missing values become ''.
        row_values = temp_dict2.get(row_id, {})
        for key in requier_param_list:
            extracted_columns[key].append(str(row_values.get(key, '')))
    t2 = time.time()

    df_output = pd.DataFrame(id_list)
    for key in requier_param_list:
        df_output[key] = extracted_columns[key]
    df_output['sup_p_name'] = name_list
    df_output['sup_p_param'] = param_list

    # The context manager saves and closes the workbook; ExcelWriter.save()
    # is no longer available in pandas 2.x.
    with pd.ExcelWriter("激光打印机客户数据0511_参数提取.xlsx") as writer:
        df_output.to_excel(writer, sheet_name='参数提取')
    print(t2 - t1)
#标配外服务及配件,标配外耗材,
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment