Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
重
重点类信息提取
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ZGC_INDEX
重点类信息提取
Commits
9baca2a8
Commit
9baca2a8
authored
May 25, 2021
by
LIANGZEYAN
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
激光打印机型号提取
parent
2a685cbb
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
221 additions
and
0 deletions
+221
-0
激光打印机型号提取.py
公共代码/激光打印机型号提取.py
+221
-0
No files found.
公共代码/激光打印机型号提取.py
0 → 100644
View file @
9baca2a8
# coding:utf-8
import
re
import
pandas
as
pd
"""
Created on Tue May 25 14:56:22 2020
@author: SoreLemon
@Target: 适用于激光打印机型号提取.
@Input: <Line 211> This is the file path(usually be EXCEL file <xlsx>).
@Output: <Line 219> A list contains extracted model./<Line 220> 方便查看,也可生成两个excel(一个装提取到的型号和原数据,一个装没提取到型号的原数据).
"""
# Sentinel returned when no model number can be extracted (runtime value,
# also tested for by find_nonmatch_model_id).
_NOT_FOUND = '暂无客户数据'

# Paper-size tokens that look like model numbers but are not.
_PAPER_SIZES = ('A3', 'A4')

# Bare 4-digit numbers in this set are rejected by the last-resort fallback.
_YEAR_STRINGS = frozenset(str(year) for year in range(2010, 2022))

_DIGITS_THEN_LETTERS = r'[0-9]+[a-z]+|[0-9]+[A-Z]+'
_LETTERS_THEN_DIGITS = r'[a-z]+[0-9]+|[A-Z]+[0-9]+'

# Hyphenated shapes tried first against the product name.
_HYPHENATED_NAME_PATTERNS = (
    r'[a-zA-Z]+[0-9]+-[a-zA-Z]+-[a-zA-Z]+',
    r'[a-zA-Z]+[0-9]+-[0-9]+[a-zA-Z]+',
)

# Tried in order against the product name when everything else failed.
_FALLBACK_PATTERNS = (
    r'[a-zA-Z]+ [0-9]+\+',
    r'[A-Z]+ [0-9]{2,}',
    r'[0-9]{3,}[A-Za-z]+',
    r'[0-9]+\+',
    r'[0-9]{3,} [A-Za-z]+',
    r'[a-zA-Z]+[0-9]+-[0-9]+',
    r'[0-9]{4,}',
)

# Tried in order against the product name when the candidate is longer than
# 11 characters.
_LONG_MODEL_PATTERNS = (
    r'[a-zA-Z]+-[0-9]+[a-zA-Z]+',
    r'[a-zA-Z]+[0-9]+-[a-zA-Z]+-[a-zA-Z]+',
    r'[a-zA-Z]+-[0-9]+',
    r'[A-Z][0-9]+[A-Za-z]+',
    r'[a-zA-Z]+[0-9]+[a-zA-Z]+',
    r'[0-9]{3,}[A-Za-z]+',
    r'[A-Za-z]+[0-9]{3,}',
    r'[0-9]{3,} [A-Za-z]+',
    r'[A-Za-z]+ [0-9]{3,}',
)


def _as_text(value):
    """Return *value* as a string; None and float NaN become ''."""
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value)


def _first_match(pattern, text):
    """Return the leftmost match of *pattern* in *text*, or None.

    Mirrors ``re.findall(pattern, text)[0]`` for patterns with at most one
    capture group: the group is returned if the pattern has one, otherwise
    the whole match.
    """
    found = re.search(pattern, text)
    if found is None:
        return None
    return found.group(1) if found.re.groups else found.group(0)


def _first_of(patterns, text):
    """Return the first hit produced by any of *patterns*, tried in order."""
    for pattern in patterns:
        hit = _first_match(pattern, text)
        if hit is not None:
            return hit
    return None


def _primary_candidate(name, params):
    """Run the ordered name/params pattern cascade; return a hit or None."""
    hit = _first_of(_HYPHENATED_NAME_PATTERNS, name)
    if hit is not None:
        return hit

    hit = _first_match(r'型号:([a-zA-Z]{1,7}-[a-zA-Z0-9+]+),', params)
    if hit is not None:
        return hit

    # Presence is checked with the strict class, the value is then read
    # lazily up to the next comma (two-step logic kept from the original).
    if _first_match(r'型号:[a-zA-Z0-9+-]+,', params) is not None:
        return _first_match(r'型号:(.+?),', params)

    hit = _first_of(
        (r'[a-zA-Z]{1,7}-[a-zA-Z0-9+]+', r'[a-zA-Z]+[0-9]+[0-9a-zA-Z+]+'),
        name,
    )
    if hit is not None:
        return hit

    hit = _first_match(_LETTERS_THEN_DIGITS, name)
    if hit is not None:
        # A paper size is not a model: look for a digits+letters token
        # instead.  Anything else is accepted as-is.
        if hit in _PAPER_SIZES:
            return _first_match(_DIGITS_THEN_LETTERS, name)
        return hit

    hit = _first_match(r'[A-Z]+[0-9]+[A-Z]+', params)
    if hit is not None:
        return hit

    hit = _first_match(r'型号:(.+?),', params)
    if hit is not None:
        return hit

    # Model given as the last entry, terminated by '}' instead of ','.
    # NOTE(review): the original stripped ':' before searching for '型号:',
    # which can never match; assumed the full-width colon was meant.
    hit = _first_match(r'型号:(.+?)}', params.replace('\uff1a', ''))
    if hit is not None:
        return hit

    return _first_of((r'[A-Za-z ]+', r'[0-9]+[A-Z]+'), name)


def laserprinter_model_extract(productName, productParams, brand):
    """Extract a laser-printer model number from a product record.

    The product parameters are searched for an explicit ``产品型号:`` entry
    first; otherwise an ordered cascade of regular expressions is applied to
    the product name and the parameters.  The candidate is then validated
    and, if rejected or too long, replaced using further fallback patterns.

    Args:
        productName: Product title.  None/NaN is treated as an empty string.
        productParams: Product parameters; converted with ``str`` and
            stripped of single quotes, spaces and newlines before matching.
            None/NaN is treated as an empty string.
        brand: Unused; kept so existing callers keep working.

    Returns:
        The extracted model with all spaces removed, or ``'暂无客户数据'``
        when nothing usable was found.

    Fixes relative to the previous implementation:
        * ``model == 'A3' or 'A4'`` was always true, discarding every
          letter+digit match such as ``P1``.
        * Character classes contained a literal ``|`` and one of them
          (``0-9|-|+``) accidentally excluded the hyphen.
        * Parenthesis normalisation was a no-op; full-width parentheses are
          now converted so bracketed notes can be stripped.
        * No IndexError when the parameters say ``-`` and the name has no
          alphanumeric run; no AttributeError on missing names.
    """
    name = _as_text(productName).replace('\uff08', '(').replace('\uff09', ')')
    # Drop paper sizes so they are not mistaken for model numbers.
    name = name.replace("A3", "").replace("A4", "")
    params = (
        _as_text(productParams)
        .replace("'", "")
        .replace(" ", "")
        .replace("\n", "")
    )

    if _first_match(r'产品型号:[a-zA-Z0-9-]+,', params) is not None:
        model = _first_match(r'产品型号:(.+?),', params)
        if model == "-":
            # Placeholder value: fall back to the first alphanumeric run of
            # the name once bracketed notes are removed.
            name = re.sub(r'\(.*?\)', '', name)
            model = _first_match(r'[a-zA-Z0-9-]+', name)
    else:
        model = _primary_candidate(name, params)
    if model is None:
        model = _NOT_FOUND

    if model.isalpha():
        # Letters only (this includes all-Chinese text): not a model by
        # itself, try a digits+letters token from the name instead.
        model = _first_match(_DIGITS_THEN_LETTERS, name) or _NOT_FOUND
    elif all('\u4e00' <= char <= '\u9fa5' for char in model):
        # Only reachable for an empty candidate.
        model = _NOT_FOUND

    model = model.strip() or _NOT_FOUND

    if model == _NOT_FOUND:
        model = _first_of(_FALLBACK_PATTERNS, name)
        # Only the bare-digits fallback can yield a pure 4-digit string.
        if model is None or model in _YEAR_STRINGS:
            model = _NOT_FOUND
    elif len(model) > 11:
        # Over-long candidates are re-extracted from the name.
        model = _first_of(_LONG_MODEL_PATTERNS, name) or _NOT_FOUND

    return model.replace(" ", "")
def find_nonmatch_model_id(model_extract_list, id_list, sup_p_name_list):
    """Split rows into "model found" / "model not found" and export both.

    Helper for tuning the extraction regexes: it shows which products still
    yield no model.  A row counts as "not found" when its extracted model
    contains ``'暂无客户数据'``.

    Writes two Excel files into the current working directory
    (``激光打印机客户数据0511_妹找到型号.xlsx`` and
    ``激光打印机客户数据0511_找到了型号.xlsx``), each with a product-name
    column and a model column, and prints both row counts.

    Args:
        model_extract_list: Extracted model per row.
        id_list: Product ID per row; only bounds how many rows are processed.
        sup_p_name_list: Product name per row.

    Returns:
        None.

    Fixes relative to the previous implementation:
        * The name columns received the models and the model columns the
          names; each column now holds what its header says.
        * Rows were routed through dicts keyed by ID, so duplicate IDs all
          reported the data of the last duplicate; rows are now paired
          positionally.
    """
    found_names, found_models = [], []
    missing_names, missing_models = [], []

    for _, model, name in zip(id_list, model_extract_list, sup_p_name_list):
        if '暂无客户数据' in model:
            missing_names.append(name)
            missing_models.append(model)
        else:
            found_names.append(name)
            found_models.append(model)

    df_notfind = pd.DataFrame(
        {
            '未找到产品型号的产品名称': missing_names,
            '未找到产品型号的产品型号': missing_models,
        },
        columns=['未找到产品型号的产品名称', '未找到产品型号的产品型号'],
    )
    df_find = pd.DataFrame(
        {
            '找到产品型号的产品名称': found_names,
            '找到产品型号的产品型号': found_models,
        },
        columns=['找到产品型号的产品名称', '找到产品型号的产品型号'],
    )

    df_notfind.to_excel(r'激光打印机客户数据0511_妹找到型号.xlsx', index=False)
    df_find.to_excel(r'激光打印机客户数据0511_找到了型号.xlsx', index=False)

    print("找到型号的数量为")
    print(len(found_models))
    print("妹找到型号的数量为")
    print(len(missing_models))
# Load the first sheet of the customer workbook, reading every column that
# is used below as text.
df = pd.read_excel(
    '激光打印机客户数据0511.xlsx',
    sheet_name=0,
    converters={
        'ID': str,
        'SUP_P_NAME': str,
        'SUP_P_PARAMS': str,
        'ZD_P_LASTCATEGORY_NAME': str,
        'ZD_P_BRAND_NAME': str,
    },
)
# Alternative input (scanner data), same columns:
#df = pd.read_excel('扫描仪数据_20210513.xlsx',sheet_name = 0,converters={'ID':str,'SUP_P_NAME':str,'SUP_P_PARAMS':str,'ZD_P_LASTCATEGORY_NAME':str,'ZD_P_BRAND_NAME':str})

# One plain Python list per column.
id_list = df['ID'].tolist()
sup_p_name_list = df['SUP_P_NAME'].tolist()
sup_p_params_list = df['SUP_P_PARAMS'].tolist()
zd_p_brand_name_list = df['ZD_P_BRAND_NAME'].tolist()
zd_p_lastcategory_name_list = df['ZD_P_LASTCATEGORY_NAME'].tolist()

# Extract one model per row from (name, params, brand).
model_extract_list = [
    laserprinter_model_extract(name, params, brand_name)
    for name, params, brand_name in zip(
        sup_p_name_list, sup_p_params_list, zd_p_brand_name_list
    )
]
# Uncomment to export the found / not-found split to Excel:
#find_nonmatch_model_id(model_extract_list,id_list,sup_p_name_list)
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment