Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
重
重点类信息提取
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ZGC_INDEX
重点类信息提取
Commits
5f7f3949
Commit
5f7f3949
authored
May 08, 2021
by
Jialin
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
代码更新
parent
62dacfbd
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
58 additions
and
45 deletions
+58
-45
产品重复型号分析.py
公共代码/产品重复型号分析.py
+58
-45
No files found.
公共代码/产品重复型号分析.py
View file @
5f7f3949
...
...
@@ -33,6 +33,7 @@ def product_washing(filepath, category, thre=1, a=0):
for
param
in
other_parameters_fetch
:
other_parameters
.
append
(
param
[
0
])
point_category_list
=
[
'台式机'
,
'笔记本'
,
'一体电脑'
,
'复印纸'
]
related_product
=
[]
brand_grouped
=
df
.
groupby
(
by
=
'产品品牌'
)
for
brand
in
brand_grouped
:
...
...
@@ -41,38 +42,47 @@ def product_washing(filepath, category, thre=1, a=0):
tempo_dict
=
{}
# 每个品牌提取产品型号关键字,放入tempo_dict
for
i
in
brand
[
1
]
.
index
:
k
=
brand
[
1
]
.
loc
[
i
,
'*产品型号'
]
if
k
in
invalid_list
:
continue
pre_num
=
re
.
search
(
r'([A-Za-z]{0,4})\W?\d+'
,
k
)
# pre_num为数字前的关键字
if
pre_num
:
pre_num
=
[
pre_num
.
group
(
1
)]
if
not
pre_num
:
pre_num
=
[]
num
=
re
.
search
(
r'(\d+)'
,
k
)
# num为数字关键字
if
num
:
num
=
[
num
.
group
(
1
)]
# 如果连续数字超过1处,我们只提取第一处,这个可能有点问题但目前还不知道怎么办
if
not
num
:
# 如果没有数字,就比较英文单词
num
=
re
.
findall
(
r'[A-Za-z]+'
,
k
)
pos_num
=
re
.
findall
(
r'\d+([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?'
,
k
)
# pos_num为数字后的关键字
if
pos_num
:
pos_num
=
list
(
pos_num
[
0
])
if
'升级版'
in
k
:
pos_num
+=
'升级版'
if
'专业版'
in
k
:
pos_num
+=
'专业版'
if
'教育版'
in
k
:
pos_num
+=
'教育版'
if
'+'
in
k
:
pos_num
+=
'+'
combined
=
pre_num
+
num
+
pos_num
# 将关键字列表合并
while
''
in
combined
:
combined
.
remove
(
''
)
if
category
in
point_category_list
:
if
category
==
'复印纸'
:
k
=
brand
[
1
]
.
loc
[
i
,
'*产品系列'
]
else
:
k
=
brand
[
1
]
.
loc
[
i
,
'*产品子系列'
]
k
.
replace
(
' '
,
''
)
.
replace
(
'系列'
,
''
)
.
replace
(
'_'
,
''
)
.
replace
(
'-'
,
''
)
.
upper
()
tempo_dict
[
i
]
=
[
k
]
else
:
k
=
brand
[
1
]
.
loc
[
i
,
'*产品型号'
]
if
k
in
invalid_list
:
continue
pre_num
=
re
.
search
(
r'([A-Za-z]{0,4})\W?\d+'
,
k
)
# pre_num为数字前的关键字
if
pre_num
:
pre_num
=
[
pre_num
.
group
(
1
)]
if
not
pre_num
:
pre_num
=
[]
num
=
re
.
search
(
r'(\d+)'
,
k
)
# num为数字关键字
if
num
:
num
=
[
num
.
group
(
1
)]
# 如果连续数字超过1处,我们只提取第一处,这个可能有点问题但目前还不知道怎么办
if
not
num
:
# 如果没有数字,就比较英文单词
num
=
re
.
findall
(
r'[A-Za-z]+'
,
k
)
pos_num
=
re
.
findall
(
r'\d+([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?'
,
k
)
# pos_num为数字后的关键字
if
pos_num
:
pos_num
=
list
(
pos_num
[
0
])
if
'升级版'
in
k
:
pos_num
+=
'升级版'
if
'专业版'
in
k
:
pos_num
+=
'专业版'
if
'教育版'
in
k
:
pos_num
+=
'教育版'
if
'+'
in
k
:
pos_num
+=
'+'
combined
=
pre_num
+
num
+
pos_num
# 将关键字列表合并
while
''
in
combined
:
combined
.
remove
(
''
)
# pre_num = re.findall(r'^([A-Za-z]{0,4})\W?\d+', k) # pre_num为数字前的关键字
# if not pre_num:
# pre_num = re.findall(r'[\u4e00-\u9fa5]+([A-Za-z]{0,4})\W?\d+', k)
...
...
@@ -98,7 +108,7 @@ def product_washing(filepath, category, thre=1, a=0):
# temp_list2 = re.findall(r'[A-Za-z]+\W?[A-Za-z]+', brand[0].upper()) # 提取英文单词
# brand_combined = temp_list1+temp_list2
tempo_dict
[
i
]
=
[
set
(
combined
)]
tempo_dict
[
i
]
=
[
set
(
combined
)]
other_parameters_values
=
[]
for
parameter
in
other_parameters
:
...
...
@@ -119,16 +129,19 @@ def product_washing(filepath, category, thre=1, a=0):
if
count
!=
0
:
tested_product
.
append
(
set
([
i
,
j
]))
break
# 如果出现了必须相等但不相等的参数,则退出到上一层循环。如果没有出现,则对比型号
accuracy_i
=
0
accuracy_j
=
0
for
word_i
in
tempo_dict
[
i
][
0
]:
if
word_i
in
tempo_dict
[
j
][
0
]:
accuracy_i
+=
1
/
(
len
(
tempo_dict
[
i
][
0
])
+
a
/
len
(
tempo_dict
[
i
][
0
]))
accuracy_j
+=
1
/
(
len
(
tempo_dict
[
j
][
0
])
+
a
/
len
(
tempo_dict
[
j
][
0
]))
if
accuracy_i
>=
thre
or
accuracy_j
>=
thre
:
if
not
(
df
.
loc
[
i
,
'*产品型号'
]
.
endswith
(
'+'
)
^
df
.
loc
[
j
,
'*产品型号'
]
.
endswith
(
'+'
)):
related_product
.
append
(
set
([
i
,
j
]))
if
category
in
point_category_list
:
if
tempo_dict
[
i
][
0
]
==
tempo_dict
[
j
][
0
]:
related_product
.
append
(
set
([
i
,
j
]))
else
:
accuracy_i
=
0
accuracy_j
=
0
for
word_i
in
tempo_dict
[
i
][
0
]:
if
word_i
in
tempo_dict
[
j
][
0
]:
accuracy_i
+=
1
/
(
len
(
tempo_dict
[
i
][
0
])
+
a
/
len
(
tempo_dict
[
i
][
0
]))
accuracy_j
+=
1
/
(
len
(
tempo_dict
[
j
][
0
])
+
a
/
len
(
tempo_dict
[
j
][
0
]))
if
accuracy_i
>=
thre
or
accuracy_j
>=
thre
:
if
not
(
df
.
loc
[
i
,
'*产品型号'
]
.
endswith
(
'+'
)
^
df
.
loc
[
j
,
'*产品型号'
]
.
endswith
(
'+'
)):
related_product
.
append
(
set
([
i
,
j
]))
tested_product
.
append
(
set
([
i
,
j
]))
# a = set([i])
...
...
@@ -179,6 +192,6 @@ def product_washing(filepath, category, thre=1, a=0):
if
__name__
==
'__main__'
:
filepath
=
"E:
\\
ZDZC
\\
激光打印机
参数确认.xlsx"
category
=
'
激光打印机
'
filepath
=
"E:
\\
ZDZC
\\
Sourcetree_local
\\
公共代码
\\
一体电脑
参数确认.xlsx"
category
=
'
一体电脑
'
product_washing
(
filepath
,
category
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment