Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
重
重点类信息提取
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ZGC_INDEX
重点类信息提取
Commits
c040d93d
Commit
c040d93d
authored
Apr 14, 2021
by
Jialin
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
代码最终修改
parent
d54af336
Changes
5
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
74 additions
and
33 deletions
+74
-33
产品品牌分析.py
公共代码/产品品牌分析.py
+56
-25
产品类别分析.py
公共代码/产品类别分析.py
+0
-0
产品重复型号分析.py
公共代码/产品重复型号分析.py
+18
-8
异常数据表格.xlsx
公共代码/异常数据表格.xlsx
+0
-0
激光打印机参数确认.xlsx
公共代码/激光打印机参数确认.xlsx
+0
-0
No files found.
公共代码/产品品牌分析.py
View file @
c040d93d
...
@@ -5,6 +5,7 @@
...
@@ -5,6 +5,7 @@
import
pandas
as
pd
import
pandas
as
pd
import
re
import
re
import
xlsxwriter
import
xlsxwriter
import
numpy
as
np
def
brand_washing
(
filepath
,
thre
=
0.5
,
inner_thre
=
0.5
,
a
=
1
,
sheet_name
=
0
):
def
brand_washing
(
filepath
,
thre
=
0.5
,
inner_thre
=
0.5
,
a
=
1
,
sheet_name
=
0
):
...
@@ -15,7 +16,7 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
...
@@ -15,7 +16,7 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
# 处理缺失值
# 处理缺失值
valid_index
=
[]
valid_index
=
[]
for
i
in
df
.
index
:
for
i
in
df
.
index
:
if
df
.
loc
[
i
,
'产品品牌'
]
not
in
invalid_list
and
df
.
loc
[
i
,
'产品型号'
]
not
in
invalid_list
:
if
df
.
loc
[
i
,
'产品品牌'
]
not
in
invalid_list
and
df
.
loc
[
i
,
'
*
产品型号'
]
not
in
invalid_list
:
valid_index
.
append
(
i
)
valid_index
.
append
(
i
)
result
=
df
.
loc
[
valid_index
]
result
=
df
.
loc
[
valid_index
]
# 将df数据格式转为字符串
# 将df数据格式转为字符串
...
@@ -27,7 +28,7 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
...
@@ -27,7 +28,7 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
# 将df按品牌分类,按品牌提取每个型号的关键字,放入字典
# 将df按品牌分类,按品牌提取每个型号的关键字,放入字典
brand_type
=
result
.
groupby
(
'产品品牌'
)
brand_type
=
result
.
groupby
(
'产品品牌'
)
for
brand
in
brand_type
:
for
brand
in
brand_type
:
result_unique
=
brand
[
1
][
'产品型号'
]
.
unique
()
# result_unique此时是array,元素是一个品牌名下的型号
result_unique
=
brand
[
1
][
'
*
产品型号'
]
.
unique
()
# result_unique此时是array,元素是一个品牌名下的型号
for
j
in
range
(
len
(
result_unique
)):
for
j
in
range
(
len
(
result_unique
)):
result_unique
[
j
]
=
result_unique
[
j
]
.
upper
()
.
strip
()
result_unique
[
j
]
=
result_unique
[
j
]
.
upper
()
.
strip
()
result_unique
=
pd
.
DataFrame
(
result_unique
)[
0
]
.
unique
()
# 全部变为大写,将大小写归一,result_unique此时是array
result_unique
=
pd
.
DataFrame
(
result_unique
)[
0
]
.
unique
()
# 全部变为大写,将大小写归一,result_unique此时是array
...
@@ -132,7 +133,7 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
...
@@ -132,7 +133,7 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
# 第三种方法,对比品牌名和另一个品牌型号关键字,如果有一个重合,就算作两个品牌相似
# 第三种方法,对比品牌名和另一个品牌型号关键字,如果有一个重合,就算作两个品牌相似
type_kw
=
{}
# 用于接收品牌型号提取的关键字
type_kw
=
{}
# 用于接收品牌型号提取的关键字
for
i
in
brand_type
:
for
i
in
brand_type
:
result_unique
=
i
[
1
][
'产品型号'
]
.
unique
()
# 品牌型号组成的数组,数组内无重复元素
result_unique
=
i
[
1
][
'
*
产品型号'
]
.
unique
()
# 品牌型号组成的数组,数组内无重复元素
for
k
in
range
(
len
(
result_unique
)):
for
k
in
range
(
len
(
result_unique
)):
result_unique
[
k
]
=
result_unique
[
k
]
.
upper
()
.
strip
()
result_unique
[
k
]
=
result_unique
[
k
]
.
upper
()
.
strip
()
result_unique
=
pd
.
DataFrame
(
result_unique
)[
0
]
.
unique
()
result_unique
=
pd
.
DataFrame
(
result_unique
)[
0
]
.
unique
()
...
@@ -156,6 +157,36 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
...
@@ -156,6 +157,36 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
related_brand3
.
append
(
tempo_list
)
related_brand3
.
append
(
tempo_list
)
#写入excel
#写入excel
tempo_list
=
[]
# 将所有的相似品牌,两两一对,写入集合,放入tempo_list
method1
=
[]
method2
=
[]
method3
=
[]
related_brand_list
=
[
related_brand1
,
related_brand2
,
related_brand3
]
method_list
=
[
method1
,
method2
,
method3
]
for
i
in
range
(
len
(
related_brand_list
)):
for
list_i
in
related_brand_list
[
i
]:
tempo_list
.
append
(
set
(
list_i
[:
2
]))
method_list
[
i
]
.
append
(
set
(
list_i
[:
2
]))
final_list
=
[]
# final_list就是tempo_list的去重
for
item
in
tempo_list
:
if
item
not
in
final_list
:
final_list
.
append
(
item
)
method
=
[]
for
item
in
final_list
:
linshi_list
=
[]
if
item
in
method1
:
linshi_list
.
append
(
'1'
)
if
item
in
method2
:
linshi_list
.
append
(
'2'
)
if
item
in
method3
:
linshi_list
.
append
(
'3'
)
method
.
append
(
','
.
join
(
linshi_list
))
workbook
=
xlsxwriter
.
Workbook
(
'./brand_filter.xlsx'
)
workbook
=
xlsxwriter
.
Workbook
(
'./brand_filter.xlsx'
)
bold_format
=
workbook
.
add_format
({
'bold'
:
True
})
bold_format
=
workbook
.
add_format
({
'bold'
:
True
})
...
@@ -167,32 +198,32 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
...
@@ -167,32 +198,32 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
worksheet
.
write
(
'D1'
,
'方法'
,
bold_format
)
worksheet
.
write
(
'D1'
,
'方法'
,
bold_format
)
col
=
0
col
=
0
row
=
1
row
=
1
for
list_i
in
related_brand1
:
for
index
in
range
(
len
(
final_list
))
:
for
brand_i
in
range
(
2
)
:
for
brand_i
in
final_list
[
index
]
:
worksheet
.
write_string
(
row
,
col
,
list_i
[
brand_i
]
)
worksheet
.
write_string
(
row
,
col
,
brand_i
)
col
+=
1
col
+=
1
col
=
3
col
=
3
worksheet
.
write_string
(
row
,
col
,
'1'
)
worksheet
.
write_string
(
row
,
col
,
method
[
index
]
)
row
+=
1
row
+=
1
col
=
0
col
=
0
for
list_i
in
related_brand2
:
#
for list_i in related_brand2:
for
brand_i
in
range
(
2
):
#
for brand_i in range(2):
worksheet
.
write_string
(
row
,
col
,
list_i
[
brand_i
])
#
worksheet.write_string(row, col, list_i[brand_i])
col
+=
1
#
col += 1
col
=
3
#
col = 3
worksheet
.
write_string
(
row
,
col
,
'2'
)
#
worksheet.write_string(row, col, '2')
row
+=
1
#
row += 1
col
=
0
#
col = 0
#
for
list_i
in
related_brand3
:
#
for list_i in related_brand3:
for
brand_i
in
range
(
2
):
#
for brand_i in range(2):
worksheet
.
write_string
(
row
,
col
,
list_i
[
brand_i
])
#
worksheet.write_string(row, col, list_i[brand_i])
col
+=
1
#
col += 1
col
=
3
#
col = 3
worksheet
.
write_string
(
row
,
col
,
'3'
)
#
worksheet.write_string(row, col, '3')
row
+=
1
#
row += 1
col
=
0
#
col = 0
# 第二个worksheet
# 第二个worksheet
worksheet2
=
workbook
.
add_worksheet
(
name
=
'Sheet2'
)
worksheet2
=
workbook
.
add_worksheet
(
name
=
'Sheet2'
)
...
@@ -239,6 +270,6 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
...
@@ -239,6 +270,6 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
workbook
.
close
()
workbook
.
close
()
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
filepath
=
'E:
\\
ZDZC
\\
激光打印机
参数确认.xlsx'
filepath
=
'E:
\\
ZDZC
\\
扫描仪
参数确认.xlsx'
brand_washing
(
filepath
)
brand_washing
(
filepath
)
公共代码/产品类别分析.py
View file @
c040d93d
This diff is collapsed.
Click to expand it.
公共代码/产品重复型号分析.py
View file @
c040d93d
...
@@ -12,7 +12,7 @@ import numpy as np
...
@@ -12,7 +12,7 @@ import numpy as np
import
xlsxwriter
import
xlsxwriter
def
product_washing
(
filepath
,
thre
=
1
,
a
=
0
):
def
product_washing
(
filepath
,
category
,
thre
=
1
,
a
=
0
):
df_null
=
pd
.
read_excel
(
".
\\
异常数据表格.xlsx"
)
df_null
=
pd
.
read_excel
(
".
\\
异常数据表格.xlsx"
)
invalid_list
=
df_null
[
'异常数据名称'
]
.
values
invalid_list
=
df_null
[
'异常数据名称'
]
.
values
df
=
pd
.
read_excel
(
filepath
,
converters
=
{
'产品编码'
:
str
})
df
=
pd
.
read_excel
(
filepath
,
converters
=
{
'产品编码'
:
str
})
...
@@ -28,7 +28,7 @@ def product_washing(filepath, thre=1, a=0):
...
@@ -28,7 +28,7 @@ def product_washing(filepath, thre=1, a=0):
tempo_dict
=
{}
tempo_dict
=
{}
# 每个品牌提取产品型号关键字,放入tempo_dict
# 每个品牌提取产品型号关键字,放入tempo_dict
for
i
in
brand
[
1
]
.
index
:
for
i
in
brand
[
1
]
.
index
:
k
=
brand
[
1
]
.
loc
[
i
,
'产品型号'
]
k
=
brand
[
1
]
.
loc
[
i
,
'
*
产品型号'
]
if
k
in
invalid_list
:
if
k
in
invalid_list
:
continue
continue
pre_num
=
re
.
search
(
r'([A-Za-z]{0,4})\W?\d+'
,
k
)
# pre_num为数字前的关键字
pre_num
=
re
.
search
(
r'([A-Za-z]{0,4})\W?\d+'
,
k
)
# pre_num为数字前的关键字
...
@@ -85,16 +85,25 @@ def product_washing(filepath, thre=1, a=0):
...
@@ -85,16 +85,25 @@ def product_washing(filepath, thre=1, a=0):
# temp_list2 = re.findall(r'[A-Za-z]+\W?[A-Za-z]+', brand[0].upper()) # 提取英文单词
# temp_list2 = re.findall(r'[A-Za-z]+\W?[A-Za-z]+', brand[0].upper()) # 提取英文单词
# brand_combined = temp_list1+temp_list2
# brand_combined = temp_list1+temp_list2
tempo_dict
[
i
]
=
[
set
(
combined
)]
other_parameters
=
df_null
[
'重复参数项'
][
df_null
[
'类别'
]
==
category
][
df_null
[
'重复参数项'
]
.
notnull
()]
.
values
other_parameters_values
=
[]
for
parameter
in
other_parameters
:
other_parameters_values
.
append
(
brand
[
1
]
.
loc
[
i
,
parameter
])
tempo_dict
[
i
]
.
extend
(
other_parameters_values
)
tempo_dict
[
i
]
=
[
set
(
combined
),
brand
[
1
]
.
loc
[
i
,
'*质保时间'
],
brand
[
1
]
.
loc
[
i
,
'标配外服务及配件'
]]
# 对比产品
型号关键字
,相同则放入related_product
# 对比产品
参数项
,相同则放入related_product
tested_product
=
[]
tested_product
=
[]
for
i
in
tempo_dict
:
for
i
in
tempo_dict
:
for
j
in
tempo_dict
:
for
j
in
tempo_dict
:
if
i
!=
j
and
set
([
i
,
j
])
not
in
tested_product
:
if
i
!=
j
and
set
([
i
,
j
])
not
in
tested_product
:
if
tempo_dict
[
i
][
1
:]
==
tempo_dict
[
j
][
1
:]:
for
index
in
range
(
1
,
len
(
tempo_dict
[
i
])):
if
tempo_dict
[
i
][
index
]
!=
tempo_dict
[
j
][
index
]
and
\
(
tempo_dict
[
i
][
index
]
not
in
invalid_list
and
tempo_dict
[
j
][
index
]
not
in
invalid_list
):
tested_product
.
append
(
set
([
i
,
j
]))
break
# 如果出现了必须相等但不相等的参数,则退出到上一层循环。如果没有出现,则对比型号
accuracy_i
=
0
accuracy_i
=
0
accuracy_j
=
0
accuracy_j
=
0
for
word_i
in
tempo_dict
[
i
][
0
]:
for
word_i
in
tempo_dict
[
i
][
0
]:
...
@@ -102,7 +111,7 @@ def product_washing(filepath, thre=1, a=0):
...
@@ -102,7 +111,7 @@ def product_washing(filepath, thre=1, a=0):
accuracy_i
+=
1
/
(
len
(
tempo_dict
[
i
][
0
])
+
a
/
len
(
tempo_dict
[
i
][
0
]))
accuracy_i
+=
1
/
(
len
(
tempo_dict
[
i
][
0
])
+
a
/
len
(
tempo_dict
[
i
][
0
]))
accuracy_j
+=
1
/
(
len
(
tempo_dict
[
j
][
0
])
+
a
/
len
(
tempo_dict
[
j
][
0
]))
accuracy_j
+=
1
/
(
len
(
tempo_dict
[
j
][
0
])
+
a
/
len
(
tempo_dict
[
j
][
0
]))
if
accuracy_i
>=
thre
or
accuracy_j
>=
thre
:
if
accuracy_i
>=
thre
or
accuracy_j
>=
thre
:
if
not
(
df
.
loc
[
i
,
'产品型号'
]
.
endswith
(
'+'
)
^
df
.
loc
[
j
,
'
产品型号'
]
.
endswith
(
'+'
)):
if
not
(
df
.
loc
[
i
,
'*产品型号'
]
.
endswith
(
'+'
)
^
df
.
loc
[
j
,
'*
产品型号'
]
.
endswith
(
'+'
)):
related_product
.
append
(
set
([
i
,
j
]))
related_product
.
append
(
set
([
i
,
j
]))
tested_product
.
append
(
set
([
i
,
j
]))
tested_product
.
append
(
set
([
i
,
j
]))
...
@@ -152,5 +161,6 @@ def product_washing(filepath, thre=1, a=0):
...
@@ -152,5 +161,6 @@ def product_washing(filepath, thre=1, a=0):
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
filepath
=
"E:
\\
ZDZC
\\
激光打印机参数确认.xlsx"
filepath
=
"E:
\\
ZDZC
\\
扫描仪参数确认.xlsx"
product_washing
(
filepath
)
category
=
'扫描仪'
product_washing
(
filepath
,
category
)
公共代码/异常数据表格.xlsx
0 → 100644
View file @
c040d93d
File added
公共代码/激光打印机参数确认.xlsx
0 → 100644
View file @
c040d93d
File added
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment