Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
重
重点类信息提取
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ZGC_INDEX
重点类信息提取
Commits
d54af336
Commit
d54af336
authored
Apr 12, 2021
by
Jialin
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
代码修改
parent
a5316846
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
91 additions
and
58 deletions
+91
-58
产品品牌分析.py
公共代码/产品品牌分析.py
+8
-13
产品类别分析.py
公共代码/产品类别分析.py
+46
-18
产品重复型号分析.py
公共代码/产品重复型号分析.py
+19
-16
扫描仪产品品牌分析.xlsx
公共代码/扫描仪产品品牌分析.xlsx
+0
-0
爬虫信息分析.py
公共代码/爬虫信息分析.py
+18
-11
No files found.
公共代码/产品品牌分析.py
View file @
d54af336
...
...
@@ -3,7 +3,6 @@
import
pandas
as
pd
import
numpy
as
np
import
re
import
xlsxwriter
...
...
@@ -11,10 +10,14 @@ import xlsxwriter
def
brand_washing
(
filepath
,
thre
=
0.5
,
inner_thre
=
0.5
,
a
=
1
,
sheet_name
=
0
):
# filepath:文件路径,thre为两个品牌下型号重合率阈值,inner_thre为两个品牌下某条型号内关键词重合率阈值,a为权重调整,sheet_name为表单名
df
=
pd
.
read_excel
(
filepath
,
sheet_name
=
sheet_name
,
converters
=
{
'产品编码'
:
str
})
df_null
=
pd
.
read_excel
(
".
\\
异常数据表格.xlsx"
)
invalid_list
=
df_null
[
'异常数据名称'
]
.
values
# 处理缺失值
col1
=
(
df
[
'产品品牌'
]
!=
'暂无数据'
)
==
((
df
[
'产品品牌'
]
!=
'无参数,需补充'
)
==
(
df
[
'产品品牌'
]
.
notnull
()))
col2
=
(
df
[
'产品型号'
]
!=
'暂无数据'
)
==
((
df
[
'产品型号'
]
!=
'无参数,需补充'
)
==
(
df
[
'产品型号'
]
.
notnull
()))
result
=
df
.
loc
[
df
.
index
[
col1
==
col2
]]
valid_index
=
[]
for
i
in
df
.
index
:
if
df
.
loc
[
i
,
'产品品牌'
]
not
in
invalid_list
and
df
.
loc
[
i
,
'产品型号'
]
not
in
invalid_list
:
valid_index
.
append
(
i
)
result
=
df
.
loc
[
valid_index
]
# 将df数据格式转为字符串
for
i
in
result
.
columns
:
result
[
i
]
=
result
[
i
]
.
astype
(
str
)
...
...
@@ -152,7 +155,7 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
tempo_list
.
extend
(
word_list
)
related_brand3
.
append
(
tempo_list
)
#
写入excel
#写入excel
workbook
=
xlsxwriter
.
Workbook
(
'./brand_filter.xlsx'
)
bold_format
=
workbook
.
add_format
({
'bold'
:
True
})
...
...
@@ -162,11 +165,6 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
worksheet
.
write
(
'B1'
,
'品牌B'
,
bold_format
)
worksheet
.
write
(
'C1'
,
'正确品牌'
,
bold_format
)
worksheet
.
write
(
'D1'
,
'方法'
,
bold_format
)
# worksheet.write('E1', '品牌B-2', bold_format)
# worksheet.write('F1', '品牌-2', bold_format)
# worksheet.write('G1', '品牌A-3', bold_format)
# worksheet.write('H1', '品牌B-3', bold_format)
# worksheet.write('I1', '品牌-3', bold_format)
col
=
0
row
=
1
for
list_i
in
related_brand1
:
...
...
@@ -238,9 +236,6 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
row
-=
2
col
+=
1
row
+=
3
workbook
.
close
()
if
__name__
==
'__main__'
:
...
...
公共代码/产品类别分析.py
View file @
d54af336
...
...
@@ -16,14 +16,19 @@ import numpy as np
def
class_washing
(
category
,
filepath
,
c_list
,
a
=
0.02
,
b
=
0.01
):
df_null
=
pd
.
read_excel
(
".
\\
异常数据表格.xlsx"
)
invalid_list
=
df_null
[
'异常数据名称'
]
.
values
df
=
pd
.
read_excel
(
filepath
,
converters
=
{
'产品编码'
:
str
})
df
.
drop
(
columns
=
'Unnamed: 0'
,
axis
=
1
,
inplace
=
True
)
# 检测每列数据中 同一数据类型的产品数量少于产品总数量的b 的产品index.(可优化成字典形式存储读取)
dtype_minor_dict
=
{}
for
col
in
df
.
columns
:
type_list
=
{}
valid_index
=
df
.
index
[(
df
[
col
]
!=
'暂无数据'
)
==
((
df
[
col
]
!=
'无参数,需补充'
)
==
(
df
[
col
]
.
notnull
()))]
for
i
in
valid_index
:
valid_index
=
[]
for
i
in
df
.
index
:
if
df
.
loc
[
i
,
col
]
in
invalid_list
:
continue
valid_index
.
append
(
i
)
data_type
=
type
(
df
.
loc
[
i
,
col
])
if
data_type
not
in
type_list
:
type_list
[
data_type
]
=
1
...
...
@@ -58,7 +63,12 @@ def class_washing(category, filepath, c_list,a=0.02, b=0.01):
father_brand_minor
=
[]
father_brand_list
=
[]
col
=
'产品父品牌'
valid_df
=
df
.
loc
[
df
.
index
[(
df
[
col
]
!=
'暂无数据'
)
==
((
df
[
col
]
!=
'无参数,需补充'
)
==
(
df
[
col
]
.
notnull
()))]]
valid_index
=
[]
for
i
in
df
.
index
:
if
df
.
loc
[
i
,
col
]
in
invalid_list
:
continue
valid_index
.
append
(
i
)
valid_df
=
df
.
loc
[
valid_index
]
father_brand_num
=
valid_df
.
groupby
(
by
=
'产品父品牌'
)[
'产品编码'
]
.
count
()
# 之所以用产品编码来计数是因为产品编码肯定不会有缺失值
father_num_list
=
[
x
for
x
in
father_brand_num
.
unique
()
if
x
<
len
(
valid_df
.
index
)
*
a
]
for
i
in
father_brand_num
.
index
:
# i 就是产品父品牌
...
...
@@ -73,7 +83,12 @@ def class_washing(category, filepath, c_list,a=0.02, b=0.01):
brand_minor
=
[]
brand_list
=
[]
col
=
'产品品牌'
valid_df
=
df
.
loc
[
df
.
index
[(
df
[
col
]
!=
'暂无数据'
)
==
((
df
[
col
]
!=
'无参数,需补充'
)
==
(
df
[
col
]
.
notnull
()))]]
valid_index
=
[]
for
i
in
df
.
index
:
if
df
.
loc
[
i
,
col
]
in
invalid_list
:
continue
valid_index
.
append
(
i
)
valid_df
=
df
.
loc
[
valid_index
]
brand_num
=
valid_df
.
groupby
(
by
=
'产品品牌'
)[
'产品编码'
]
.
count
()
# 同上
num_list
=
[
x
for
x
in
brand_num
.
unique
()
if
x
<
len
(
valid_df
.
index
)
*
a
]
for
i
in
brand_num
.
index
:
...
...
@@ -88,26 +103,34 @@ def class_washing(category, filepath, c_list,a=0.02, b=0.01):
length_minor_dict
=
{}
for
col
in
df
.
columns
[
7
:
-
2
]:
col_length
=
[]
valid_index
=
df
.
index
[(
df
[
col
]
!=
'暂无数据'
)
==
((
df
[
col
]
!=
'无参数,需补充'
)
==
(
df
[
col
]
.
notnull
()))]
for
i
in
valid_index
:
valid_index
=
[]
for
i
in
df
.
index
:
if
df
.
loc
[
i
,
col
]
in
invalid_list
:
continue
valid_index
.
append
(
i
)
col_length
.
append
(
len
(
df
.
loc
[
i
,
col
]))
std
=
np
.
array
(
col_length
)
.
std
()
mean
=
np
.
array
(
col_length
)
.
mean
()
for
counter
,
length
in
enumerate
(
col_length
):
if
length
<
mean
-
2
*
std
or
length
>
mean
+
2
*
std
:
# length_minor_dict[valid_index[counter]]=col
index
=
valid_index
[
counter
]
if
index
in
length_minor_dict
.
keys
():
length_minor_dict
[
index
]
+=
' '
+
col
elif
index
not
in
length_minor_dict
.
keys
():
length_minor_dict
[
index
]
=
col
if
col_length
:
std
=
np
.
array
(
col_length
)
.
std
()
mean
=
np
.
array
(
col_length
)
.
mean
()
for
counter
,
length
in
enumerate
(
col_length
):
if
length
<
mean
-
2
*
std
or
length
>
mean
+
2
*
std
:
# length_minor_dict[valid_index[counter]]=col
index
=
valid_index
[
counter
]
if
index
in
length_minor_dict
.
keys
():
length_minor_dict
[
index
]
+=
' '
+
col
elif
index
not
in
length_minor_dict
.
keys
():
length_minor_dict
[
index
]
=
col
length_minor
=
[]
length_minor
.
extend
(
length_minor_dict
.
keys
())
# 检测产品参数列数据格式小于总数量的b的产品
format_minor_dict
=
{}
for
col
in
df
.
columns
[
7
:
-
2
]:
counter_dict
=
{}
valid_index
=
df
.
index
[(
df
[
col
]
!=
'暂无数据'
)
==
((
df
[
col
]
!=
'无参数,需补充'
)
==
(
df
[
col
]
.
notnull
()))]
valid_index
=
[]
for
i
in
df
.
index
:
if
df
.
loc
[
i
,
col
]
in
invalid_list
:
continue
valid_index
.
append
(
i
)
for
i
in
valid_index
:
counter_list
=
[]
k
=
df
.
loc
[
i
,
col
]
...
...
@@ -161,7 +184,12 @@ def class_washing(category, filepath, c_list,a=0.02, b=0.01):
for
col_i
in
df
.
columns
[
c_list
]:
tempo_list
=
[]
tempo_list2
=
[]
valid_df
=
df
.
loc
[
df
.
index
[(
df
[
col_i
]
!=
'暂无数据'
)
==
((
df
[
col_i
]
!=
'无参数,需补充'
)
==
(
df
[
col_i
]
.
notnull
()))]]
valid_index
=
[]
for
i
in
df
.
index
:
if
df
.
loc
[
i
,
col_i
]
in
invalid_list
:
continue
valid_index
.
append
(
i
)
valid_df
=
df
.
loc
[
valid_index
]
cha_num
=
valid_df
.
groupby
(
by
=
col_i
)[
'产品编码'
]
.
count
()
num_list
=
[
x
for
x
in
cha_num
.
unique
()
if
x
<
len
(
valid_df
.
index
)
*
b
]
for
i
in
cha_num
.
index
:
...
...
公共代码/产品重复型号分析.py
View file @
d54af336
...
...
@@ -13,6 +13,8 @@ import xlsxwriter
def
product_washing
(
filepath
,
thre
=
1
,
a
=
0
):
df_null
=
pd
.
read_excel
(
".
\\
异常数据表格.xlsx"
)
invalid_list
=
df_null
[
'异常数据名称'
]
.
values
df
=
pd
.
read_excel
(
filepath
,
converters
=
{
'产品编码'
:
str
})
df
.
drop
(
columns
=
'Unnamed: 0'
,
axis
=
1
,
inplace
=
True
)
for
col
in
df
.
columns
:
...
...
@@ -21,13 +23,13 @@ def product_washing(filepath, thre=1, a=0):
related_product
=
[]
brand_grouped
=
df
.
groupby
(
by
=
'产品品牌'
)
for
brand
in
brand_grouped
:
if
brand
[
0
]
==
'无参数,需补充'
:
if
brand
[
0
]
in
invalid_list
:
continue
tempo_dict
=
{}
# 每个品牌提取产品型号关键字,放入tempo_dict
for
i
in
brand
[
1
]
.
index
:
k
=
brand
[
1
]
.
loc
[
i
,
'产品型号'
]
if
k
==
'无参数,需补充'
:
if
k
in
invalid_list
:
continue
pre_num
=
re
.
search
(
r'([A-Za-z]{0,4})\W?\d+'
,
k
)
# pre_num为数字前的关键字
if
pre_num
:
...
...
@@ -56,6 +58,8 @@ def product_washing(filepath, thre=1, a=0):
pos_num
+=
'+'
combined
=
pre_num
+
num
+
pos_num
# 将关键字列表合并
while
''
in
combined
:
combined
.
remove
(
''
)
# pre_num = re.findall(r'^([A-Za-z]{0,4})\W?\d+', k) # pre_num为数字前的关键字
# if not pre_num:
# pre_num = re.findall(r'[\u4e00-\u9fa5]+([A-Za-z]{0,4})\W?\d+', k)
...
...
@@ -74,30 +78,29 @@ def product_washing(filepath, thre=1, a=0):
#
# combined = pre_num + num + pos_num # 将关键字列表合并
# 提取品牌名关键字
temp_list1
=
re
.
findall
(
r'([\u4e00-\u9fa5]+)'
,
brand
[
0
]
.
upper
())
# 提取汉字
while
'新建品牌'
in
temp_list1
:
temp_list1
.
remove
(
'新建品牌'
)
# 去除‘新建品牌’
temp_list2
=
re
.
findall
(
r'[A-Za-z]+\W?[A-Za-z]+'
,
brand
[
0
]
.
upper
())
# 提取英文单词
brand_combined
=
temp_list1
+
temp_list2
# # 提取品牌名关键字
# temp_list1 = re.findall(r'([\u4e00-\u9fa5]+)', brand[0].upper()) # 提取汉字
# while '新建品牌' in temp_list1:
# temp_list1.remove('新建品牌') # 去除‘新建品牌’
# temp_list2 = re.findall(r'[A-Za-z]+\W?[A-Za-z]+', brand[0].upper()) # 提取英文单词
# brand_combined = temp_list1+temp_list2
while
''
in
combined
:
combined
.
remove
(
''
)
tempo_dict
[
i
]
=
[
set
(
brand_combined
),
set
(
combined
)
]
tempo_dict
[
i
]
=
[
set
(
combined
),
brand
[
1
]
.
loc
[
i
,
'*质保时间'
],
brand
[
1
]
.
loc
[
i
,
'标配外服务及配件'
]
]
# 对比产品型号关键字,相同则放入related_product
tested_product
=
[]
for
i
in
tempo_dict
:
for
j
in
tempo_dict
:
if
i
!=
j
and
set
([
i
,
j
])
not
in
tested_product
:
if
tempo_dict
[
i
][
0
]
==
tempo_dict
[
j
][
0
]:
if
tempo_dict
[
i
][
1
:]
==
tempo_dict
[
j
][
1
:
]:
accuracy_i
=
0
accuracy_j
=
0
for
word_i
in
tempo_dict
[
i
][
1
]:
if
word_i
in
tempo_dict
[
j
][
1
]:
accuracy_i
+=
1
/
(
len
(
tempo_dict
[
i
][
1
])
+
a
/
len
(
tempo_dict
[
i
][
1
]))
accuracy_j
+=
1
/
(
len
(
tempo_dict
[
j
][
1
])
+
a
/
len
(
tempo_dict
[
j
][
1
]))
for
word_i
in
tempo_dict
[
i
][
0
]:
if
word_i
in
tempo_dict
[
j
][
0
]:
accuracy_i
+=
1
/
(
len
(
tempo_dict
[
i
][
0
])
+
a
/
len
(
tempo_dict
[
i
][
0
]))
accuracy_j
+=
1
/
(
len
(
tempo_dict
[
j
][
0
])
+
a
/
len
(
tempo_dict
[
j
][
0
]))
if
accuracy_i
>=
thre
or
accuracy_j
>=
thre
:
if
not
(
df
.
loc
[
i
,
'产品型号'
]
.
endswith
(
'+'
)
^
df
.
loc
[
j
,
'产品型号'
]
.
endswith
(
'+'
)):
related_product
.
append
(
set
([
i
,
j
]))
...
...
公共代码/扫描仪产品品牌分析.xlsx
deleted
100644 → 0
View file @
a5316846
File deleted
公共代码/爬虫信息分析.py
View file @
d54af336
...
...
@@ -33,8 +33,8 @@ def pachong_washing(filepath):
if
num
:
num
=
num
.
group
(
1
)
# 如果连续数字超过1处,我们只提取第一处,这个可能有点问题但目前还不知道怎么办
if
not
num
:
# 如果没有数字,就比较英文单词
num
=
re
.
findall
(
r'[A-Za-z]+'
,
k
)
num
=
''
.
join
(
num
)
alpha
=
re
.
findall
(
r'[A-Za-z]+'
,
k
)
alpha
=
''
.
join
(
alpha
)
pos_num
=
re
.
findall
(
r'\d+([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?'
,
k
)
# pos_num为数字后的关键字
...
...
@@ -55,8 +55,8 @@ def pachong_washing(filepath):
comparing_df
.
loc
[
i
,
'爬取数据pre_num'
]
=
pre_num
if
num
:
comparing_df
.
loc
[
i
,
'爬取数据num'
]
=
num
#
if not num:
#
comparing_df.loc[i, '爬取数据alpha'] = alpha
if
not
num
:
comparing_df
.
loc
[
i
,
'爬取数据alpha'
]
=
alpha
comparing_df
.
loc
[
i
,
'爬取数据pos_num'
]
=
pos_num
for
i
in
df
.
index
:
...
...
@@ -73,8 +73,8 @@ def pachong_washing(filepath):
if
num
:
num
=
num
.
group
(
1
)
# 如果连续数字超过1处,我们只提取第一处,这个可能有点问题但目前还不知道怎么办
if
not
num
:
# 如果没有数字,就比较英文单词
num
=
re
.
findall
(
r'[A-Za-z]+'
,
k
)
num
=
''
.
join
(
num
)
alpha
=
re
.
findall
(
r'[A-Za-z]+'
,
k
)
alpha
=
''
.
join
(
alpha
)
pos_num
=
re
.
findall
(
r'\d+([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?'
,
...
...
@@ -106,10 +106,17 @@ def pachong_washing(filepath):
df
.
loc
[
i
,
col
]
=
'暂无数据'
continue
if
comparing_df
.
loc
[
i
,
'补充型号num'
]
!=
comparing_df
.
loc
[
i
,
'爬取数据num'
]:
for
col
in
df
.
columns
[
15
:
21
]:
df
.
loc
[
i
,
col
]
=
'暂无数据'
continue
if
comparing_df
.
loc
[
i
,
'补充型号num'
]
!=
comparing_df
.
loc
[
i
,
'爬取数据num'
]:
# 如果没有num,此处为nan, nan!=nan所以没问题
if
type
(
comparing_df
.
loc
[
i
,
'补充型号num'
])
!=
float
:
for
col
in
df
.
columns
[
15
:
21
]:
df
.
loc
[
i
,
col
]
=
'暂无数据'
continue
else
:
if
comparing_df
.
loc
[
i
,
'补充型号alpha'
]
not
in
comparing_df
.
loc
[
i
,
'爬取数据alpha'
]:
for
col
in
df
.
columns
[
15
:
21
]:
df
.
loc
[
i
,
col
]
=
'暂无数据'
continue
if
comparing_df
.
loc
[
i
,
'补充型号pos_num'
]
!=
comparing_df
.
loc
[
i
,
'爬取数据pos_num'
]:
...
...
@@ -117,7 +124,7 @@ def pachong_washing(filepath):
df
.
loc
[
i
,
col
]
=
'暂无数据'
continue
df
.
to_excel
(
'./
after_lijie
.xlsx'
)
df
.
to_excel
(
'./
爬取数据分析
.xlsx'
)
if
__name__
==
'__main__'
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment