Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
重
重点类信息提取
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ZGC_INDEX
重点类信息提取
Commits
e6658a7a
Commit
e6658a7a
authored
Apr 01, 2021
by
Jialin
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
李佳林
parent
ed7d80ba
Changes
4
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
492 additions
and
0 deletions
+492
-0
爬虫信息分析.py
公共代码/爬虫信息分析.py
+126
-0
错误产品分析.py
公共代码/错误产品分析.py
+153
-0
错误品牌分析.py
公共代码/错误品牌分析.py
+0
-0
错误类别分析.py
公共代码/错误类别分析.py
+213
-0
No files found.
公共代码/爬虫信息分析.py
0 → 100644
View file @
e6658a7a
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
@author: dell
@file: after_lijie.py
@time: 2021/03/31
@desc:
"""
import
pandas
as
pd
import
numpy
as
np
import
re
import
xlsxwriter
def _split_model_keywords(text):
    """Split a model string into ``(pre_num, num, pos_num)`` comparison keys.

    * ``pre_num`` -- up to four ASCII letters directly before the first run
      of digits (an optional single non-word character may sit in between);
      ``''`` when the text has no digits.
    * ``num`` -- the first run of digits.  Only the first run is used, which
      is a known limitation.  When the text has no digits at all, every run
      of ASCII letters is concatenated instead; the result may be ``''``.
    * ``pos_num`` -- the alternating letter/digit groups that follow the
      first run of digits, concatenated, plus each edition marker
      ('升级版', '专业版', '教育版', '+') that occurs anywhere in the text.
    """
    head = re.search(r'([A-Za-z]{0,4})\W?\d+', text)
    pre_num = head.group(1) if head else ''

    digits = re.search(r'(\d+)', text)
    if digits:
        num = digits.group(1)
    else:
        # No digits: fall back to comparing the English words.
        num = ''.join(re.findall(r'[A-Za-z]+', text))

    tail = re.findall(
        r'\d+([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?',
        text,
    )
    # findall yields one tuple per match; only the first match is used and
    # groups that did not participate come back as ''.
    pos_num = ''.join(tail[0]) if tail else ''
    for marker in ('升级版', '专业版', '教育版', '+'):
        if marker in text:
            pos_num += marker

    return pre_num, num, pos_num


def pachong_washing(filepath):
    """Blank out crawled parameters whose crawled name does not match the model.

    Reads the workbook at ``filepath`` ('产品编码' is kept as text), upper-cases
    the '补充后型号' and '爬取名称' columns, and extracts comparison keys from
    both with ``_split_model_keywords``.  A row is treated as a mismatch when

    * both prefixes are non-empty and differ, or
    * the number keys differ, or
    * the suffix keys differ.

    For every mismatching row the columns at positions 15..20
    (``df.columns[15:21]``) are overwritten with '暂无数据'.  The result is
    written to './after_lijie.xlsx'.

    Compared with the previous revision: the key extraction exists once
    instead of twice, a model string without digits or letters no longer
    raises ``NameError`` (the code referenced an undefined ``alpha``), and an
    empty number key is compared as ``''`` instead of being left unset.

    :param filepath: path of the Excel workbook to clean.
    :return: None; the cleaned table is written to disk.
    """
    df = pd.read_excel(filepath, converters={'产品编码': str})
    for col in ('补充后型号', '爬取名称'):
        # astype(str) turns missing cells into the text 'nan' before upper().
        df[col] = df[col].astype(str).str.upper()

    target_cols = df.columns[15:21]
    # The sentinel is text; make the target columns object-typed first so that
    # writing it into a numeric column is well defined.
    for col in target_cols:
        df[col] = df[col].astype(object)

    stale_rows = []
    for label, crawled_name, model_name in zip(df.index, df['爬取名称'], df['补充后型号']):
        crawled_pre, crawled_num, crawled_pos = _split_model_keywords(crawled_name)
        model_pre, model_num, model_pos = _split_model_keywords(model_name)

        # Prefixes only count as a clash when both sides actually have one.
        prefix_clash = (
            model_pre != '' and crawled_pre != '' and model_pre != crawled_pre
        )
        if prefix_clash or model_num != crawled_num or model_pos != crawled_pos:
            stale_rows.append(label)

    if stale_rows and len(target_cols):
        df.loc[stale_rows, target_cols] = '暂无数据'

    df.to_excel('./after_lijie.xlsx')
if __name__ == '__main__':
    # Script entry point: clean the scanner crawl workbook stored at a fixed
    # local Windows path.
    filepath = "E:\\ZDZC\\扫描仪参数确认(爬虫).xlsx"
    pachong_washing(filepath=filepath)
\ No newline at end of file
公共代码/错误产品分析.py
0 → 100644
View file @
e6658a7a
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
@author: dell
@file: product_filter.py
@time: 2021/03/29
@desc:
"""
import
pandas
as
pd
import
re
import
numpy
as
np
import
xlsxwriter
def _product_model_keywords(model):
    """Return the set of non-empty keywords extracted from a product model.

    The keywords are: up to four ASCII letters directly before the first run
    of digits, the first run of digits (or, when there are no digits, every
    run of ASCII letters), the alternating letter/digit groups following the
    first run of digits, and each edition marker ('升级版', '专业版', '教育版',
    '+') contained in the model.  Markers are added as whole keywords; the
    previous ``list += str`` added their individual characters, which made
    different editions share the character '版'.
    """
    words = []

    head = re.search(r'([A-Za-z]{0,4})\W?\d+', model)
    if head:
        words.append(head.group(1))

    digits = re.search(r'(\d+)', model)
    if digits:
        # Only the first run of digits is used (known limitation).
        words.append(digits.group(1))
    else:
        # No digits: compare the English words instead.
        words.extend(re.findall(r'[A-Za-z]+', model))

    tail = re.findall(
        r'\d+([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?\W?([0-9]+)?\W?([A-Za-z]+)?',
        model,
    )
    if tail:
        words.extend(tail[0])

    words.extend(
        marker for marker in ('升级版', '专业版', '教育版', '+') if marker in model
    )
    # Groups that did not participate in the match come back as ''.
    return {word for word in words if word}


def product_washing(filepath, thre=1, a=0):
    """Find products of the same brand whose model keywords overlap.

    Reads the workbook at ``filepath`` ('产品编码' is kept as text), drops the
    'Unnamed: 0' column when present and converts every column to text.
    Products are grouped by '产品品牌'; the brand '无参数,需补充' and models
    equal to '无参数,需补充' are skipped.  Inside a brand every unordered pair
    of products is scored once:

        score_x = shared / (len(keywords_x) + a / len(keywords_x))

    where ``shared`` is the number of keywords the two products have in
    common.  The score is computed with a single division, because summing
    ``1 / n`` repeatedly can fall just short of 1.0 (six times 1/6 gives
    0.9999999999999999) and then fail the default ``thre=1``.

    A pair is reported when either score reaches ``thre`` and the two models
    agree on whether they end with '+'.  Reported pairs are written to
    './product_filter.xlsx': a bold header row ('Index', the column names,
    '正确产品编号'), then the two products of each pair on consecutive rows
    with one empty row between pairs.

    Products in one groupby group share the same brand text, so the former
    brand-keyword comparison was always true and has been removed.

    :param filepath: path of the Excel workbook to analyse.
    :param thre: minimum score for a pair to be reported.
    :param a: penalty term added to the denominator as ``a / keyword_count``.
    :return: None; the report is written to disk.
    """
    df = pd.read_excel(filepath, converters={'产品编码': str})
    df.drop(columns='Unnamed: 0', inplace=True, errors='ignore')
    df = df.astype(str)

    related_product = []
    for brand, group in df.groupby(by='产品品牌'):
        if brand == '无参数,需补充':
            continue

        models = {
            label: model
            for label, model in group['产品型号'].items()
            if model != '无参数,需补充'
        }
        keywords = {
            label: _product_model_keywords(model) for label, model in models.items()
        }

        labels = list(keywords)
        for pos, i in enumerate(labels):
            for j in labels[pos + 1:]:
                shared = len(keywords[i] & keywords[j])
                if shared:
                    size_i = len(keywords[i])
                    size_j = len(keywords[j])
                    accuracy_i = shared / (size_i + a / size_i)
                    accuracy_j = shared / (size_j + a / size_j)
                else:
                    # Nothing in common; also avoids dividing by an empty
                    # keyword set.
                    accuracy_i = accuracy_j = 0

                if accuracy_i < thre and accuracy_j < thre:
                    continue
                # A trailing '+' denotes a different product variant.
                if models[i].endswith('+') == models[j].endswith('+'):
                    related_product.append((i, j))

    # The context manager closes the workbook even if writing fails.
    with xlsxwriter.Workbook('./product_filter.xlsx') as workbook:
        bold_format = workbook.add_format({'bold': True})
        worksheet = workbook.add_worksheet()

        header = ['Index', *df.columns, '正确产品编号']
        for col, title in enumerate(header):
            worksheet.write_string(0, col, title, bold_format)

        row = 1
        for pair in related_product:
            for label in pair:
                worksheet.write_string(row, 0, str(label))
                for col, value in enumerate(df.loc[label].values, start=1):
                    worksheet.write_string(row, col, value)
                row += 1
            # Blank row between groups of duplicates.
            row += 1
if __name__ == '__main__':
    # Script entry point: look for duplicate products in the laser-printer
    # workbook stored at a fixed local Windows path, with default thresholds.
    filepath = "E:\\ZDZC\\激光打印机参数确认.xlsx"
    product_washing(filepath=filepath)
公共代码/错误品牌分析.py
0 → 100644
View file @
e6658a7a
This diff is collapsed.
Click to expand it.
公共代码/错误类别分析.py
0 → 100644
View file @
e6658a7a
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
@author: dell
@file: class_washing.py
@time: 2021/03/26
@desc:
"""
import
pandas
as
pd
import
re
import
numpy
as
np
# category为产品类型
# a是同一品牌或父品牌下产品数量占产品总数量的百分比,作为阈值,a越大,有异常的产品越多;b和a一样,只是用于产品数据类型和参数
# c_list是产品参数中,数据类型较为统一的参数 在excel列名中的位置,从0开始,必须是一个list
def _cw_valid_mask(column):
    """Return a boolean Series that is True where ``column`` holds real data.

    A cell counts as real data when it is not null and is neither of the
    placeholders '暂无数据' and '无参数,需补充'.
    """
    return (column != '暂无数据') & (column != '无参数,需补充') & column.notnull()


def _cw_rare_value_rows(column, ratio):
    """Return the index labels of ``column`` whose value is rare.

    A value is rare when it occurs in fewer than ``len(column) * ratio`` rows.
    """
    counts = column.value_counts()
    rare_values = counts[counts < len(column) * ratio].index
    return list(column.index[column.isin(rare_values)])


def _cw_format_signature(text):
    """Describe which character classes occur in ``text``.

    The result concatenates, in this order, 'str' (ASCII letters), 'dig'
    (digits), 'special' (non-word characters) and 'chinese' (characters in
    U+4E00..U+9FA5) for every class that is present.
    """
    tags = []
    if re.search(r'[A-Za-z]+', text):
        tags.append('str')
    if re.search(r'[0-9]+', text):
        tags.append('dig')
    if re.search(r'\W+', text):
        tags.append('special')
    if re.search(r'[\u4e00-\u9fa5]+', text):
        tags.append('chinese')
    return ''.join(tags)


def class_washing(category, filepath, c_list, a=0.02, b=0.01):
    """Flag suspicious rows of a product-category workbook.

    Reads the workbook at ``filepath`` ('产品编码' is kept as text), drops the
    'Unnamed: 0' column when present and runs the checks below.  Cells that
    are null or equal to '暂无数据' / '无参数,需补充' are ignored by every check
    except the category and name checks.  This validity is decided *before*
    the table is converted to text: after the conversion a missing cell is
    the string 'nan' and ``notnull()`` can no longer detect it, which made
    the previous revision treat missing cells as data.

    Checks (each produces one 0/1 column in the report):

    * '数据类型异常' -- in any column, the cell's Python type is shared by
      fewer than ``b`` of the valid cells of that column.
    * '产品类型异常' -- '产品类别' differs from ``category``.
    * '产品名称异常' -- '产品名称' contains neither ``category`` nor '高拍仪'.
    * '父品牌异常' / '品牌异常' -- the '产品父品牌' / '产品品牌' value occurs in
      fewer than ``a`` of the valid rows.
    * '数据长度异常' -- in a parameter column (``df.columns[7:-2]``), the text
      length lies more than two population standard deviations from the
      column mean.
    * '数据格式异常' -- in a parameter column, the character-class signature
      (letters / digits / special / Chinese) is shared by fewer than ``b``
      of the valid cells.
    * '<column>异常' for each column selected by ``c_list`` -- the value
      occurs in fewer than ``b`` of the valid rows.

    Every row flagged by at least one check is written to
    './class_filter.xlsx' together with '计数' (the number of raised flags)
    and the row's text-converted data, sorted by '计数' in descending order.

    :param category: expected product category, e.g. '激光打印机'.
    :param filepath: path of the Excel workbook to analyse.
    :param c_list: list of column positions (0-based, after 'Unnamed: 0' has
        been dropped) of parameters with few distinct values; must be a list.
    :param a: share threshold for the brand and parent-brand checks.
    :param b: share threshold for the type, format and ``c_list`` checks.
    :return: None; the report is written to disk.
    """
    df = pd.read_excel(filepath, converters={'产品编码': str})
    df.drop(columns='Unnamed: 0', inplace=True, errors='ignore')

    # Decide validity on the raw data, where missing cells are still null.
    valid = {col: _cw_valid_mask(df[col]) for col in df.columns}

    # Rows whose cell type is a small minority within its column.
    dtype_minor = []
    for col in df.columns:
        cells = df.loc[valid[col], col]
        kinds = [type(value) for value in cells]
        tally = {}
        for kind in kinds:
            tally[kind] = tally.get(kind, 0) + 1
        limit = len(kinds) * b
        dtype_minor.extend(
            label for label, kind in zip(cells.index, kinds) if tally[kind] < limit
        )

    # Every later check works on text.
    df = df.astype(str)
    param_cols = df.columns[7:-2]

    wrong_class = list(df.index[df['产品类别'] != category])

    # '高拍仪' in the name is accepted as an alternative to the category
    # name; this rule was written for the scanner category.
    not_in_name2 = [
        label
        for label, name in df['产品名称'].items()
        if category not in name and '高拍仪' not in name
    ]

    father_brand_minor = _cw_rare_value_rows(
        df.loc[valid['产品父品牌'], '产品父品牌'], a
    )
    brand_minor = _cw_rare_value_rows(df.loc[valid['产品品牌'], '产品品牌'], a)

    length_minor = []
    format_minor = []
    for col in param_cols:
        cells = df.loc[valid[col], col]
        if cells.empty:
            # Mean and standard deviation are undefined without data.
            continue

        lengths = cells.str.len().to_numpy()
        mean = lengths.mean()
        std = lengths.std()  # population standard deviation (ddof=0)
        outside = (lengths < mean - 2 * std) | (lengths > mean + 2 * std)
        length_minor.extend(cells.index[outside])

        format_minor.extend(
            _cw_rare_value_rows(cells.map(_cw_format_signature), b)
        )

    # Parameters with few distinct values: flag the rare values per column.
    character_minor_dict = {
        col: _cw_rare_value_rows(df.loc[valid[col], col], b)
        for col in df.columns[c_list]
    }

    flag_sources = [
        ('产品类型异常', wrong_class),
        ('产品名称异常', not_in_name2),
        ('父品牌异常', father_brand_minor),
        ('品牌异常', brand_minor),
        ('数据类型异常', dtype_minor),
        ('数据格式异常', format_minor),
        ('数据长度异常', length_minor),
    ]
    flag_sources.extend(
        (col + '异常', rows) for col, rows in character_minor_dict.items()
    )
    flag_sets = [(name, set(rows)) for name, rows in flag_sources]

    # Sorted so that the report order does not depend on set iteration.
    index_minor = sorted(set().union(*(rows for _, rows in flag_sets)))

    final_df = pd.DataFrame(
        {
            name: [float(label in rows) for label in index_minor]
            for name, rows in flag_sets
        },
        index=index_minor,
    )
    final_df.insert(0, '计数', final_df.sum(axis=1))

    final_df = pd.merge(final_df, df, how='left', left_index=True, right_index=True)
    # A stable sort keeps rows with equal counts in index order.
    final_df = final_df.sort_values(by='计数', ascending=False, kind='mergesort')
    final_df.to_excel("./class_filter.xlsx")
if __name__ == '__main__':
    # Script entry point: run the category checks on the laser-printer
    # workbook stored at a fixed local Windows path, with default thresholds.
    category = '激光打印机'
    filepath = "E:\\ZDZC\\激光打印机参数确认.xlsx"
    # Column positions of the parameters that have few distinct values.
    c_list = [6, 7, -4, -3]
    class_washing(category=category, filepath=filepath, c_list=c_list)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment