Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
重
重点类信息提取
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ZGC_INDEX
重点类信息提取
Commits
c040d93d
Commit
c040d93d
authored
Apr 14, 2021
by
Jialin
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
代码最终修改
parent
d54af336
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
258 additions
and
127 deletions
+258
-127
产品品牌分析.py
公共代码/产品品牌分析.py
+56
-25
产品类别分析.py
公共代码/产品类别分析.py
+176
-86
产品重复型号分析.py
公共代码/产品重复型号分析.py
+26
-16
异常数据表格.xlsx
公共代码/异常数据表格.xlsx
+0
-0
激光打印机参数确认.xlsx
公共代码/激光打印机参数确认.xlsx
+0
-0
No files found.
公共代码/产品品牌分析.py
View file @
c040d93d
...
...
@@ -5,6 +5,7 @@
import
pandas
as
pd
import
re
import
xlsxwriter
import
numpy
as
np
def
brand_washing
(
filepath
,
thre
=
0.5
,
inner_thre
=
0.5
,
a
=
1
,
sheet_name
=
0
):
...
...
@@ -15,7 +16,7 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
# 处理缺失值
valid_index
=
[]
for
i
in
df
.
index
:
if
df
.
loc
[
i
,
'产品品牌'
]
not
in
invalid_list
and
df
.
loc
[
i
,
'产品型号'
]
not
in
invalid_list
:
if
df
.
loc
[
i
,
'产品品牌'
]
not
in
invalid_list
and
df
.
loc
[
i
,
'
*
产品型号'
]
not
in
invalid_list
:
valid_index
.
append
(
i
)
result
=
df
.
loc
[
valid_index
]
# 将df数据格式转为字符串
...
...
@@ -27,7 +28,7 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
# 将df按品牌分类,按品牌提取每个型号的关键字,放入字典
brand_type
=
result
.
groupby
(
'产品品牌'
)
for
brand
in
brand_type
:
result_unique
=
brand
[
1
][
'产品型号'
]
.
unique
()
# result_unique此时是array,元素是一个品牌名下的型号
result_unique
=
brand
[
1
][
'
*
产品型号'
]
.
unique
()
# result_unique此时是array,元素是一个品牌名下的型号
for
j
in
range
(
len
(
result_unique
)):
result_unique
[
j
]
=
result_unique
[
j
]
.
upper
()
.
strip
()
result_unique
=
pd
.
DataFrame
(
result_unique
)[
0
]
.
unique
()
# 全部变为大写,将大小写归一,result_unique此时是array
...
...
@@ -132,7 +133,7 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
# 第三种方法,对比品牌名和另一个品牌型号关键字,如果有一个重合,就算作两个品牌相似
type_kw
=
{}
# 用于接收品牌型号提取的关键字
for
i
in
brand_type
:
result_unique
=
i
[
1
][
'产品型号'
]
.
unique
()
# 品牌型号组成的数组,数组内无重复元素
result_unique
=
i
[
1
][
'
*
产品型号'
]
.
unique
()
# 品牌型号组成的数组,数组内无重复元素
for
k
in
range
(
len
(
result_unique
)):
result_unique
[
k
]
=
result_unique
[
k
]
.
upper
()
.
strip
()
result_unique
=
pd
.
DataFrame
(
result_unique
)[
0
]
.
unique
()
...
...
@@ -156,6 +157,36 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
related_brand3
.
append
(
tempo_list
)
#写入excel
tempo_list
=
[]
# 将所有的相似品牌,两两一对,写入集合,放入tempo_list
method1
=
[]
method2
=
[]
method3
=
[]
related_brand_list
=
[
related_brand1
,
related_brand2
,
related_brand3
]
method_list
=
[
method1
,
method2
,
method3
]
for
i
in
range
(
len
(
related_brand_list
)):
for
list_i
in
related_brand_list
[
i
]:
tempo_list
.
append
(
set
(
list_i
[:
2
]))
method_list
[
i
]
.
append
(
set
(
list_i
[:
2
]))
final_list
=
[]
# final_list就是tempo_list的去重
for
item
in
tempo_list
:
if
item
not
in
final_list
:
final_list
.
append
(
item
)
method
=
[]
for
item
in
final_list
:
linshi_list
=
[]
if
item
in
method1
:
linshi_list
.
append
(
'1'
)
if
item
in
method2
:
linshi_list
.
append
(
'2'
)
if
item
in
method3
:
linshi_list
.
append
(
'3'
)
method
.
append
(
','
.
join
(
linshi_list
))
workbook
=
xlsxwriter
.
Workbook
(
'./brand_filter.xlsx'
)
bold_format
=
workbook
.
add_format
({
'bold'
:
True
})
...
...
@@ -167,32 +198,32 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
worksheet
.
write
(
'D1'
,
'方法'
,
bold_format
)
col
=
0
row
=
1
for
list_i
in
related_brand1
:
for
brand_i
in
range
(
2
)
:
worksheet
.
write_string
(
row
,
col
,
list_i
[
brand_i
]
)
for
index
in
range
(
len
(
final_list
))
:
for
brand_i
in
final_list
[
index
]
:
worksheet
.
write_string
(
row
,
col
,
brand_i
)
col
+=
1
col
=
3
worksheet
.
write_string
(
row
,
col
,
'1'
)
worksheet
.
write_string
(
row
,
col
,
method
[
index
]
)
row
+=
1
col
=
0
for
list_i
in
related_brand2
:
for
brand_i
in
range
(
2
):
worksheet
.
write_string
(
row
,
col
,
list_i
[
brand_i
])
col
+=
1
col
=
3
worksheet
.
write_string
(
row
,
col
,
'2'
)
row
+=
1
col
=
0
for
list_i
in
related_brand3
:
for
brand_i
in
range
(
2
):
worksheet
.
write_string
(
row
,
col
,
list_i
[
brand_i
])
col
+=
1
col
=
3
worksheet
.
write_string
(
row
,
col
,
'3'
)
row
+=
1
col
=
0
#
for list_i in related_brand2:
#
for brand_i in range(2):
#
worksheet.write_string(row, col, list_i[brand_i])
#
col += 1
#
col = 3
#
worksheet.write_string(row, col, '2')
#
row += 1
#
col = 0
#
#
for list_i in related_brand3:
#
for brand_i in range(2):
#
worksheet.write_string(row, col, list_i[brand_i])
#
col += 1
#
col = 3
#
worksheet.write_string(row, col, '3')
#
row += 1
#
col = 0
# 第二个worksheet
worksheet2
=
workbook
.
add_worksheet
(
name
=
'Sheet2'
)
...
...
@@ -239,6 +270,6 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
workbook
.
close
()
if
__name__
==
'__main__'
:
filepath
=
'E:
\\
ZDZC
\\
激光打印机
参数确认.xlsx'
filepath
=
'E:
\\
ZDZC
\\
扫描仪
参数确认.xlsx'
brand_washing
(
filepath
)
公共代码/产品类别分析.py
View file @
c040d93d
...
...
@@ -9,20 +9,25 @@
import
pandas
as
pd
import
re
import
numpy
as
np
import
pymssql
import
time
# category为产品类型
# a是同一品牌或父品牌下产品数量占产品总数量的百分比,作为阈值,a越大,有异常的产品越多;b和a一样,只是用于产品数据类型和参数
# c_list是产品参数中,数据类型较为统一的参数 在excel列名中的位置,从0开始,必须是一个list
def
class_washing
(
category
,
filepath
,
c_list
,
a
=
0.02
,
b
=
0.01
):
def
class_washing
(
category
,
filepath
,
b
=
0.01
):
df_null
=
pd
.
read_excel
(
".
\\
异常数据表格.xlsx"
)
invalid_list
=
df_null
[
'异常数据名称'
]
.
values
df
=
pd
.
read_excel
(
filepath
,
converters
=
{
'产品编码'
:
str
})
df
.
drop
(
columns
=
'Unnamed: 0'
,
axis
=
1
,
inplace
=
True
)
# 检测每列数据中 同一数据类型的产品数量少于产品总数量的b 的产品index.(可优化成字典形式存储读取)
t1
=
time
.
time
()
print
(
'开始检测数据类型'
)
dtype_minor_dict
=
{}
for
col
in
df
.
column
s
:
for
col
in
df
_null
[
'数据类型异常'
][
df_null
[
'类别'
]
==
category
][
df_null
[
'数据类型异常'
]
.
notnull
()]
.
value
s
:
type_list
=
{}
valid_index
=
[]
for
i
in
df
.
index
:
...
...
@@ -51,6 +56,9 @@ def class_washing(category, filepath, c_list,a=0.02, b=0.01):
df
[
col
]
=
df
[
col
]
.
astype
(
str
)
# 检测产品类型错误的产品,和产品名称中不带有产品类型的产品。由于代码简单,就放在一起了
t2
=
time
.
time
()
print
(
t2
-
t1
)
print
(
'开始检测错误类别和错误名称'
)
wrong_class
=
[]
not_in_name
=
[]
for
i
in
df
.
index
:
...
...
@@ -59,49 +67,93 @@ def class_washing(category, filepath, c_list,a=0.02, b=0.01):
if
category
not
in
df
.
loc
[
i
,
'产品名称'
]:
not_in_name
.
append
(
i
)
# 检测产品父品牌中品牌出现次数小于产品总数的a的产品
father_brand_minor
=
[]
father_brand_list
=
[]
col
=
'产品父品牌'
valid_index
=
[]
for
i
in
df
.
index
:
if
df
.
loc
[
i
,
col
]
in
invalid_list
:
continue
valid_index
.
append
(
i
)
valid_df
=
df
.
loc
[
valid_index
]
father_brand_num
=
valid_df
.
groupby
(
by
=
'产品父品牌'
)[
'产品编码'
]
.
count
()
# 之所以用产品编码来计数是因为产品编码肯定不会有缺失值
father_num_list
=
[
x
for
x
in
father_brand_num
.
unique
()
if
x
<
len
(
valid_df
.
index
)
*
a
]
for
i
in
father_brand_num
.
index
:
# i 就是产品父品牌
if
father_brand_num
.
loc
[
i
]
in
father_num_list
:
# father_brand_num.loc[i] 就是该父品牌出现次数
father_brand_list
.
append
(
i
)
for
i
in
valid_df
.
index
:
if
valid_df
.
loc
[
i
,
'产品父品牌'
]
in
father_brand_list
:
father_brand_minor
.
append
(
i
)
# 检测产品品牌中品牌出现次数少的产品
brand_minor
=
[]
brand_list
=
[]
col
=
'产品品牌'
valid_index
=
[]
# 检测品牌中是否有不在category下对应的brand_id的产品品牌
t3
=
time
.
time
()
print
(
t3
-
t2
)
print
(
'开始检测错误品牌'
)
conn_zi_new
=
pymssql
.
connect
(
host
=
'123.56.115.207'
,
user
=
'zgcprice3311'
,
password
=
'zgcprice20200628'
,
database
=
'ZI_NEW'
,
autocommit
=
True
)
cursor_zi_new
=
conn_zi_new
.
cursor
()
cursor_zi_new
.
execute
(
f
"select id from p_category where name='{category}'"
)
category_id
=
cursor_zi_new
.
fetchone
()
if
not
category_id
:
print
(
'输入类别不在数据库中,请查证'
)
return
category_id
=
category_id
[
0
]
cursor_zi_new
.
execute
(
f
"select brandid from p_spu where categoryid={category_id}"
)
brand_id_fetchall
=
cursor_zi_new
.
fetchall
()
brand_id_list
=
[]
for
brand_tuple
in
brand_id_fetchall
:
brand_id_list
.
append
(
brand_tuple
[
0
])
brand_name_list
=
[]
for
brand_id
in
brand_id_list
:
cursor_zi_new
.
execute
(
f
"select name from p_brand where id={brand_id}"
)
brand_name_fetch
=
cursor_zi_new
.
fetchone
()
if
brand_name_fetch
:
brand_name_list
.
append
(
brand_name_fetch
[
0
]
.
strip
(
"'"
))
wrong_brand
=
[]
for
i
in
df
.
index
:
if
df
.
loc
[
i
,
col
]
in
invalid_list
:
continue
valid_index
.
append
(
i
)
valid_df
=
df
.
loc
[
valid_index
]
brand_num
=
valid_df
.
groupby
(
by
=
'产品品牌'
)[
'产品编码'
]
.
count
()
# 同上
num_list
=
[
x
for
x
in
brand_num
.
unique
()
if
x
<
len
(
valid_df
.
index
)
*
a
]
for
i
in
brand_num
.
index
:
if
brand_num
.
loc
[
i
]
in
num_list
:
brand_list
.
append
(
i
)
for
i
in
valid_df
.
index
:
if
valid_df
.
loc
[
i
,
'产品品牌'
]
in
brand_list
:
brand_minor
.
append
(
i
)
if
df
.
loc
[
i
][
'产品品牌'
]
not
in
brand_name_list
:
wrong_brand
.
append
(
i
)
# cursor_zi_new.execute(f"select id from p_brand where name='{df.loc[i]['产品品牌']}'")
# brand_id=cursor_zi_new.fetchone()
# if not brand_id:
# wrong_brand.append(i)
# continue
# brand_id=brand_id[0]
# if brand_id not in brand_id_list:
# wrong_brand.append(i)
# # 检测产品父品牌中品牌出现次数小于产品总数的a的产品
# father_brand_minor = []
# father_brand_list = []
# col='产品父品牌'
# valid_index=[]
# for i in df.index:
# if df.loc[i, col] in invalid_list:
# continue
# valid_index.append(i)
# valid_df=df.loc[valid_index]
# father_brand_num = valid_df.groupby(by='产品父品牌')['产品编码'].count() # 之所以用产品编码来计数是因为产品编码肯定不会有缺失值
# father_num_list = [x for x in father_brand_num.unique() if x < len(valid_df.index)*a]
# for i in father_brand_num.index: # i 就是产品父品牌
# if father_brand_num.loc[i] in father_num_list: # father_brand_num.loc[i] 就是该父品牌出现次数
# father_brand_list.append(i)
#
# for i in valid_df.index:
# if valid_df.loc[i, '产品父品牌'] in father_brand_list:
# father_brand_minor.append(i)
#
# # 检测产品品牌中品牌出现次数少的产品
# brand_minor = []
# brand_list = []
# col = '产品品牌'
# valid_index = []
# for i in df.index:
# if df.loc[i, col] in invalid_list:
# continue
# valid_index.append(i)
# valid_df=df.loc[valid_index]
# brand_num = valid_df.groupby(by='产品品牌')['产品编码'].count() # 同上
# num_list = [x for x in brand_num.unique() if x < len(valid_df.index)*a]
# for i in brand_num.index:
# if brand_num.loc[i] in num_list:
# brand_list.append(i)
#
# for i in valid_df.index:
# if valid_df.loc[i, '产品品牌'] in brand_list:
# brand_minor.append(i)
# 检测产品参数列中数据长度离该列平均数据长度2*std之外的产品,7是第一个产品参数列,-2是质保时间,-1是产品型号
t4
=
time
.
time
()
print
(
t4
-
t3
)
print
(
'开始检测错误长度'
)
length_minor_dict
=
{}
for
col
in
df
.
columns
[
7
:
-
2
]
:
for
col
in
df
_null
[
'数据长度异常'
][
df_null
[
'类别'
]
==
category
][
df_null
[
'数据长度异常'
]
.
notnull
()]
.
values
:
col_length
=
[]
valid_index
=
[]
for
i
in
df
.
index
:
...
...
@@ -122,9 +174,13 @@ def class_washing(category, filepath, c_list,a=0.02, b=0.01):
length_minor_dict
[
index
]
=
col
length_minor
=
[]
length_minor
.
extend
(
length_minor_dict
.
keys
())
# 检测产品参数列数据格式小于总数量的b的产品
t5
=
time
.
time
()
print
(
t5
-
t4
)
print
(
'开始检测错误数据格式'
)
format_minor_dict
=
{}
for
col
in
df
.
columns
[
7
:
-
2
]
:
for
col
in
df
_null
[
'数据格式异常'
][
df_null
[
'类别'
]
==
category
][
df_null
[
'数据格式异常'
]
.
notnull
()]
.
values
:
counter_dict
=
{}
valid_index
=
[]
for
i
in
df
.
index
:
...
...
@@ -160,9 +216,8 @@ def class_washing(category, filepath, c_list,a=0.02, b=0.01):
elif
keys_index
not
in
format_minor_dict
.
keys
():
format_minor_dict
[
keys_index
]
=
col
format_minor
=
[]
format_minor
.
extend
(
format_minor_dict
.
keys
())
format_minor
=
[]
format_minor
.
extend
(
format_minor_dict
.
keys
())
# length_record = []
# for keys in counter_dict:
# if not length_record:
...
...
@@ -172,79 +227,114 @@ def class_washing(category, filepath, c_list,a=0.02, b=0.01):
#
# format_minor += length_record[0][1]
# 接下来是针对扫描仪的部分.对于特定产品,还可以从每个产品参数中选出少数派。如果知道易混淆的产品类型,还要特意加上挑取易混淆产品类型的代码
# 对于产品名称中没有“扫描仪”的,如果没有“高拍仪”就挑出来
not_in_name2
=
[]
for
i
in
not_in_name
:
if
'高拍仪'
not
in
df
.
loc
[
i
,
'产品名称'
]:
not_in_name2
.
append
(
i
)
for
special_name
in
df_null
[
'产品名称异常'
][
df_null
[
'类别'
]
==
category
][
df_null
[
'产品名称异常'
]
.
notnull
()]
.
values
:
if
special_name
in
df
.
loc
[
i
,
'产品名称'
]:
break
not_in_name2
.
append
(
i
)
# 对于产品参数中,数据类型较少的参数,其中如果有数量小于产品总数量的b的,挑出来
# 对于标准产品参数中,如果有数据不在标准字典中的,挑出来
t6
=
time
.
time
()
print
(
t6
-
t5
)
print
(
'开始检测标准参数'
)
character_minor_dict
=
{}
for
col_i
in
df
.
columns
[
c_list
]:
tempo_list
=
[]
tempo_list2
=
[]
valid_index
=
[]
for
col_i
in
df_null
[
'标准参数异常'
][
df_null
[
'类别'
]
==
category
][
df_null
[
'标准参数异常'
]
.
notnull
()]
.
values
:
temp_list
=
[]
cursor_zi_new
.
execute
(
f
"select stdvalue from ShuJuZiDian_Cfg where categoryname='{category}' and subtitle='{col_i.strip('*')}'"
)
standard_value_fetchall
=
cursor_zi_new
.
fetchall
()
if
not
standard_value_fetchall
:
print
(
f
"{col_i.strip('*')} 不在 ShuJuZiDian_Cfg,请检查。该参数项在此次运行中未被采用"
)
continue
standard_value
=
[]
for
std_tuple
in
standard_value_fetchall
:
standard_value
.
append
(
std_tuple
[
0
])
for
i
in
df
.
index
:
if
df
.
loc
[
i
,
col_i
]
in
invalid_list
:
value_col
=
df
.
loc
[
i
,
col_i
]
if
value_col
in
invalid_list
:
continue
valid_index
.
append
(
i
)
valid_df
=
df
.
loc
[
valid_index
]
cha_num
=
valid_df
.
groupby
(
by
=
col_i
)[
'产品编码'
]
.
count
()
num_list
=
[
x
for
x
in
cha_num
.
unique
()
if
x
<
len
(
valid_df
.
index
)
*
b
]
for
i
in
cha_num
.
index
:
if
cha_num
.
loc
[
i
]
in
num_list
:
tempo_list
.
append
(
i
)
for
i
in
valid_df
.
index
:
if
valid_df
.
loc
[
i
,
col_i
]
in
tempo_list
:
tempo_list2
.
append
(
i
)
character_minor_dict
[
col_i
]
=
tempo_list2
if
value_col
not
in
standard_value
:
temp_list
.
append
(
i
)
character_minor_dict
[
col_i
]
=
temp_list
character_minor
=
[]
for
keys_i
in
character_minor_dict
:
character_minor
.
extend
(
character_minor_dict
[
keys_i
])
# 将挑出的可疑数据整合到一起 (wrong_class没加,因为里面的肯定不对)
# cha_num = valid_df.groupby(by=col_i)['产品编码'].count()
# num_list = [x for x in cha_num.unique() if x < len(valid_df.index)*b]
# for i in cha_num.index:
# if cha_num.loc[i] in num_list:
# tempo_list.append(i)
# for i in valid_df.index:
# if valid_df.loc[i, col_i] in tempo_list:
# tempo_list2.append(i)
# character_minor_dict[col_i] = tempo_list2
#
# character_minor = []
# for keys_i in character_minor_dict:
# character_minor.extend(character_minor_dict[keys_i])
t7
=
time
.
time
()
print
(
t7
-
t6
)
print
(
'开始整合数据'
)
# 将挑出的可疑数据整合到一起
index_minor
=
[]
index_minor
.
extend
(
wrong_class
)
index_minor
.
extend
(
format_minor
)
index_minor
.
extend
(
length_minor
)
index_minor
.
extend
(
brand_minor
)
index_minor
.
extend
(
father_brand_minor
)
index_minor
.
extend
(
wrong_brand
)
#
index_minor.extend(father_brand_minor)
index_minor
.
extend
(
not_in_name2
)
index_minor
.
extend
(
dtype_minor
)
index_minor
.
extend
(
character_minor
)
index_minor
=
set
(
index_minor
)
final_df
=
pd
.
DataFrame
(
np
.
zeros
((
len
(
index_minor
),
8
)),
index
=
list
(
index_minor
),
columns
=
[
'计数'
,
'产品类型异常'
,
'产品名称异常'
,
'父品牌异常'
,
'品牌异常'
,
'数据类型异常'
,
'数据格式异常'
,
'数据长度异常'
])
final_df
=
pd
.
DataFrame
(
np
.
zeros
((
len
(
index_minor
),
7
)),
index
=
list
(
index_minor
),
columns
=
[
'计数'
,
'产品类别异常'
,
'产品名称异常'
,
'品牌异常'
,
'数据类型异常'
,
'数据格式异常'
,
'数据长度异常'
])
w_class
=
df_null
[
'产品类别异常权重'
][
df_null
[
'类别'
]
==
category
][
df_null
[
'产品类别异常权重'
]
.
notnull
()]
.
values
w_format
=
df_null
[
'数据格式异常权重'
][
df_null
[
'类别'
]
==
category
][
df_null
[
'数据格式异常权重'
]
.
notnull
()]
.
values
w_length
=
df_null
[
'数据长度异常权重'
][
df_null
[
'类别'
]
==
category
][
df_null
[
'数据长度异常权重'
]
.
notnull
()]
.
values
w_brand
=
df_null
[
'品牌异常权重'
][
df_null
[
'类别'
]
==
category
][
df_null
[
'品牌异常权重'
]
.
notnull
()]
.
values
w_name
=
df_null
[
'产品名称异常权重'
][
df_null
[
'类别'
]
==
category
][
df_null
[
'产品名称异常权重'
]
.
notnull
()]
.
values
w_dtype
=
df_null
[
'数据类型异常权重'
][
df_null
[
'类别'
]
==
category
][
df_null
[
'数据类型异常权重'
]
.
notnull
()]
.
values
w_stdparam
=
df_null
[
'标准参数异常权重'
][
df_null
[
'类别'
]
==
category
][
df_null
[
'标准参数异常权重'
]
.
notnull
()]
.
values
for
i
in
index_minor
:
count
=
0
if
i
in
wrong_class
:
count
+=
1
final_df
.
loc
[
i
,
'产品类
型
异常'
]
=
1
count
+=
w_class
final_df
.
loc
[
i
,
'产品类
别
异常'
]
=
1
if
i
in
format_minor
:
count
+=
len
(
format_minor_dict
[
i
]
.
split
())
# 如果该行数据有多列数据格式异常,就要加多次,
count
+=
len
(
format_minor_dict
[
i
]
.
split
())
*
w_format
# 如果该行数据有多列数据格式异常,就要加多次,
final_df
.
loc
[
i
,
'数据格式异常'
]
=
format_minor_dict
[
i
]
# 但其中数据是空格分割的字符串,所以用split
if
i
in
length_minor
:
count
+=
len
(
length_minor_dict
[
i
]
.
split
())
count
+=
len
(
length_minor_dict
[
i
]
.
split
())
*
w_length
final_df
.
loc
[
i
,
'数据长度异常'
]
=
length_minor_dict
[
i
]
if
i
in
brand_minor
:
count
+=
1
if
i
in
wrong_brand
:
count
+=
w_brand
final_df
.
loc
[
i
,
'品牌异常'
]
=
1
if
i
in
father_brand_minor
:
count
+=
1
final_df
.
loc
[
i
,
'父品牌异常'
]
=
1
#
if i in father_brand_minor:
#
count += 1
#
final_df.loc[i, '父品牌异常'] = 1
if
i
in
not_in_name2
:
count
+=
1
count
+=
w_name
final_df
.
loc
[
i
,
'产品名称异常'
]
=
1
if
i
in
dtype_minor
:
count
+=
len
(
dtype_minor_dict
[
i
]
.
split
())
count
+=
len
(
dtype_minor_dict
[
i
]
.
split
())
*
w_dtype
final_df
.
loc
[
i
,
'数据类型异常'
]
=
dtype_minor_dict
[
i
]
for
keys_i
in
character_minor_dict
:
if
i
in
character_minor_dict
[
keys_i
]:
final_df
.
loc
[
i
,
keys_i
+
'异常'
]
=
1
count
+=
1
count
+=
w_stdparam
else
:
final_df
.
loc
[
i
,
keys_i
+
'异常'
]
=
0
final_df
.
loc
[
i
,
'计数'
]
=
count
...
...
@@ -255,10 +345,10 @@ def class_washing(category, filepath, c_list,a=0.02, b=0.01):
if
__name__
==
'__main__'
:
category
=
'
激光打印机
'
filepath
=
"E:
\\
ZDZC
\\
激光打印机
参数确认.xlsx"
c_list
=
[
6
,
7
,
-
4
,
-
3
]
category
=
'
扫描仪
'
filepath
=
"E:
\\
ZDZC
\\
扫描仪
参数确认.xlsx"
#
c_list=[6,7,-4,-3]
# category = '扫描仪'
# filepath="E:\\ZDZC\\扫描仪参数确认.xlsx"
# c_list=[7,8,9]
class_washing
(
category
,
filepath
,
c_list
)
class_washing
(
category
,
filepath
)
公共代码/产品重复型号分析.py
View file @
c040d93d
...
...
@@ -12,7 +12,7 @@ import numpy as np
import
xlsxwriter
def
product_washing
(
filepath
,
thre
=
1
,
a
=
0
):
def
product_washing
(
filepath
,
category
,
thre
=
1
,
a
=
0
):
df_null
=
pd
.
read_excel
(
".
\\
异常数据表格.xlsx"
)
invalid_list
=
df_null
[
'异常数据名称'
]
.
values
df
=
pd
.
read_excel
(
filepath
,
converters
=
{
'产品编码'
:
str
})
...
...
@@ -28,7 +28,7 @@ def product_washing(filepath, thre=1, a=0):
tempo_dict
=
{}
# 每个品牌提取产品型号关键字,放入tempo_dict
for
i
in
brand
[
1
]
.
index
:
k
=
brand
[
1
]
.
loc
[
i
,
'产品型号'
]
k
=
brand
[
1
]
.
loc
[
i
,
'
*
产品型号'
]
if
k
in
invalid_list
:
continue
pre_num
=
re
.
search
(
r'([A-Za-z]{0,4})\W?\d+'
,
k
)
# pre_num为数字前的关键字
...
...
@@ -85,25 +85,34 @@ def product_washing(filepath, thre=1, a=0):
# temp_list2 = re.findall(r'[A-Za-z]+\W?[A-Za-z]+', brand[0].upper()) # 提取英文单词
# brand_combined = temp_list1+temp_list2
tempo_dict
[
i
]
=
[
set
(
combined
)]
other_parameters
=
df_null
[
'重复参数项'
][
df_null
[
'类别'
]
==
category
][
df_null
[
'重复参数项'
]
.
notnull
()]
.
values
other_parameters_values
=
[]
for
parameter
in
other_parameters
:
other_parameters_values
.
append
(
brand
[
1
]
.
loc
[
i
,
parameter
])
tempo_dict
[
i
]
.
extend
(
other_parameters_values
)
tempo_dict
[
i
]
=
[
set
(
combined
),
brand
[
1
]
.
loc
[
i
,
'*质保时间'
],
brand
[
1
]
.
loc
[
i
,
'标配外服务及配件'
]]
# 对比产品
型号关键字
,相同则放入related_product
# 对比产品
参数项
,相同则放入related_product
tested_product
=
[]
for
i
in
tempo_dict
:
for
j
in
tempo_dict
:
if
i
!=
j
and
set
([
i
,
j
])
not
in
tested_product
:
if
tempo_dict
[
i
][
1
:]
==
tempo_dict
[
j
][
1
:]:
accuracy_i
=
0
accuracy_j
=
0
for
word_i
in
tempo_dict
[
i
][
0
]:
if
word_i
in
tempo_dict
[
j
][
0
]:
accuracy_i
+=
1
/
(
len
(
tempo_dict
[
i
][
0
])
+
a
/
len
(
tempo_dict
[
i
][
0
]))
accuracy_j
+=
1
/
(
len
(
tempo_dict
[
j
][
0
])
+
a
/
len
(
tempo_dict
[
j
][
0
]))
if
accuracy_i
>=
thre
or
accuracy_j
>=
thre
:
if
not
(
df
.
loc
[
i
,
'产品型号'
]
.
endswith
(
'+'
)
^
df
.
loc
[
j
,
'产品型号'
]
.
endswith
(
'+'
)):
related_product
.
append
(
set
([
i
,
j
]))
for
index
in
range
(
1
,
len
(
tempo_dict
[
i
])):
if
tempo_dict
[
i
][
index
]
!=
tempo_dict
[
j
][
index
]
and
\
(
tempo_dict
[
i
][
index
]
not
in
invalid_list
and
tempo_dict
[
j
][
index
]
not
in
invalid_list
):
tested_product
.
append
(
set
([
i
,
j
]))
break
# 如果出现了必须相等但不相等的参数,则退出到上一层循环。如果没有出现,则对比型号
accuracy_i
=
0
accuracy_j
=
0
for
word_i
in
tempo_dict
[
i
][
0
]:
if
word_i
in
tempo_dict
[
j
][
0
]:
accuracy_i
+=
1
/
(
len
(
tempo_dict
[
i
][
0
])
+
a
/
len
(
tempo_dict
[
i
][
0
]))
accuracy_j
+=
1
/
(
len
(
tempo_dict
[
j
][
0
])
+
a
/
len
(
tempo_dict
[
j
][
0
]))
if
accuracy_i
>=
thre
or
accuracy_j
>=
thre
:
if
not
(
df
.
loc
[
i
,
'*产品型号'
]
.
endswith
(
'+'
)
^
df
.
loc
[
j
,
'*产品型号'
]
.
endswith
(
'+'
)):
related_product
.
append
(
set
([
i
,
j
]))
tested_product
.
append
(
set
([
i
,
j
]))
# a = set([i])
...
...
@@ -152,5 +161,6 @@ def product_washing(filepath, thre=1, a=0):
if
__name__
==
'__main__'
:
filepath
=
"E:
\\
ZDZC
\\
激光打印机参数确认.xlsx"
product_washing
(
filepath
)
filepath
=
"E:
\\
ZDZC
\\
扫描仪参数确认.xlsx"
category
=
'扫描仪'
product_washing
(
filepath
,
category
)
公共代码/异常数据表格.xlsx
0 → 100644
View file @
c040d93d
File added
公共代码/激光打印机参数确认.xlsx
0 → 100644
View file @
c040d93d
File added
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment