ZGC_INDEX / 重点类信息提取 · Commits

Commit 62dacfbd, authored May 08, 2021 by Jialin
Commit message: 代码更新 (code update)
Parent: e9bce7fd

Showing 1 changed file with 132 additions and 1 deletion:
公共代码/产品品牌分析.py (+132, -1)
@@ -8,10 +8,141 @@ import xlsxwriter
import numpy as np
import pymssql


def brand_washing_special(filepath, sheet_name, category):
    # Load the sheet, keeping product codes ('产品编码') as strings.
    df = pd.read_excel(filepath, sheet_name=sheet_name, converters={'产品编码': str})
    conn_zi_new = pymssql.connect(host='123.56.115.207', user='zgcprice3311',
                                  password='zgcprice20200628', database='ZI_NEW',
                                  autocommit=True)
    cursor_zi_new = conn_zi_new.cursor()
    # Fetch the generic ('通用') blacklist of invalid data names ('异常数据名称').
    cursor_zi_new.execute(f"select col_value from data_washing_external where category_name='通用' and col_name='异常数据名称'")
    invalid_list_fetch = cursor_zi_new.fetchall()
    invalid_list = []
    for invalid_tuple in invalid_list_fetch:
        invalid_list.append(invalid_tuple[0])
    # Handle missing values: drop rows whose brand or series is blacklisted.
    valid_index = []
    if category == '复印纸':
        for i in df.index:
            if df.loc[i, '产品品牌'] not in invalid_list and df.loc[i, '产品系列(SPU)'] not in invalid_list:
                valid_index.append(i)
        result = df.loc[valid_index, ['产品品牌', '产品系列(SPU)']]
        # Convert the dataframe values to strings.
        result = result.apply(lambda x: x.astype(str))
        # def xilie_clean(x):
        #     x = x.strip().replace('系列', '').upper()
        #     ch = re.findall(r'([\u4e00-\u9fa5]+)', x)
        #     en = re.findall(r'[0-9a-zA-Z]+', x)
        #     if ch and en:
        #         return
        # Normalize series names: strip, drop the literal '系列', uppercase.
        result['产品系列(SPU)'] = result['产品系列(SPU)'].apply(lambda x: x.strip().replace('系列', '').upper())
        # Collect each brand's unique series list.
        result_groupby = dict(list(result.groupby('产品品牌')))
        brand_dict = {}
        for brand in result_groupby.keys():
            xilie_list = result_groupby[brand]['产品系列(SPU)'].unique().tolist()
            # Drop the generic series name '彩色复印纸' if present.
            try:
                xilie_list.remove('彩色复印纸')
            except ValueError:
                pass
            brand_dict[brand] = xilie_list
    else:
        for i in df.index:
            if df.loc[i, '产品品牌'] not in invalid_list and df.loc[i, '*产品系列'] not in invalid_list:
                valid_index.append(i)
        result = df.loc[valid_index, ['产品品牌', '*产品系列']]
        # Convert the dataframe values to strings.
        result = result.apply(lambda x: x.astype(str))
        # def xilie_clean(x):
        #     x = x.strip().replace('系列', '').upper()
        #     ch = re.findall(r'([\u4e00-\u9fa5]+)', x)
        #     en = re.findall(r'[0-9a-zA-Z]+', x)
        #     if ch and en:
        #         return
        result['*产品系列'] = result['*产品系列'].apply(lambda x: x.strip().replace('系列', '').upper())
        result_groupby = dict(list(result.groupby('产品品牌')))
        brand_dict = {}
        for brand in result_groupby.keys():
            xilie_list = result_groupby[brand]['*产品系列'].unique().tolist()
            brand_dict[brand] = xilie_list
    # Compare every unordered pair of brands once and record shared series.
    related_brand = []
    tested_brand = []
    repeated_xilie = []
    for brand_i in brand_dict.keys():
        for brand_j in brand_dict.keys():
            if brand_i == brand_j or {brand_i, brand_j} in tested_brand:
                continue
            temp_repeated_xilie = []
            for xilie in brand_dict[brand_i]:
                if xilie in brand_dict[brand_j]:
                    temp_repeated_xilie.append(xilie)
            if temp_repeated_xilie:
                repeated_xilie.append(temp_repeated_xilie)
                related_brand.append([brand_i, brand_j])
            tested_brand.append({brand_i, brand_j})
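    # Annotation (added note, not in the original diff): after this scan,
    #   related_brand  holds [brand_i, brand_j] pairs sharing >= 1 series,
    #   repeated_xilie holds the shared series per pair, index-aligned
    #                  with related_brand,
    #   tested_brand   holds {brand_i, brand_j} sets so each unordered
    #                  pair is compared only once.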
    # Write the results to Excel.
    workbook = xlsxwriter.Workbook('./brand_filter.xlsx')
    bold_format = workbook.add_format({'bold': True})
    # Sheet1: suspicious brand pairs, with a blank column for the correct brand.
    worksheet = workbook.add_worksheet(name='Sheet1')
    worksheet.write('A1', '品牌A', bold_format)
    worksheet.write('B1', '品牌B', bold_format)
    worksheet.write('C1', '正确品牌', bold_format)
    col = 0
    row = 1
    for brand_list in related_brand:
        for brand in brand_list:
            worksheet.write_string(row, col, brand)
            col += 1
        row += 1
        col = 0
    # Sheet2: the same pairs plus the series they share.
    worksheet2 = workbook.add_worksheet(name='Sheet2')
    worksheet2.write('A1', '品牌A', bold_format)
    worksheet2.write('B1', '品牌B', bold_format)
    worksheet2.write('C1', '重复系列', bold_format)
    row = 1
    col = 0
    for i in range(len(related_brand)):
        worksheet2.write_string(row, col, related_brand[i][0])
        col += 1
        worksheet2.write_string(row, col, related_brand[i][1])
        col += 1
        worksheet2.write_string(row, col, f"{repeated_xilie[i]}")
        row += 1
        col = 0
    # Sheet3: every brand and its full series list.
    worksheet3 = workbook.add_worksheet(name='Sheet3')
    row = 0
    col = 0
    for key in brand_dict.keys():
        worksheet3.write_string(row, col, key)
        col += 1
        worksheet3.write_string(row, col, f"{brand_dict[key]}")
        col = 0
        row += 1
    workbook.close()
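As an aside, the overlap scan above can be exercised in isolation. A minimal sketch with made-up brand and series names (illustrative only, not from this repository):

# Toy demonstration of the brand-overlap detection in brand_washing_special.
# Brand and series names below are invented for illustration.
brand_dict = {'BrandA': ['X100', 'X200'], 'BrandB': ['X200', 'Y300'], 'BrandC': ['Z1']}
related_brand, tested_brand, repeated_xilie = [], [], []
for brand_i in brand_dict:
    for brand_j in brand_dict:
        if brand_i == brand_j or {brand_i, brand_j} in tested_brand:
            continue
        shared = [x for x in brand_dict[brand_i] if x in brand_dict[brand_j]]
        if shared:
            repeated_xilie.append(shared)
            related_brand.append([brand_i, brand_j])
        tested_brand.append({brand_i, brand_j})
print(related_brand)   # [['BrandA', 'BrandB']]
print(repeated_xilie)  # [['X200']]

Storing each tested pair as a set {brand_i, brand_j} makes the comparison order-insensitive, at the cost of a linear membership test per pair.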
def brand_washing(filepath, thre=0.5, inner_thre=0.5, a=1, sheet_name=0):
    # filepath: path to the Excel file; thre: threshold on the overlap rate of
    # model names between two brands; inner_thre: threshold on the keyword
    # overlap rate within a single model name across two brands; a: weight
    # adjustment; sheet_name: sheet name or index.
    df = pd.read_excel(filepath, sheet_name=sheet_name, converters={'产品编码': str})
    category = df.loc[1, '产品类别']
    # Categories that need the special handling above.
    wrong_category_list = ['台式机', '笔记本', '一体电脑', '复印纸']
    if category in wrong_category_list:
        brand_washing_special(filepath, sheet_name, category)
        return
    conn_zi_new = pymssql.connect(host='123.56.115.207', user='zgcprice3311',
                                  password='zgcprice20200628', database='ZI_NEW',
                                  autocommit=True)
    cursor_zi_new = conn_zi_new.cursor()
@@ -277,6 +408,6 @@ def brand_washing(filepath,thre=0.5,inner_thre=0.5,a=1,sheet_name=0):
    workbook.close()


if __name__ == '__main__':
    filepath = 'E:\\ZDZC\\激光打印机 参数确认.xlsx'
    filepath = 'E:\\ZDZC\\Sourcetree_local\\公共代码\\复印纸 参数确认.xlsx'
    brand_washing(filepath)
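For reference, the series normalization both branches apply can be checked standalone; the sample strings below are invented for illustration:

# Standalone check of the normalization used above: strip whitespace,
# drop the literal '系列' ("series") suffix, then uppercase.
samples = [' 彩虹系列 ', 'laserjet pro系列', 'a4']
print([s.strip().replace('系列', '').upper() for s in samples])
# -> ['彩虹', 'LASERJET PRO', 'A4']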