Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
中
中电中采
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ZGC_INDEX
中电中采
Commits
8d7a546a
Commit
8d7a546a
authored
Dec 22, 2020
by
rico.liu
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add crawl params and pic
parent
2ebdc9ca
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
140 additions
and
14 deletions
+140
-14
main.py
模板建库/main.py
+140
-14
No files found.
模板建库/main.py
View file @
8d7a546a
...
@@ -363,6 +363,62 @@ def GetCollectData(batch,channel_alias):
...
@@ -363,6 +363,62 @@ def GetCollectData(batch,channel_alias):
return
df_db
return
df_db
# Crawl the parameters and pictures behind each product link (for now only one
# link per product is supported; when a product has several links, the first
# link's information is used).
def _crawl_params_and_pics(df, cursor_zi_new, cursor_zi_service):
    """Fetch crawl results for every row of ``df`` and persist them.

    Helper for ``GetParamsinfoAndPic``; it owns no connections, so the caller
    is responsible for closing them.

    Args:
        df: DataFrame with at least the columns ``id``, ``url`` and
            ``url_source``. ``url`` and ``url_source`` hold string
            representations of sequences; only element ``[0]`` is used.
        cursor_zi_new: cursor used to read ``zdindex_channel_rel``.
        cursor_zi_service: cursor used to update ``product_all``.

    Side effects:
        Adds/overwrites ``df['url_pic']`` and ``df['url_params']`` in place,
        sends one HTTP POST, and issues one UPDATE per row.
    """
    # Price-channel dictionary: channel_alias_code -> channel_alias_cn.
    cursor_zi_new.execute("select channel_alias_cn,channel_alias_code from zdindex_channel_rel")
    price_source = pd.DataFrame(
        cursor_zi_new.fetchall(),
        columns=[column[0] for column in cursor_zi_new.description],
    )
    price_source_dict = dict(
        zip(
            price_source['channel_alias_code'].tolist(),
            price_source['channel_alias_cn'].tolist(),
        )
    )

    # Request address of the crawl service.
    request_url = "http://59.110.219.171:8092/return_data"

    # Build the request data: one [first_url, channel_name] pair per row.
    # NOTE(review): eval() is used to parse strings stored in df; if those
    # values can ever come from an untrusted source this is code execution.
    # ast.literal_eval would be the safe replacement -- confirm data format.
    request_rows = [
        [str(eval(url)[0]), str(price_source_dict[eval(source)[0]])]
        for url, source in zip(df['url'].tolist(), df['url_source'].tolist())
    ]
    # The dict repr is turned into JSON-like text by swapping the quote style.
    data_list = str({'data': request_rows}).replace("'", "\"")
    payload = {'dataList': data_list}
    response = requests.request("POST", request_url, data=payload)

    # NOTE(review): eval() on an HTTP response body executes whatever the
    # remote service returns. Kept to preserve behaviour; replace with
    # json.loads / ast.literal_eval once the response format is confirmed.
    res = eval(response.text)

    df['url_pic'] = [str(element['img_list']) for element in res]

    # Strip empty '' : '' entries (and the separators/spaces they leave
    # behind) from the repr of each crawled parameter dict.
    crawl_params_list = [
        str(element['class_list'])
        .replace("'': ''", "")
        .replace(", ,", ",")
        .replace("{,", "{")
        .replace(" ", "")
        for element in res
    ]

    url_params_list = []
    for element, url_params in zip(res, crawl_params_list):
        params_dict = eval(url_params)
        # Record which link the parameters were crawled from.
        params_dict.update({'爬取链接': element['url']})
        url_params_list.append(str(params_dict))
    df['url_params'] = url_params_list

    # Store the crawled information.
    # Instantiate the progress bar.
    index_ = Index()
    for counter, (_, row) in enumerate(df.iterrows(), start=1):
        try:
            print(index_(counter, len(df) - 1), end='%')
        except Exception:
            # Presumably guards against a zero total when df has a single
            # row -- TODO confirm against Index. Narrowed from a bare
            # ``except:`` so Ctrl-C / SystemExit are no longer swallowed.
            print(index_(counter, 1), end='%')

        id_ = row['id']
        # Double single quotes so the values can sit inside SQL literals.
        url_pic = row['url_pic'].replace("'", "''")
        url_params = row['url_params'].replace("'", "''")
        # NOTE(review): string-built SQL; switch to the driver's parameter
        # binding once the paramstyle of the MSSQL wrapper is known.
        cursor_zi_service.execute(f"update product_all set url_params = '{url_params}',url_pic = '{url_pic}' where id = {id_}")


def GetParamsinfoAndPic(df):
    """Crawl parameters and pictures for each product link and store them.

    Opens two ``MSSQL`` connections (``ZI_NEW`` for the channel dictionary,
    ``ZI_Service`` for the ``product_all`` updates), delegates the work to
    ``_crawl_params_and_pics`` and always closes both connections, even when
    the crawl request or a database statement raises.

    Args:
        df: DataFrame of products; must contain ``id``, ``url`` and
            ``url_source``.

    Returns:
        The same ``df`` object, with ``url_pic`` and ``url_params`` columns
        added (the frame is modified in place).
    """
    mssql_new = MSSQL('123.56.115.207', 'ZI_NEW')
    try:
        mssql = MSSQL('123.57.45.119', 'ZI_Service')
        try:
            _crawl_params_and_pics(df, mssql_new._cur, mssql._cur)
            print('爬去数据存储完成')
        finally:
            mssql.Close()
    finally:
        mssql_new.Close()
    return df
#解析重点类产品数据并导出
#解析重点类产品数据并导出
def
GetCollectDataDetail
(
df
,
channel_alias
,
batch
):
def
GetCollectDataDetail
(
df
,
channel_alias
,
batch
):
...
@@ -419,7 +475,8 @@ def GetCollectDataDetail(df,channel_alias,batch):
...
@@ -419,7 +475,8 @@ def GetCollectDataDetail(df,channel_alias,batch):
writer
=
pd
.
ExcelWriter
(
f
"{channel_alias}建库产品参数确认{batch}.xlsx"
)
writer
=
pd
.
ExcelWriter
(
f
"{channel_alias}建库产品参数确认{batch}.xlsx"
)
index
=
0
for
category
in
df
[
'zi_subcategoryname'
]
.
unique
()
.
tolist
():
for
category
in
df
[
'zi_subcategoryname'
]
.
unique
()
.
tolist
():
#获取每一个品类的dataframe
#获取每一个品类的dataframe
...
@@ -430,6 +487,8 @@ def GetCollectDataDetail(df,channel_alias,batch):
...
@@ -430,6 +487,8 @@ def GetCollectDataDetail(df,channel_alias,batch):
single_subtitle_df
=
subtitle_df
[
subtitle_df
[
'name'
]
==
category
]
single_subtitle_df
=
subtitle_df
[
subtitle_df
[
'name'
]
==
category
]
#获取这个类的必填属性规格(CPU属性无需填写,系统在建库时自动给出)
#获取这个类的必填属性规格(CPU属性无需填写,系统在建库时自动给出)
requier_param_list
=
single_subtitle_df
[
single_subtitle_df
[
'require_param'
]
==
'1'
][
'subtitle'
]
.
tolist
()
requier_param_list
=
single_subtitle_df
[
single_subtitle_df
[
'require_param'
]
==
'1'
][
'subtitle'
]
.
tolist
()
#获取这个类的参数项对应关系
single_subtitle_map_df
=
subtitle_map_df
[
subtitle_map_df
[
'categoryname'
]
==
category
]
#获取这个类的非必填
#获取这个类的非必填
non_requier_param_list
=
[]
non_requier_param_list
=
[]
...
@@ -445,8 +504,10 @@ def GetCollectDataDetail(df,channel_alias,batch):
...
@@ -445,8 +504,10 @@ def GetCollectDataDetail(df,channel_alias,batch):
pass
pass
requier_param_list
=
[
str
(
param
)
+
"(*)"
for
param
in
requier_param_list
]
requier_param_list
=
[
str
(
param
)
+
"(*)"
for
param
in
requier_param_list
]
#将爬取信息放到最后
param_list_all
=
requier_param_list
+
non_requier_param_list
param_list_all
=
requier_param_list
+
non_requier_param_list
param_list_all
.
remove
(
"原始参数及链接信息"
)
param_list_all
.
append
(
"原始参数及链接信息"
)
id_list
=
list
()
id_list
=
list
()
name_list
=
list
()
name_list
=
list
()
...
@@ -455,12 +516,37 @@ def GetCollectDataDetail(df,channel_alias,batch):
...
@@ -455,12 +516,37 @@ def GetCollectDataDetail(df,channel_alias,batch):
param_list
=
list
()
param_list
=
list
()
value_list
=
list
()
value_list
=
list
()
new_name_list
=
list
()
new_name_list
=
list
()
#实例化进度条
index_
=
Index
()
counter
=
1
for
index
,
row
in
cat_df
.
iterrows
():
for
index
,
row
in
cat_df
.
iterrows
():
try
:
print
(
index_
(
counter
,
len
(
cat_df
)
-
1
),
end
=
'
%
'
)
except
:
print
(
index_
(
counter
,
1
),
end
=
'
%
'
)
counter
+=
1
id_
=
row
[
'id'
]
id_
=
row
[
'id'
]
name
=
row
[
'name'
]
name
=
row
[
'name'
]
zi_brandname
=
row
[
'zi_brandname'
]
zi_brandname
=
row
[
'zi_brandname'
]
zi_subcategoryname
=
row
[
'zi_subcategoryname'
]
zi_subcategoryname
=
row
[
'zi_subcategoryname'
]
url_params
=
eval
(
row
[
'url_params'
])
std_key_list
=
[]
std_value_list
=
[]
for
key
in
url_params
.
keys
():
judge_df
=
single_subtitle_map_df
[
single_subtitle_map_df
[
'outsubtitle'
]
==
key
][[
'subtitle'
]]
if
judge_df
.
empty
:
continue
else
:
std_key_list
.
append
(
judge_df
[
'subtitle'
]
.
tolist
()[
0
])
std_value_list
.
append
(
url_params
[
key
])
std_url_params
=
dict
(
zip
(
std_key_list
,
std_value_list
))
for
param
in
param_list_all
:
for
param
in
param_list_all
:
...
@@ -470,9 +556,22 @@ def GetCollectDataDetail(df,channel_alias,batch):
...
@@ -470,9 +556,22 @@ def GetCollectDataDetail(df,channel_alias,batch):
zi_brandname_list
.
append
(
zi_brandname
)
zi_brandname_list
.
append
(
zi_brandname
)
zi_subcategoryname_list
.
append
(
zi_subcategoryname
)
zi_subcategoryname_list
.
append
(
zi_subcategoryname
)
param_list
.
append
(
param
)
param_list
.
append
(
param
)
value_list
.
append
(
''
)
new_name_list
.
append
(
''
)
new_name_list
.
append
(
''
)
if
param
==
'原始参数及链接信息'
:
value
=
row
[
'url_params'
]
else
:
try
:
value
=
url_params
[
param
]
except
:
try
:
value
=
std_url_params
[
param
]
except
:
value
=
''
value_list
.
append
(
value
)
export_df
=
pd
.
DataFrame
()
export_df
=
pd
.
DataFrame
()
export_df
[
'id'
]
=
id_list
export_df
[
'id'
]
=
id_list
export_df
[
'name'
]
=
name_list
export_df
[
'name'
]
=
name_list
...
@@ -484,6 +583,8 @@ def GetCollectDataDetail(df,channel_alias,batch):
...
@@ -484,6 +583,8 @@ def GetCollectDataDetail(df,channel_alias,batch):
category
=
category
.
replace
(
'/'
,
'_'
)
category
=
category
.
replace
(
'/'
,
'_'
)
export_df
.
to_excel
(
writer
,
f
"{category}参数数据"
)
export_df
.
to_excel
(
writer
,
f
"{category}参数数据"
)
print
(
f
"完成{category}参数整理"
)
writer
.
save
()
writer
.
save
()
mssql
.
Close
()
mssql
.
Close
()
...
@@ -1713,8 +1814,11 @@ UpdateBasicData(path)
...
@@ -1713,8 +1814,11 @@ UpdateBasicData(path)
#获取建库数据
#获取建库数据
df
=
GetCollectData
(
batch
,
channel_alias
)
df
=
GetCollectData
(
batch
,
channel_alias
)
#Crawl the parameter information and pictures for each product link
df_crawled
=
GetParamsinfoAndPic
(
df
)
#导出建库数据参数补充
#导出建库数据参数补充
GetCollectDataDetail
(
df
,
channel_alias
,
batch
)
GetCollectDataDetail
(
df
_crawled
,
channel_alias
,
batch
)
#处理建库数据
#处理建库数据
...
@@ -1749,14 +1853,6 @@ cursor_zi_service = mssql._cur
...
@@ -1749,14 +1853,6 @@ cursor_zi_service = mssql._cur
#入库
#入库
id_list
=
df
[
'id'
]
.
tolist
()
id_list
=
df
[
'id'
]
.
tolist
()
source_name_list
=
df
[
'new_name'
]
.
tolist
()
price_list
=
df
[
'url_price'
]
.
apply
(
lambda
x
:
eval
(
x
)[
0
])
.
tolist
()
url_list
=
df
[
'url'
]
.
apply
(
lambda
x
:
eval
(
x
)[
0
])
.
tolist
()
channelId_list
=
df
[
'url_source'
]
.
apply
(
lambda
x
:
eval
(
x
)[
0
])
.
tolist
()
brand_list
=
df
[
'zi_brandname'
]
.
tolist
()
brand_list
=
df
[
'zi_brandname'
]
.
tolist
()
brandId_list
=
df
[
'zi_brandcode'
]
.
apply
(
lambda
x
:
str
(
x
))
.
tolist
()
brandId_list
=
df
[
'zi_brandcode'
]
.
apply
(
lambda
x
:
str
(
x
))
.
tolist
()
...
@@ -1765,10 +1861,15 @@ category_list = df['zi_subcategoryname'].tolist()
...
@@ -1765,10 +1861,15 @@ category_list = df['zi_subcategoryname'].tolist()
categoryId_list
=
df
[
'zi_subcategorycode'
]
.
apply
(
lambda
x
:
str
(
x
))
.
tolist
()
categoryId_list
=
df
[
'zi_subcategorycode'
]
.
apply
(
lambda
x
:
str
(
x
))
.
tolist
()
params_list
=
df
[
'params_standard'
]
.
apply
(
lambda
x
:
eval
(
x
))
.
tolist
()
name_list
=
df
[
'new_name'
]
.
apply
(
lambda
x
:
x
.
replace
(
"'"
,
"''"
))
.
tolist
()
name_list
=
df
[
'new_name'
]
.
apply
(
lambda
x
:
x
.
replace
(
"'"
,
"''"
))
.
tolist
()
params_list
=
[]
for
params_standard
,
url_params
in
zip
(
df
[
'params_standard'
]
.
tolist
(),
df
[
'url_params'
]
.
tolist
()):
params_standard_dict
=
eval
(
params_standard
)
params_standard_dict
.
update
({
'原始参数及链接信息'
:
url_params
})
params_list
.
append
(
params_standard_dict
)
data
=
{
data
=
{
"params_info"
:
{
"params_info"
:
{
"brand_list"
:
brand_list
,
"brand_list"
:
brand_list
,
...
@@ -1786,6 +1887,31 @@ sku_list = res['sku_list']
...
@@ -1786,6 +1887,31 @@ sku_list = res['sku_list']
for
_id
,
sku
in
zip
(
id_list
,
sku_list
):
for
_id
,
sku
in
zip
(
id_list
,
sku_list
):
cursor_zi_service
.
execute
(
f
"update product_all set productcode = '{sku}',remark = Null,state = '9' where id = {_id}"
)
cursor_zi_service
.
execute
(
f
"update product_all set productcode = '{sku}',remark = Null,state = '9' where id = {_id}"
)
df
[
'productcode'
]
=
sku_list
#组织价格数据
sku_list
=
[]
source_name_list
=
[]
price_list
=
[]
url_list
=
[]
channelId_list
=
[]
for
index
,
row
in
df
.
iterrows
():
sku_list_temp
=
row
[
'productcode'
]
source_name_list_temp
=
row
[
'new_name'
]
.
replace
(
"'"
,
"''"
)
url_list_temp
=
eval
(
row
[
'url'
])
channelId_list_temp
=
eval
(
row
[
'url_source'
])
price_list_temp
=
eval
(
row
[
'url_price'
])
for
url
,
channelId
,
price
in
zip
(
url_list_temp
,
channelId_list_temp
,
price_list_temp
):
sku_list
.
append
(
sku_list_temp
)
source_name_list
.
append
(
source_name_list_temp
)
url_list
.
append
(
url
)
channelId_list
.
append
(
channelId
)
price_list
.
append
(
price
)
#价格关系入库
#价格关系入库
data
=
{
data
=
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment