Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
C
completeParamsValueWithZOL
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
ZGC_INDEX
completeParamsValueWithZOL
Commits
941d3ec6
Commit
941d3ec6
authored
Dec 07, 2019
by
rico.liu
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
init
parent
7c2d9388
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
199 additions
and
0 deletions
+199
-0
ZOL_Crawler.py
ZOL_Crawler.py
+199
-0
No files found.
ZOL_Crawler.py
0 → 100644
View file @
941d3ec6
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Dec 7 20:45:38 2019
@author: rico
"""
import
pymssql
import
pandas
as
pd
import
requests
from
urllib.parse
import
quote
from
lxml
import
etree
class CRAWLER:
    """Scrape product parameter values from ZOL (detail.zol.com.cn).

    On construction the crawler connects to the ZI SQL Server database and
    loads, for the given category code:

    * ``zol_rel_data`` -- the mapping between ZI attribute names and the
      names ZOL uses (table ``Product_Relation_Attribute_SubTitle``), and
    * ``necessary_attrs`` -- the attribute names that must be filled for
      the category (view ``vw_relation_property``).

    ``crawl_zol(keyword)`` then searches ZOL for one product and returns a
    ``{attribute: value}`` dict, using the placeholder ``"爬取不到数据"``
    for attributes whose value could not be scraped.
    """

    def __init__(self, categorycode):
        """Connect to the database and preload mappings for *categorycode*."""
        # NOTE(review): credentials are hard-coded; they belong in
        # configuration or the environment, not in source control.
        self.conn = pymssql.connect('123.56.115.207', 'zgcindex',
                                    'jiayou2017+2018', 'ZI_DataBase')
        self.zol_rel_data = self.get_zol_attribute_relation(categorycode)
        # get_necessary_attrs() closes self.conn on success, so it must be
        # the last database call made here.
        self.necessary_attrs = self.get_necessary_attrs(categorycode)
        # Browser-like headers (including a captured session cookie) so the
        # requests are not rejected by ZOL's anti-crawling checks.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
            'Referer': 'http://detail.zol.com.cn/',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'TE': 'Trailers',
            'Cookie': 'ip_ck=78CD7v3zj7QuODcyOTc0LjE1NTM1ODc1NjQ%3D; zol_index_today_best_close1=today_yes; zol_userid=weixin_716d9jc1; zol_check=2040718347; zol_cipher=fd5cd1e006683322f25e2b9350b5ad1c; zol_sid=52743385; z_pro_city=s_provice%3Dsichuan%26s_city%3Dchengdu; zol_bind_weixin_716d9jc1=1; gr_user_id=4aedd91b-fbef-43ae-8857-e44d1849bdb3; userProvinceId=17; userCityId=386; userCountyId=0; userLocationId=21; realLocationId=21; userFidLocationId=21; lv=1564041560; vn=6; zol_vest_no=weixin_716d9jc1; z_day=izol106129=1&izol101693=1&rdetail=9; gr_session_id_9b437fe8881a7e19=b304517c-a53c-4945-8f7e-e4c67b4963e7; gr_session_id_9b437fe8881a7e19_b304517c-a53c-4945-8f7e-e4c67b4963e7=true; Hm_lvt_ae5edc2bc4fc71370807f6187f0a2dd0=1561707760,1562816362,1564019660,1564044365; visited_subcateId=0|212|48|892; visited_subcateProId=0-0|212-0|48-0|892-0; listSubcateId=0; Adshow=0; Hm_lpvt_ae5edc2bc4fc71370807f6187f0a2dd0=1564045129; visited_serachKw=S262NV.html%7CS262NV%7CSF-S262NV%7CSF-S601D%7CFC-5015AC%7CSF-S261NV; questionnaire_pv=1564012830'
        }
        self.basic_url = "http://detail.zol.com.cn/index.php?c=SearchList&keyword="

    def get_zol_attribute_relation(self, zgc_category_code, max_retries=5):
        """Return the ZOL<->ZI attribute-name mapping as a DataFrame.

        Fixes vs. original: the query is parameterized instead of built by
        string concatenation, the builtin name ``tuple`` is no longer
        shadowed, and the unbounded retry recursion is replaced by a loop
        that re-raises the last error after *max_retries* failed attempts.
        """
        for attempt in range(max_retries):
            try:
                cursor = self.conn.cursor()
                cursor.execute(
                    "SELECT * FROM Product_Relation_Attribute_SubTitle "
                    "WHERE ZI_SubCategoryCode = %s AND Source = 'ZOL'",
                    (zgc_category_code,))
                rows = cursor.fetchall()
                frame = pd.DataFrame(
                    rows, columns=[col[0] for col in cursor.description])
                cursor.close()
                return frame
            except Exception:
                if attempt == max_retries - 1:
                    raise
                print('链接失败,重新链接')

    def get_necessary_attrs(self, zgc_category_code, max_retries=5):
        """Return the list of required attribute names for the category.

        Closes ``self.conn`` on success (the connection is not needed after
        this call).  Parameterized query and bounded retries as above; the
        intermediate DataFrame of the original is unnecessary for a
        single-column result and is replaced by a plain list build.
        """
        for attempt in range(max_retries):
            try:
                cursor = self.conn.cursor()
                cursor.execute(
                    "SELECT DISTINCT SubTitle FROM vw_relation_property "
                    "WHERE SubCategoryCode = %s "
                    "AND (ISimportant = 1 OR ispeijian = 1)",
                    (zgc_category_code,))
                subtitles = [row[0] for row in cursor.fetchall()]
                cursor.close()
                self.conn.close()
                return subtitles
            except Exception:
                if attempt == max_retries - 1:
                    raise
                print('链接失败,重新链接')

    def get_reponse(self, url, max_retries=5):
        """GET *url* with the crawler headers; return the Response or None.

        Returns the ``requests.Response`` on HTTP 200 and ``None`` on any
        other status or after *max_retries* failed requests.  (The original
        recursed without bound on errors and fell through to an implicit
        ``None`` on non-200 responses, which callers then compared to -1.)
        """
        for _ in range(max_retries):
            try:
                response = requests.get(url, headers=self.headers, timeout=5)
            except requests.RequestException:
                print('请求错误,重新链接')
                continue
            # Original behavior: give up immediately on a non-200 status.
            return response if response.status_code == 200 else None
        return None

    def crawl_zol(self, kw):
        """Search ZOL for *kw* and scrape its parameter table.

        Returns ``{attribute: value}`` covering ``self.necessary_attrs``,
        or an empty dict when the search/detail/parameter page cannot be
        reached.  Fixes vs. original: failed requests are detected with
        ``is None`` (the old ``== -1`` test never matched and crashed on
        ``None.text``), and exactly one value is recorded per attribute so
        the final attribute/value pairing can no longer drift out of step.
        """
        # ZOL's search endpoint expects the keyword GBK-percent-encoded.
        url = self.basic_url + quote(kw, encoding='gbk')
        res = self.get_reponse(url)
        if res is None:
            print('链接超时,检查是否被封锁IP')
            return {}
        html = etree.HTML(res.text)

        # First relative product link in the result list -> detail-page URL.
        site_root = "http://detail.zol.com.cn"
        combine_url = None
        try:
            hrefs = html.xpath(
                "//*[@class='list-item clearfix']"
                "//div[@class='pic-box SP']/a/@href")
            for href in hrefs:
                if 'http' not in href:
                    combine_url = site_root + href
                    break
        except Exception:
            combine_url = None
        if combine_url is None:
            print('抱歉,未找到该产品')
            return {}

        detail = self.get_reponse(combine_url)
        if detail is None:
            print('链接超时,检查是否被封锁IP')
            return {}
        html = etree.HTML(detail.text)

        # Follow the "more parameters" link to the full parameter table.
        try:
            more = html.xpath("//a[@class='_j_MP_more more']/@href")
            more_url = site_root + more[0]
        except Exception:
            print('抱歉, 无产品详情')
            return {}

        source = self.get_reponse(more_url)
        if source is None:
            print('链接超时,检查是否被封锁IP')
            return {}
        # <br /> tags would otherwise split a value into separate text nodes.
        html = etree.HTML(source.text.replace('<br />', ''))

        # ZOL attribute names and their values, in page order.
        attr_list = [v.strip() for v in
                     html.xpath("//span[contains(@id,'newPmName')]/text()")]
        value_list = []
        for attr in attr_list:
            v = html.xpath("//span[contains(text(),'" + attr +
                           "')]/../following-sibling::td[1]/span//text()")
            if isinstance(v, list):
                v = ''.join(v)
            value_list.append(v)
        value_list = [v.strip().replace('\n', '').replace('\r', '')
                      for v in value_list]
        # First occurrence wins if the page repeats an attribute name.
        scraped = {}
        for name, value in zip(attr_list, value_list):
            scraped.setdefault(name, value)

        # Map each required ZI attribute to its ZOL alias(es); a
        # comma-separated alias entry expands to a list of alternatives.
        aliases = {}
        for attr in self.necessary_attrs:
            rel = self.zol_rel_data[
                self.zol_rel_data['ZI_SubTitle'] == attr
            ]['Other_SubTitle'].unique().tolist()
            aliases[attr] = [v if "," not in str(v) else v.split(",")
                             for v in rel]

        # Resolve each required attribute through its aliases, recording the
        # placeholder when nothing usable was scraped for it.
        result = {}
        for std_attr in self.necessary_attrs:
            get_value = ''
            for alias in aliases[std_attr]:
                candidates = alias if isinstance(alias, list) else [alias]
                for cand in candidates:
                    if cand in scraped:
                        get_value = scraped[cand]
                        break
                if get_value:
                    break
            result[std_attr] = get_value if get_value else "爬取不到数据"
        return result
if __name__ == '__main__':
    # Demo run: category code 0506, look up the Toshiba 2823am product.
    zol_crawler = CRAWLER('0506')
    res = zol_crawler.crawl_zol('东芝2823am')
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment