Commit 4b96f5b3 authored by rico.liu

update

parent 69cdf41f
...@@ -144,6 +144,17 @@ class CRAWLER: ...@@ -144,6 +144,17 @@ class CRAWLER:
source = source.text.replace('<br />','') source = source.text.replace('<br />','')
html = etree.HTML(source) html = etree.HTML(source)
#get category and product name
try:
category = html.xpath("//div[@class='breadcrumb']/a[2]/text()")[0]
except:
category = '无法获取类别'
try:
product_name = html.xpath("//div[@class='breadcrumb']/a[4]/text()")[0]
except:
product_name = '无法获取产品名称'
#get Zol attribute and value #get Zol attribute and value
Zol_data = pd.DataFrame() Zol_data = pd.DataFrame()
attr_list = html.xpath("//span[contains(@id,'newPmName')]/text()") attr_list = html.xpath("//span[contains(@id,'newPmName')]/text()")
...@@ -193,9 +204,10 @@ class CRAWLER: ...@@ -193,9 +204,10 @@ class CRAWLER:
break break
if get_value == '': if get_value == '':
get_data_list.append("爬取不到数据") get_data_list.append("爬取不到数据")
res = dict([(k,v) for k,v in zip(self.necessary_attrs,get_data_list)]) res = {'产品类别':category,'产品名称':product_name}
res.update(dict([(k,v) for k,v in zip(self.necessary_attrs,get_data_list)]))
return res return res
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment