Commit a4413305 authored by rico.liu

update

parent 94ca7eb4
...@@ -144,6 +144,17 @@ class CRAWLER: ...@@ -144,6 +144,17 @@ class CRAWLER:
source = source.text.replace('<br />','') source = source.text.replace('<br />','')
html = etree.HTML(source) html = etree.HTML(source)
#get category and product name
try:
category = html.xpath("//div[@class='breadcrumb']/a[2]/text()")[0]
except:
category = '无法获取类别'
try:
product_name = html.xpath("//div[@class='breadcrumb']/a[4]/text()")[0]
except:
product_name = '无法获取产品名称'
#get Zol attribute and value #get Zol attribute and value
Zol_data = pd.DataFrame() Zol_data = pd.DataFrame()
attr_list = html.xpath("//span[contains(@id,'newPmName')]/text()") attr_list = html.xpath("//span[contains(@id,'newPmName')]/text()")
...@@ -194,7 +205,8 @@ class CRAWLER: ...@@ -194,7 +205,8 @@ class CRAWLER:
if get_value == '': if get_value == '':
get_data_list.append("爬取不到数据") get_data_list.append("爬取不到数据")
res = dict([(k,v) for k,v in zip(self.necessary_attrs,get_data_list)]) res = {'产品类别':category,'产品名称':product_name}
res.update(dict([(k,v) for k,v in zip(self.necessary_attrs,get_data_list)]))
return res return res
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment