update

4b96f5b3 · rico.liu · 69cdf41f · 4b96f5b3
Commit 4b96f5b3 authored Mar 31, 2021 by rico.liu
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 15 additions and 3 deletions

ZOL_Crawler.py 公共代码/ZOL_Crawler.py +15 -3

No files found.
--- a/公共代码/ZOL_Crawler.py
+++ b/公共代码/ZOL_Crawler.py
@@ -144,6 +144,17 @@ class CRAWLER:
        source = source.text.replace('<br />','')          
        html = etree.HTML(source)
+        #get category and product name
+        try:
+            category = html.xpath("//div[@class='breadcrumb']/a[2]/text()")[0]
+        except:
+            category = '无法获取类别'
+        try:
+            product_name = html.xpath("//div[@class='breadcrumb']/a[4]/text()")[0]
+        except:
+            product_name = '无法获取产品名称'
        #get Zol attribute and value
        Zol_data = pd.DataFrame()
        attr_list = html.xpath("//span[contains(@id,'newPmName')]/text()")
@@ -193,9 +204,10 @@ class CRAWLER:
                                break
            if get_value == '':
                get_data_list.append("爬取不到数据") 
-        res = dict([(k,v) for k,v in zip(self.necessary_attrs,get_data_list)])
+        res = {'产品类别':category,'产品名称':product_name}
+        res.update(dict([(k,v) for k,v in zip(self.necessary_attrs,get_data_list)]))
        return res