Commit 69cdf41f authored by rico.liu's avatar rico.liu

add

parent 1ac5e534
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Dec 7 20:45:38 2019
@author: rico
"""
import pymssql
import pandas as pd
import requests
from urllib.parse import quote
from lxml import etree
class CRAWLER:
    """Scrape product attribute values from detail.zol.com.cn and map them
    onto this project's internal attribute names.

    Workflow: the constructor loads, from SQL Server, (a) the mapping between
    ZOL attribute names and internal (ZI) attribute names for one category and
    (b) the list of required internal attributes.  ``crawl_zol(keyword)`` then
    searches ZOL for a product, follows the detail/parameter pages, scrapes the
    attribute table and returns ``{internal_attr: value_or_placeholder}``.
    """

    # Bounded retry count for DB queries and HTTP requests; the original code
    # retried via unbounded recursion, which could loop forever / overflow.
    MAX_RETRIES = 3

    def __init__(self, zgc_categoryname):
        """Open the DB connection and preload mapping data for one category.

        :param zgc_categoryname: internal category code/name (e.g. '空调')
        """
        # NOTE(review): credentials are hard-coded; consider moving them to
        # configuration/environment — left as-is to preserve behavior.
        self.conn = pymssql.connect(host='123.56.115.207', user='zgcprice3311',
                                    password='zgcprice20200628',
                                    database='ZI_NEW', autocommit=True)
        self.zol_rel_data = self.get_zol_attribute_relation(zgc_categoryname)
        # get_necessary_attrs closes self.conn, so it must run last.
        self.necessary_attrs = self.get_necessary_attrs(zgc_categoryname)
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
            'Referer': 'http://detail.zol.com.cn/',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'TE': 'Trailers',
            'Cookie': 'ip_ck=78CD7v3zj7QuODcyOTc0LjE1NTM1ODc1NjQ%3D; zol_index_today_best_close1=today_yes; zol_userid=weixin_716d9jc1; zol_check=2040718347; zol_cipher=fd5cd1e006683322f25e2b9350b5ad1c; zol_sid=52743385; z_pro_city=s_provice%3Dsichuan%26s_city%3Dchengdu; zol_bind_weixin_716d9jc1=1; gr_user_id=4aedd91b-fbef-43ae-8857-e44d1849bdb3; userProvinceId=17; userCityId=386; userCountyId=0; userLocationId=21; realLocationId=21; userFidLocationId=21; lv=1564041560; vn=6; zol_vest_no=weixin_716d9jc1; z_day=izol106129=1&izol101693=1&rdetail=9; gr_session_id_9b437fe8881a7e19=b304517c-a53c-4945-8f7e-e4c67b4963e7; gr_session_id_9b437fe8881a7e19_b304517c-a53c-4945-8f7e-e4c67b4963e7=true; Hm_lvt_ae5edc2bc4fc71370807f6187f0a2dd0=1561707760,1562816362,1564019660,1564044365; visited_subcateId=0|212|48|892; visited_subcateProId=0-0|212-0|48-0|892-0; listSubcateId=0; Adshow=0; Hm_lpvt_ae5edc2bc4fc71370807f6187f0a2dd0=1564045129; visited_serachKw=S262NV.html%7CS262NV%7CSF-S262NV%7CSF-S601D%7CFC-5015AC%7CSF-S261NV; questionnaire_pv=1564012830'
        }
        self.basic_url = "https://detail.zol.com.cn/index.php?c=SearchList&keyword="

    def get_zol_attribute_relation(self, zgc_categoryname):
        """Load the ZOL -> internal attribute-name mapping for a category.

        :param zgc_categoryname: internal category code
        :return: DataFrame of Product_Relation_Attribute_Subtitle rows
                 (empty DataFrame when no mapping exists or on failure)
        """
        zol_rel_data = pd.DataFrame()
        for _ in range(self.MAX_RETRIES):
            try:
                cursor = self.conn.cursor()
                # Parameterized query — the original interpolated the category
                # name into the SQL string (injection-prone).
                cursor.execute(
                    "SELECT * FROM Product_Relation_Attribute_Subtitle "
                    "WHERE ZI_SubCategoryCode = %s AND Source = 'ZOL'",
                    (zgc_categoryname,))
                rows = cursor.fetchall()
                if not rows:
                    print(str(zgc_categoryname) + " 该类别产品参数项无zol对应关系")
                    cursor.close()
                    return zol_rel_data
                columns = [col[0] for col in cursor.description]
                zol_rel_data = pd.DataFrame(rows, columns=columns)
                cursor.close()
                return zol_rel_data
            except pymssql.Error:
                print('链接失败,重新链接')
        # All retries exhausted — return the empty frame instead of recursing.
        return zol_rel_data

    def get_necessary_attrs(self, zgc_categoryname):
        """Load the list of required attribute names for a category.

        Closes ``self.conn`` on success — this is the last DB query performed.

        :param zgc_categoryname: internal category name
        :return: list of attribute subtitle strings ([] on persistent failure)
        """
        for _ in range(self.MAX_RETRIES):
            try:
                cursor = self.conn.cursor()
                cursor.execute(
                    "select subtitle from vw_property "
                    "where name = %s and left(identy,1) = 1",
                    (zgc_categoryname,))
                rows = cursor.fetchall()
                columns = [col[0] for col in cursor.description]
                attribute_data_list = pd.DataFrame(rows, columns=columns)['subtitle'].tolist()
                cursor.close()
                self.conn.close()
                return attribute_data_list
            except pymssql.Error:
                print('链接失败,重新链接')
        return []

    def get_reponse(self, url):
        """GET ``url`` with the instance headers, retrying on errors.

        :return: the Response on HTTP 200, or None after MAX_RETRIES failures
                 (the original recursed forever and its callers tested for a
                 ``-1`` sentinel that was never produced).
        """
        for _ in range(self.MAX_RETRIES):
            try:
                response = requests.get(url, headers=self.headers, timeout=5)
                if response.status_code == 200:
                    return response
            except requests.RequestException:
                print('请求错误,重新链接')
        return None

    def crawl_zol(self, kw):
        """Crawl attribute names/values for one product; return a dict.

        :param kw: product keyword, e.g. '格力(GREE)KFR-72GW/(72558)NHAD-3'
        :return: {internal_attr: scraped value or '爬取不到数据'}; {} when the
                 product or its detail page cannot be found.
        """
        url = self.basic_url + quote(kw, encoding='gbk')  # site expects GBK
        res = self.get_reponse(url)
        if res is None:
            print('链接超时,检查是否被封锁IP')
            return {}

        # 1. Pick the first relative product link from the search results.
        html = etree.HTML(res.text)
        basic_url = "http://detail.zol.com.cn"
        combine_url = None
        for href in html.xpath("//*[@class='list-item clearfix']//div[@class='pic-box SP']/a/@href"):
            if 'http' in href:  # skip absolute (off-site/ad) links
                continue
            combine_url = basic_url + href
            break
        if not combine_url:
            print('抱歉,未找到该产品')
            return {}

        # 2. Follow the product page to its full parameter page.
        detail = self.get_reponse(combine_url)
        if detail is None:
            print('链接超时,检查是否被封锁IP')
            return {}
        more = etree.HTML(detail.text).xpath("//a[@class='_j_MP_more more']/@href")
        if not more:
            print('抱歉, 无产品详情')
            return {}
        source = self.get_reponse(basic_url + more[0])
        if source is None:
            print('链接超时,检查是否被封锁IP')
            return {}

        # 3. Scrape ZOL's attribute table into parallel name/value lists.
        html = etree.HTML(source.text.replace('<br />', ''))
        attr_list = [v.strip() for v in html.xpath("//span[contains(@id,'newPmName')]/text()")]
        value_list = []
        for attr in attr_list:
            v = html.xpath("//span[contains(text(),'" + attr + "')]/../following-sibling::td[1]/span//text()")
            if isinstance(v, list):
                v = ''.join(v)
            value_list.append(v.strip().replace('\n', '').replace('\r', ''))
        # First occurrence wins, matching the original zip-scan-and-break order.
        scraped = {}
        for name, value in zip(attr_list, value_list):
            scraped.setdefault(name, value)

        # 4. For each required internal attribute, list its candidate ZOL
        #    names (a comma-joined DB value becomes a list of alternatives).
        candidates_by_attr = {}
        for attr in self.necessary_attrs:
            zol_names = self.zol_rel_data[self.zol_rel_data['ZI_SubTitle'] == attr]['Other_SubTitle'].unique().tolist()
            candidates_by_attr[attr] = [v if ',' not in str(v) else v.split(',') for v in zol_names]

        # 5. Resolve each required attribute to the first matching value.
        #    (The original appended inside nested loops and could append twice
        #    for one attribute, misaligning the final result dict.)
        result = {}
        for std_attr in self.necessary_attrs:
            get_value = ''
            for cand in candidates_by_attr[std_attr]:
                for name in (cand if isinstance(cand, list) else [cand]):
                    if name in scraped:
                        get_value = scraped[name]
                        break
                if get_value:
                    break
            result[std_attr] = get_value if get_value else "爬取不到数据"
        return result
if __name__ == '__main__':
    # Smoke-test the scraper on a single air-conditioner model.
    zol_crawler = CRAWLER('空调')
    attributes = zol_crawler.crawl_zol('格力(GREE)KFR-72GW/(72558)NHAD-3')
    print(attributes)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment