Commit fea41de6 authored by rico.liu

Update according to the new product database

parent d7c72dac
...@@ -4,14 +4,16 @@ ZOL爬虫补参 ...@@ -4,14 +4,16 @@ ZOL爬虫补参
1、实体化对象 1、实体化对象
参数:指数类别编码 string 参数:指数类别名称 string
CRAWLER('0506') CRAWLER('空调')
2、调用对象爬取方法,返回结果字典 2、调用对象爬取方法,返回结果字典
方法:crawl_zol() 方法:crawl_zol()
参数:要补参产品的品牌型号 string 参数:要补参产品的品牌型号 string
crawl_zol('格力(GREE)KFR-72GW/(72558)NHAD-3')
返回:{param:value} 返回:{param:value}
{'产品系列': '绿嘉园系列', '产品型号': '爬取不到数据', '空调类别': '壁挂式空调', '冷暖类型': '冷暖电辅', '能效等级': '三级能效', '匹数': '大3.0P', '变频/定频': '定频'}
\ No newline at end of file
...@@ -13,11 +13,11 @@ from lxml import etree ...@@ -13,11 +13,11 @@ from lxml import etree
class CRAWLER: class CRAWLER:
def __init__(self, categorycode): def __init__(self, zgc_categoryname):
self.conn = pymssql.connect('123.56.115.207','zgcindex','jiayou2017+2018','ZI_DataBase') self.conn = pymssql.connect(host='123.56.115.207', user='zgcprice3311',password='admin@2018@)!*',database= 'ZI_NEW',autocommit=True)
self.zol_rel_data = self.get_zol_attribute_relation(categorycode) self.zol_rel_data = self.get_zol_attribute_relation(zgc_categoryname)
self.necessary_attrs = self.get_necessary_attrs(categorycode) self.necessary_attrs = self.get_necessary_attrs(zgc_categoryname)
self.headers = { self.headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
...@@ -32,18 +32,18 @@ class CRAWLER: ...@@ -32,18 +32,18 @@ class CRAWLER:
} }
self.basic_url = "http://detail.zol.com.cn/index.php?c=SearchList&keyword=" self.basic_url = "http://detail.zol.com.cn/index.php?c=SearchList&keyword="
def get_zol_attribute_relation(self,zgc_category_code): def get_zol_attribute_relation(self,zgc_categoryname):
''' '''
获取ZOL参数对应关系 获取ZOL参数对应关系
''' '''
try: try:
zol_rel_data = pd.DataFrame() zol_rel_data = pd.DataFrame()
cursor = self.conn.cursor() cursor = self.conn.cursor()
cursor.execute("SELECT * FROM Product_Relation_Attribute_SubTitle where ZI_SubCategoryCode = '"+zgc_category_code+"' and Source = 'ZOL'") cursor.execute(f"SELECT * FROM p_skusubtitle_out_map where categoryname = '{zgc_categoryname}' and frm = 'ZOL'")
try: try:
data_source = [v for v in cursor.fetchall()] data_source = [v for v in cursor.fetchall()]
except: except:
print(str(zgc_category_code) + " 该类别产品参数项无zol对应关系") print(str(zgc_categoryname) + " 该类别产品参数项无zol对应关系")
return zol_rel_data return zol_rel_data
zol_rel_data = pd.DataFrame(data_source,columns=[tuple[0] for tuple in cursor.description]) zol_rel_data = pd.DataFrame(data_source,columns=[tuple[0] for tuple in cursor.description])
...@@ -53,17 +53,17 @@ class CRAWLER: ...@@ -53,17 +53,17 @@ class CRAWLER:
except: except:
print('链接失败,重新链接') print('链接失败,重新链接')
return self.get_zol_attribute_relation(zgc_category_code) return self.get_zol_attribute_relation(zgc_categoryname)
def get_necessary_attrs(self,zgc_category_code): def get_necessary_attrs(self,zgc_categoryname):
''' '''
获取对应类别参数项 获取对应类别参数项
''' '''
try: try:
cursor = self.conn.cursor() cursor = self.conn.cursor()
cursor.execute("select distinct SubTitle from vw_relation_property where SubCategoryCode = '"+zgc_category_code+"' and (ISimportant = 1 or ispeijian = 1)") cursor.execute(f"select subtitle from vw_property where name = '{zgc_categoryname}' and left(identy,1) = 1")
data_source = [v for v in cursor.fetchall()] data_source = [v for v in cursor.fetchall()]
attribute_data_list = pd.DataFrame(data_source,columns=[tuple[0] for tuple in cursor.description])['SubTitle'].tolist() attribute_data_list = pd.DataFrame(data_source,columns=[tuple[0] for tuple in cursor.description])['subtitle'].tolist()
cursor.close() cursor.close()
self.conn.close() self.conn.close()
...@@ -72,7 +72,7 @@ class CRAWLER: ...@@ -72,7 +72,7 @@ class CRAWLER:
except: except:
print('链接失败,重新链接') print('链接失败,重新链接')
return self.get_necessary_attrs(zgc_category_code) return self.get_necessary_attrs(zgc_categoryname)
def get_reponse(self,url): def get_reponse(self,url):
''' '''
...@@ -164,7 +164,7 @@ class CRAWLER: ...@@ -164,7 +164,7 @@ class CRAWLER:
#create zol relationship attribute dict #create zol relationship attribute dict
dic = dict() dic = dict()
for attr in self.necessary_attrs: for attr in self.necessary_attrs:
dic[attr] = [v if "," not in str(v) else [v_ for v_ in v.split(",")] for v in self.zol_rel_data[self.zol_rel_data['ZI_SubTitle'] == attr]['Other_SubTitle'].unique().tolist()] dic[attr] = [v if "," not in str(v) else [v_ for v_ in v.split(",")] for v in self.zol_rel_data[self.zol_rel_data['subtitle'] == attr]['outsubtitle'].unique().tolist()]
#get need data #get need data
get_data_list = [] get_data_list = []
...@@ -200,5 +200,6 @@ class CRAWLER: ...@@ -200,5 +200,6 @@ class CRAWLER:
if __name__ == '__main__': if __name__ == '__main__':
crawler = CRAWLER('0506') crawler = CRAWLER('空调')
res = crawler.crawl_zol('东芝2823am') res = crawler.crawl_zol('格力(GREE)KFR-72GW/(72558)NHAD-3')
\ No newline at end of file print(res)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment