Commit fea41de6 authored by rico.liu

update according to new product database

parent d7c72dac
@@ -4,14 +4,16 @@ ZOL crawler parameter completion
1. Instantiate the object
-Parameter: index category code (string)
+Parameter: index category name (string)
-CRAWLER('0506')
+CRAWLER('空调')
2. Call the object's crawl method; it returns a result dict
Method: crawl_zol()
Parameter: brand and model of the product whose parameters need completing (string)
crawl_zol('格力(GREE)KFR-72GW/(72558)NHAD-3')
-Returns: {param:value}
\ No newline at end of file
+Returns: {param:value}
+{'产品系列': '绿嘉园系列', '产品型号': '爬取不到数据', '空调类别': '壁挂式空调', '冷暖类型': '冷暖电辅', '能效等级': '三级能效', '匹数': '大3.0P', '变频/定频': '定频'}
\ No newline at end of file
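Based on the usage steps in the README hunk above, a minimal end-to-end sketch might look like the following. The module name `zol_crawler` is hypothetical (the commit does not show the file name), and the placeholder string '爬取不到数据' for values that could not be crawled is taken from the example output above.

```python
# Hypothetical module name; the diff does not show which file defines CRAWLER.
from zol_crawler import CRAWLER

# Step 1: instantiate with the product category name (new-style argument).
crawler = CRAWLER('空调')

# Step 2: crawl ZOL for one brand/model string; the result is a {param: value} dict.
res = crawler.crawl_zol('格力(GREE)KFR-72GW/(72558)NHAD-3')

# Keep only the attributes that were actually found on ZOL.
found = {k: v for k, v in res.items() if v != '爬取不到数据'}
print(found)
```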
@@ -13,11 +13,11 @@ from lxml import etree
class CRAWLER:
-    def __init__(self, categorycode):
+    def __init__(self, zgc_categoryname):
-        self.conn = pymssql.connect('123.56.115.207','zgcindex','jiayou2017+2018','ZI_DataBase')
-        self.zol_rel_data = self.get_zol_attribute_relation(categorycode)
-        self.necessary_attrs = self.get_necessary_attrs(categorycode)
+        self.conn = pymssql.connect(host='123.56.115.207', user='zgcprice3311',password='admin@2018@)!*',database= 'ZI_NEW',autocommit=True)
+        self.zol_rel_data = self.get_zol_attribute_relation(zgc_categoryname)
+        self.necessary_attrs = self.get_necessary_attrs(zgc_categoryname)
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
@@ -32,18 +32,18 @@ class CRAWLER:
        }
        self.basic_url = "http://detail.zol.com.cn/index.php?c=SearchList&keyword="

-    def get_zol_attribute_relation(self,zgc_category_code):
+    def get_zol_attribute_relation(self,zgc_categoryname):
        '''
        Fetch the ZOL attribute mapping for this category
        '''
        try:
            zol_rel_data = pd.DataFrame()
            cursor = self.conn.cursor()
-            cursor.execute("SELECT * FROM Product_Relation_Attribute_SubTitle where ZI_SubCategoryCode = '"+zgc_category_code+"' and Source = 'ZOL'")
+            cursor.execute(f"SELECT * FROM p_skusubtitle_out_map where categoryname = '{zgc_categoryname}' and frm = 'ZOL'")
            try:
                data_source = [v for v in cursor.fetchall()]
            except:
-                print(str(zgc_category_code) + " no ZOL mapping for this category's attribute items")
+                print(str(zgc_categoryname) + " no ZOL mapping for this category's attribute items")
                return zol_rel_data
            zol_rel_data = pd.DataFrame(data_source,columns=[tuple[0] for tuple in cursor.description])
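For reference, the lookup that the new query above performs against p_skusubtitle_out_map can also be written with pymssql's parameter substitution instead of an f-string, which avoids quoting problems when a category name contains special characters. This is only an illustrative sketch, not part of the commit; the connection settings and table/column names are copied from the diff, and the function name is made up.

```python
import pymssql

def fetch_zol_attribute_relation(zgc_categoryname):
    # Connection settings copied from the new __init__ in this commit.
    conn = pymssql.connect(host='123.56.115.207', user='zgcprice3311',
                           password='admin@2018@)!*', database='ZI_NEW',
                           autocommit=True)
    try:
        cursor = conn.cursor()
        # %s placeholders let pymssql handle quoting of the category name.
        cursor.execute(
            "SELECT * FROM p_skusubtitle_out_map WHERE categoryname = %s AND frm = %s",
            (zgc_categoryname, 'ZOL'),
        )
        return cursor.fetchall()
    finally:
        conn.close()
```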
@@ -53,17 +53,17 @@ class CRAWLER:
        except:
            print('Connection failed, reconnecting')
-            return self.get_zol_attribute_relation(zgc_category_code)
+            return self.get_zol_attribute_relation(zgc_categoryname)

-    def get_necessary_attrs(self,zgc_category_code):
+    def get_necessary_attrs(self,zgc_categoryname):
        '''
        Fetch the required attribute items for this category
        '''
        try:
            cursor = self.conn.cursor()
-            cursor.execute("select distinct SubTitle from vw_relation_property where SubCategoryCode = '"+zgc_category_code+"' and (ISimportant = 1 or ispeijian = 1)")
+            cursor.execute(f"select subtitle from vw_property where name = '{zgc_categoryname}' and left(identy,1) = 1")
            data_source = [v for v in cursor.fetchall()]
-            attribute_data_list = pd.DataFrame(data_source,columns=[tuple[0] for tuple in cursor.description])['SubTitle'].tolist()
+            attribute_data_list = pd.DataFrame(data_source,columns=[tuple[0] for tuple in cursor.description])['subtitle'].tolist()
            cursor.close()
            self.conn.close()
@@ -72,7 +72,7 @@ class CRAWLER:
        except:
            print('Connection failed, reconnecting')
-            return self.get_necessary_attrs(zgc_category_code)
+            return self.get_necessary_attrs(zgc_categoryname)

    def get_reponse(self,url):
        '''
@@ -164,7 +164,7 @@ class CRAWLER:
        #create zol relationship attribute dict
        dic = dict()
        for attr in self.necessary_attrs:
-            dic[attr] = [v if "," not in str(v) else [v_ for v_ in v.split(",")] for v in self.zol_rel_data[self.zol_rel_data['ZI_SubTitle'] == attr]['Other_SubTitle'].unique().tolist()]
+            dic[attr] = [v if "," not in str(v) else [v_ for v_ in v.split(",")] for v in self.zol_rel_data[self.zol_rel_data['subtitle'] == attr]['outsubtitle'].unique().tolist()]
        #get needed data
        get_data_list = []
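To make the comprehension in this hunk easier to follow, here is a self-contained sketch of what it builds: for each required attribute (subtitle) it collects the distinct ZOL field names (outsubtitle) mapped to it, splitting comma-separated alternatives into sub-lists. The DataFrame rows below are invented stand-ins, not real rows from p_skusubtitle_out_map.

```python
import pandas as pd

# Toy stand-in for self.zol_rel_data; column names follow the new query,
# the row values are made-up examples.
zol_rel_data = pd.DataFrame({
    "subtitle":    ["匹数", "匹数", "变频/定频"],
    "outsubtitle": ["匹数", "制冷量,匹数", "压缩机类型"],
})
necessary_attrs = ["匹数", "变频/定频"]

dic = {}
for attr in necessary_attrs:
    # Same expression as in the diff: keep a single ZOL field name as a string,
    # and split comma-separated alternatives into a sub-list.
    dic[attr] = [
        v if "," not in str(v) else [v_ for v_ in v.split(",")]
        for v in zol_rel_data[zol_rel_data["subtitle"] == attr]["outsubtitle"].unique().tolist()
    ]

print(dic)
# {'匹数': ['匹数', ['制冷量', '匹数']], '变频/定频': ['压缩机类型']}
```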
@@ -200,5 +200,6 @@ class CRAWLER:
if __name__ == '__main__':
-    crawler = CRAWLER('0506')
-    res = crawler.crawl_zol('东芝2823am')
\ No newline at end of file
+    crawler = CRAWLER('空调')
+    res = crawler.crawl_zol('格力(GREE)KFR-72GW/(72558)NHAD-3')
+    print(res)
\ No newline at end of file