Commit 8d7a546a authored by rico.liu's avatar rico.liu

add crawl params and pic

parent 2ebdc9ca
...@@ -363,6 +363,62 @@ def GetCollectData(batch,channel_alias): ...@@ -363,6 +363,62 @@ def GetCollectData(batch,channel_alias):
return df_db return df_db
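# Crawl link parameters and pictures (for now only one link per product is supported; when a product has several links, the first link's info is used)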
def _build_crawl_payload(df, price_source_dict):
    """Return the JSON string sent as the ``dataList`` form field.

    For every row the first entry of the ``url`` list and the Chinese alias
    of the first entry of the ``url_source`` list are paired up.
    """
    import json  # local import so the block does not depend on file-level imports

    # NOTE(review): `url` / `url_source` hold stringified lists and are parsed
    # with eval(); they appear to come from our own database. If they can ever
    # contain untrusted text, switch to ast.literal_eval.
    pairs = [
        [str(eval(url)[0]), str(price_source_dict[eval(source)[0]])]
        for url, source in zip(df['url'].tolist(), df['url_source'].tolist())
    ]
    # json.dumps yields the same text as the former str(...).replace("'", '"')
    # for ordinary values, but stays valid JSON when a value contains quotes
    # or backslashes. ensure_ascii=False keeps Chinese channel names verbatim.
    return json.dumps({'data': pairs}, ensure_ascii=False)


def _store_crawl_results(df, cursor_zi_service):
    """Write ``url_params`` / ``url_pic`` of every row back to ``product_all``."""
    # Progress indicator (project helper).
    index_ = Index()
    total = len(df)
    for counter, (_, row) in enumerate(df.iterrows(), start=1):
        try:
            print(index_(counter, total - 1), end='%')
        except Exception:
            # Presumably a one-row frame makes the denominator zero; fall back
            # to a denominator of 1 as before. Unlike a bare `except`, this
            # lets KeyboardInterrupt / SystemExit propagate.
            print(index_(counter, 1), end='%')
        id_ = row['id']
        # Double single quotes so the values survive inside the SQL literal.
        url_pic = row['url_pic'].replace("'", "''")
        url_params = row['url_params'].replace("'", "''")
        # NOTE(review): SQL is assembled by string formatting. Prefer a
        # parameterised query once the driver's placeholder style is confirmed.
        cursor_zi_service.execute(f"update product_all set url_params = '{url_params}',url_pic = '{url_pic}' where id = {id_}")
    print('爬去数据存储完成')


def GetParamsinfoAndPic(df):
    """Crawl parameter and picture info for each product link and persist it.

    Only one link per product is supported for now: when a product has
    several links, the first one is used.

    Args:
        df: DataFrame with the columns ``id``, ``url`` and ``url_source``.
            ``url`` and ``url_source`` hold stringified lists.

    Returns:
        The same DataFrame, mutated in place, with two added string columns:
        ``url_pic`` (stringified ``img_list`` of the crawl result) and
        ``url_params`` (stringified dict of the cleaned ``class_list`` plus
        the crawled link under the key ``爬取链接``).

    Side effects:
        Posts the links to the crawl service and updates ``url_params`` /
        ``url_pic`` in ``product_all`` for every row. Both database
        connections are closed even when an error occurs.
    """
    # Address of the crawl service.
    request_url = "http://59.110.219.171:8092/return_data"

    mssql_new = MSSQL('123.56.115.207', 'ZI_NEW')
    try:
        cursor_zi_new = mssql_new._cur
        mssql = MSSQL('123.57.45.119', 'ZI_Service')
        try:
            cursor_zi_service = mssql._cur

            # Price-channel dictionary: channel code -> Chinese channel alias.
            cursor_zi_new.execute(f"select channel_alias_cn,channel_alias_code from zdindex_channel_rel")
            price_source = pd.DataFrame(
                cursor_zi_new.fetchall(),
                columns=[col[0] for col in cursor_zi_new.description],
            )
            price_source_dict = dict(zip(
                price_source['channel_alias_code'].tolist(),
                price_source['channel_alias_cn'].tolist(),
            ))

            # Send all links in one request.
            payload = {'dataList': _build_crawl_payload(df, price_source_dict)}
            response = requests.request("POST", request_url, data=payload)
            # NOTE(review): eval() on a network response executes whatever the
            # service returns. Kept because the response format is not
            # visible here; replace with json.loads / ast.literal_eval once
            # the format is confirmed.
            res = eval(response.text)

            # Results are assigned positionally, i.e. the service is assumed
            # to answer in request order - TODO confirm.
            df['url_pic'] = [str(element['img_list']) for element in res]

            url_params_list = []
            for element in res:
                # Drop empty key/value pairs and all spaces from the
                # stringified dict before turning it back into a dict.
                cleaned = (
                    str(element['class_list'])
                    .replace("'': ''", "")
                    .replace(", ,", ",")
                    .replace("{,", "{")
                    .replace(" ", "")
                )
                params_dict = eval(cleaned)
                params_dict.update({'爬取链接': element['url']})
                url_params_list.append(str(params_dict))
            df['url_params'] = url_params_list

            # Persist the crawled info.
            _store_crawl_results(df, cursor_zi_service)
        finally:
            mssql.Close()
    finally:
        mssql_new.Close()
    return df
#解析重点类产品数据并导出 #解析重点类产品数据并导出
def GetCollectDataDetail(df,channel_alias,batch): def GetCollectDataDetail(df,channel_alias,batch):
...@@ -419,7 +475,8 @@ def GetCollectDataDetail(df,channel_alias,batch): ...@@ -419,7 +475,8 @@ def GetCollectDataDetail(df,channel_alias,batch):
writer = pd.ExcelWriter(f"{channel_alias}建库产品参数确认{batch}.xlsx") writer = pd.ExcelWriter(f"{channel_alias}建库产品参数确认{batch}.xlsx")
index = 0
for category in df['zi_subcategoryname'].unique().tolist(): for category in df['zi_subcategoryname'].unique().tolist():
#获取每一个品类的dataframe #获取每一个品类的dataframe
...@@ -430,6 +487,8 @@ def GetCollectDataDetail(df,channel_alias,batch): ...@@ -430,6 +487,8 @@ def GetCollectDataDetail(df,channel_alias,batch):
single_subtitle_df = subtitle_df[subtitle_df['name'] == category] single_subtitle_df = subtitle_df[subtitle_df['name'] == category]
#获取这个类的必填属性规格(CPU属性无需填写,系统在建库时自动给出) #获取这个类的必填属性规格(CPU属性无需填写,系统在建库时自动给出)
requier_param_list = single_subtitle_df[single_subtitle_df['require_param'] == '1']['subtitle'].tolist() requier_param_list = single_subtitle_df[single_subtitle_df['require_param'] == '1']['subtitle'].tolist()
#获取这个类的参数项对应关系
single_subtitle_map_df = subtitle_map_df[subtitle_map_df['categoryname'] == category]
#获取这个类的非必填 #获取这个类的非必填
non_requier_param_list = [] non_requier_param_list = []
...@@ -445,8 +504,10 @@ def GetCollectDataDetail(df,channel_alias,batch): ...@@ -445,8 +504,10 @@ def GetCollectDataDetail(df,channel_alias,batch):
pass pass
requier_param_list = [str(param) + "(*)" for param in requier_param_list] requier_param_list = [str(param) + "(*)" for param in requier_param_list]
#将爬取信息放到最后
param_list_all = requier_param_list + non_requier_param_list param_list_all = requier_param_list + non_requier_param_list
param_list_all.remove("原始参数及链接信息")
param_list_all.append("原始参数及链接信息")
id_list = list() id_list = list()
name_list = list() name_list = list()
...@@ -455,12 +516,37 @@ def GetCollectDataDetail(df,channel_alias,batch): ...@@ -455,12 +516,37 @@ def GetCollectDataDetail(df,channel_alias,batch):
param_list = list() param_list = list()
value_list = list() value_list = list()
new_name_list = list() new_name_list = list()
#实例化进度条
index_ = Index()
counter = 1
for index,row in cat_df.iterrows(): for index,row in cat_df.iterrows():
try:
print(index_(counter, len(cat_df)-1), end='%')
except:
print(index_(counter, 1), end='%')
counter += 1
id_ = row['id'] id_ = row['id']
name = row['name'] name = row['name']
zi_brandname = row['zi_brandname'] zi_brandname = row['zi_brandname']
zi_subcategoryname = row['zi_subcategoryname'] zi_subcategoryname = row['zi_subcategoryname']
url_params = eval(row['url_params'])
std_key_list = []
std_value_list = []
for key in url_params.keys():
judge_df = single_subtitle_map_df[single_subtitle_map_df['outsubtitle'] == key][['subtitle']]
if judge_df.empty:
continue
else:
std_key_list.append(judge_df['subtitle'].tolist()[0])
std_value_list.append(url_params[key])
std_url_params = dict(zip(std_key_list,std_value_list))
for param in param_list_all: for param in param_list_all:
...@@ -470,9 +556,22 @@ def GetCollectDataDetail(df,channel_alias,batch): ...@@ -470,9 +556,22 @@ def GetCollectDataDetail(df,channel_alias,batch):
zi_brandname_list.append(zi_brandname) zi_brandname_list.append(zi_brandname)
zi_subcategoryname_list.append(zi_subcategoryname) zi_subcategoryname_list.append(zi_subcategoryname)
param_list.append(param) param_list.append(param)
value_list.append('')
new_name_list.append('') new_name_list.append('')
if param == '原始参数及链接信息':
value = row['url_params']
else:
try:
value = url_params[param]
except:
try:
value = std_url_params[param]
except:
value = ''
value_list.append(value)
export_df = pd.DataFrame() export_df = pd.DataFrame()
export_df['id'] = id_list export_df['id'] = id_list
export_df['name'] = name_list export_df['name'] = name_list
...@@ -484,6 +583,8 @@ def GetCollectDataDetail(df,channel_alias,batch): ...@@ -484,6 +583,8 @@ def GetCollectDataDetail(df,channel_alias,batch):
category = category.replace('/','_') category = category.replace('/','_')
export_df.to_excel(writer,f"{category}参数数据") export_df.to_excel(writer,f"{category}参数数据")
print(f"完成{category}参数整理")
writer.save() writer.save()
mssql.Close() mssql.Close()
...@@ -1713,8 +1814,11 @@ UpdateBasicData(path) ...@@ -1713,8 +1814,11 @@ UpdateBasicData(path)
#获取建库数据 #获取建库数据
df = GetCollectData(batch,channel_alias) df = GetCollectData(batch,channel_alias)
# Crawl link parameter info and pictures
df_crawled = GetParamsinfoAndPic(df)
#导出建库数据参数补充 #导出建库数据参数补充
GetCollectDataDetail(df,channel_alias,batch) GetCollectDataDetail(df_crawled,channel_alias,batch)
#处理建库数据 #处理建库数据
...@@ -1749,14 +1853,6 @@ cursor_zi_service = mssql._cur ...@@ -1749,14 +1853,6 @@ cursor_zi_service = mssql._cur
#入库 #入库
id_list = df['id'].tolist() id_list = df['id'].tolist()
source_name_list = df['new_name'].tolist()
price_list = df['url_price'].apply(lambda x:eval(x)[0]).tolist()
url_list = df['url'].apply(lambda x:eval(x)[0]).tolist()
channelId_list = df['url_source'].apply(lambda x:eval(x)[0]).tolist()
brand_list = df['zi_brandname'].tolist() brand_list = df['zi_brandname'].tolist()
brandId_list = df['zi_brandcode'].apply(lambda x:str(x)).tolist() brandId_list = df['zi_brandcode'].apply(lambda x:str(x)).tolist()
...@@ -1765,10 +1861,15 @@ category_list = df['zi_subcategoryname'].tolist() ...@@ -1765,10 +1861,15 @@ category_list = df['zi_subcategoryname'].tolist()
categoryId_list = df['zi_subcategorycode'].apply(lambda x:str(x)).tolist() categoryId_list = df['zi_subcategorycode'].apply(lambda x:str(x)).tolist()
params_list = df['params_standard'].apply(lambda x: eval(x)).tolist()
name_list = df['new_name'].apply(lambda x: x.replace("'","''")).tolist() name_list = df['new_name'].apply(lambda x: x.replace("'","''")).tolist()
params_list = []
for params_standard,url_params in zip(df['params_standard'].tolist(),df['url_params'].tolist()):
params_standard_dict = eval(params_standard)
params_standard_dict.update({'原始参数及链接信息':url_params})
params_list.append(params_standard_dict)
data = { data = {
"params_info": { "params_info": {
"brand_list": brand_list, "brand_list": brand_list,
...@@ -1786,6 +1887,31 @@ sku_list = res['sku_list'] ...@@ -1786,6 +1887,31 @@ sku_list = res['sku_list']
for _id,sku in zip(id_list,sku_list): for _id,sku in zip(id_list,sku_list):
cursor_zi_service.execute(f"update product_all set productcode = '{sku}',remark = Null,state = '9' where id = {_id}") cursor_zi_service.execute(f"update product_all set productcode = '{sku}',remark = Null,state = '9' where id = {_id}")
df['productcode'] = sku_list
#组织价格数据
sku_list = []
source_name_list = []
price_list = []
url_list = []
channelId_list = []
for index,row in df.iterrows():
sku_list_temp = row['productcode']
source_name_list_temp = row['new_name'].replace("'","''")
url_list_temp = eval(row['url'])
channelId_list_temp = eval(row['url_source'])
price_list_temp = eval(row['url_price'])
for url,channelId,price in zip(url_list_temp,channelId_list_temp,price_list_temp):
sku_list.append(sku_list_temp)
source_name_list.append(source_name_list_temp)
url_list.append(url)
channelId_list.append(channelId)
price_list.append(price)
#价格关系入库 #价格关系入库
data = { data = {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment