Commit 7d8f9a34 authored by rico.liu

Initial commit

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Sep 17 18:06:07 2019
@author: rico
Data ETL
"""
import pymysql
import pymssql
import pandas as pd
import re
import requests
from lxml import etree
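#rough pipeline: pull channel product data from MySQL (productbase + per-channel DB),
#match brand/model against the ZGC standard catalog in SQL Server (ZI_DataBase),
#crawl detail.zol.com.cn to fill attributes for unmatched products,
#then standardize values via the ShuJuZiDian_Cfg data dict and export to Excel.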
def get_db_data_all_categoryName():
'''
get zgc product category name list
'''
conn = pymssql.connect('123.56.115.207','zgcindex','jiayou2017+2018','ZI_DataBase')
cursor = conn.cursor()
cursor.execute("select SubCategoryName from vw_productselect where (state=1 or state=2 or state=4)")
data_source = list(cursor.fetchall())
db_data = pd.DataFrame(data_source,columns=[col[0] for col in cursor.description])
cursor.close()
conn.close()
res = db_data['SubCategoryName'].unique().tolist()
return res
def convert_db_data_to_df(cursor,tableName):
'''
convert db data to dataframe
'''
cursor.execute("select * from "+tableName)
data_source = list(cursor.fetchall())
return pd.DataFrame(data_source,columns=[col[0] for col in cursor.description])
def filter_data_by_product_id(df_id,df_data):
'''
filter data by product_id
'''
res = pd.merge(df_id,df_data,how='left',on = 'product_id')
return res
def get_source_data(channel,category):
'''
get source data
one channel
one category
'''
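#returns four DataFrames (product, product_attr, product_price, product_picture) filtered to the
#requested category, or False when the channel name is not found in channel_info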
#get db name
conn = pymysql.connect(host='59.110.219.171', user='root',password='qwertyuiop1',database='productbase')
cursor = conn.cursor()
cursor.execute("select db_name from channel_info where locate('"+channel+"',channel_name)>0")
db_name = cursor.fetchall()
if not db_name:
print('输入的渠道信息有误')
cursor.close()
conn.close()
return False
else:
db_name = db_name[0][0]
cursor.close()
conn.close()
#get_data
conn = pymysql.connect(host='59.110.219.171', user='root',password='qwertyuiop1',database=db_name)
cursor = conn.cursor()
product_data = convert_db_data_to_df(cursor,'product')
product_attr_data = convert_db_data_to_df(cursor,'product_attr')
product_price_data = convert_db_data_to_df(cursor,'product_price')
product_picture_data = convert_db_data_to_df(cursor,'product_picture')
cursor.close()
conn.close()
product_filter_data = product_data[product_data['channel_product_classify'].str.contains(category)]
product_filter_id = product_filter_data[['product_id']]
product_filter_attr_data = filter_data_by_product_id(product_filter_id,product_attr_data)
product_filter_price_data = filter_data_by_product_id(product_filter_id,product_price_data)
product_filter_picture_data = filter_data_by_product_id(product_filter_id,product_picture_data)
return product_filter_data,product_filter_attr_data,product_filter_price_data,product_filter_picture_data
def get_category_corresponding_relationship(channel):
pass
def convert_category_to_standard(v):
#df = get_category_corresponding_relationship(channel)
pass
def get_attribute_corresponding_relationship(channel,category):
'''
manually create a dict mapping non-standard attributes to standard attributes,
used as a guide when formatting the data
'''
pass
def get_attribute_value(main_df,attr_df,attr_name):
'''
for each product in main_df, look up the value of attr_name in attr_df
'''
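#'价格' is read from the price table; any other attribute is concatenated from rows whose attr_second matches attr_name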
if attr_name == '价格':
attribute_value_list = [attr_df[attr_df['product_id'] == pro_id]['price'].iloc[0] for pro_id in list(main_df['product_id'])]
else:
attribute_value_list = [attr_df[(attr_df['product_id'] == pro_id) & (attr_df['attr_second'] == attr_name)]['value'].str.cat() for pro_id in list(main_df['product_id'])]
return attribute_value_list
def get_basic_info(source_data):
'''
get basic product info to match
'''
product_data,product_attr_data,product_price_data,product_picture_data = source_data
#convert category to standard value
#product_data['zgc_category'] = product_data['channel_product_category'].apply(self.convert_category_to_standard)
#according to the zgc_category to get attribute corresponding relationship and standard value
#zgc_categorys = product_data['zgc_category'].unique().tolist()
#for zgc_category in zgc_categorys:
# df_attr = get_attribute_corresponding_relationship(zgc_category)
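#assemble the fields used for matching: ids, product name, brand, model and price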
basic_info = pd.DataFrame()
basic_info['product_id'] = list(product_data['product_id'])
basic_info['product_name'] = list(product_data['product_name'])
basic_info['channel_product_id'] = list(product_data['channel_product_id'])
basic_info['品牌'] = get_attribute_value(basic_info,product_attr_data,'品牌')
basic_info['产品型号'] = get_attribute_value(basic_info,product_attr_data,'型号')
basic_info['价格'] = get_attribute_value(basic_info,product_price_data,'价格')
return basic_info
def get_db_data_standard_by_attribute(db_attr_data,attr):
'''
according to the standard attribute, get the standard value(s) from the DB
'''
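#'品牌' -> list of standard brand names; '产品型号匹配' -> model rows with product codes; otherwise -> standard values for that attribute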
if attr == '品牌':
return db_attr_data['brandname'].unique().tolist()
elif attr == '产品型号匹配':
return db_attr_data[db_attr_data['subtitle'] == '产品型号'][['productcode','productname','brandname','value']]
else:
return db_attr_data[db_attr_data['subtitle'] == attr]['value'].unique().tolist()
def get_db_data_standard_value(category):
'''
according to the category, get the standard values from the DB
'''
#get zgc category code
conn = pymssql.connect('123.56.115.207','zgcindex','jiayou2017+2018','ZI_DataBase')
cursor = conn.cursor()
cursor.execute("select * from vw_productselect where SubCategoryName in ('"+category+"')and (state=1 or state=2 or state=4)")
data_source = [v for v in cursor.fetchall()]
db_data = pd.DataFrame(data_source,columns=[tuple[0] for tuple in cursor.description])
zgc_category_code_list = db_data['SubCategoryCode'].unique().tolist()
zgc_category_code ="'" + "','".join(zgc_category_code_list) + "'"
#get db data
sql_attr = '''select a.productcode,c.productname,b.subtitle,a.value from info_productdetail a
join info_product c
on a.productcode=c.productcode
join vw_relation_property b
on a.cfgid=b.subtitleid and c.subcategorycode=b.subcategorycode
join zi_price_quote d
on a.productcode=d.productcode
where c.subcategorycode in ('''+zgc_category_code+''') and (c.state=1 or c.state=2 or c.state=4)'''
cursor.execute(sql_attr)
db_attr_data = pd.DataFrame(list(cursor.fetchall()))
db_attr_data.columns=[col[0] for col in cursor.description]
cursor.close()
conn.close()
db_attr_data['brandname'] = [db_data[db_data['ProductCode'] == pid]['BrandName'].str.cat() for pid in list(db_attr_data['productcode'])]
return db_data,db_attr_data
def match_data_with_basic_info(basic_info,category):
'''
according to the category, match the basic info against the ZGC standard data and return the result
'''
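#brand: matched when any token of a DB brand (split on '/') appears in the upper-cased product name
#model: matched by substring containment between the (completed) source model and the DB model strings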
#get db data
db_data,db_attr_data = get_db_data_standard_value(category)
#match brand
db_brand_data = get_db_data_standard_by_attribute(db_attr_data,'品牌')
zgc_brand_list = []
completed_brand_list = []
for brand,product_name in zip(list(basic_info['品牌']),list(basic_info['product_name'])):
brand_flag = 0
for db_brand in db_brand_data:
for db_brand_element in db_brand.split('/'):
if db_brand_element in product_name.upper():
brand_flag = 1
break
if brand_flag == 1:
zgc_brand_list.append(db_brand)
completed_brand_list.append(db_brand)
break
if brand_flag == 0:
zgc_brand_list.append('未匹配上,请补全并标准化')
completed_brand_list.append(brand)
basic_info['zgc_品牌'] = zgc_brand_list
basic_info['completed_品牌'] = completed_brand_list
#match model
db_model_data = get_db_data_standard_by_attribute(db_attr_data,'产品型号匹配')
zgc_model_list = []
completed_model_list = []
'''
for brand,model in zip(list(basic_info['zgc_brand']),list(basic_info['model'])):
if brand == '未匹配上,请补全并标准化':
zgc_model_list.append('未匹配上,请补全并标准化')
continue
model = [num for num in re.findall(re.compile(r'\d+'), model) if len(num) > 2][0]
model_flag = 0
for db_model in db_model_data[db_model_data['brandname'] == brand]['value'].tolist():
#get first model num
if model in db_model:
zgc_model_list.append(db_model)
model_flag = 1
break
if model_flag == 0:
zgc_model_list.append('未匹配上,请补全并标准化')
basic_info['zgc_model'] = zgc_model_list
'''
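#complete a missing model ('null') from alphanumeric tokens of the product name, then compare it with DB models of the matched brand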
for brand,model,product_name in zip(list(basic_info['zgc_品牌']),list(basic_info['产品型号']),list(basic_info['product_name'])):
#complete model
if model == 'null':
product_name = product_name.upper().replace(brand.split('/')[-1].replace(' ',''),'')
model_list =re.findall(r"[A-Za-z0-9]+",product_name)
model_list = [v for v in model_list if len(v) >2]
model = ''.join(model_list)
completed_model_list.append(model)
else:
completed_model_list.append(model)
if brand == '未匹配上,请补全并标准化':
zgc_model_list.append('未匹配上,请标准化')
continue
db_to_source_list = []
source_to_db = []
for db_model in db_model_data[db_model_data['brandname'] == brand]['value'].unique().tolist():
temp_model = ''.join(model.split(' ')).upper()
temp_db_model = ''.join(db_model.split(' '))
if temp_db_model in temp_model:
db_to_source_list.append(db_model)
continue
elif temp_model in temp_db_model or model.upper().strip().split(' ')[-1] in temp_db_model:
source_to_db.append(db_model)
continue
elif 'MODEL' in temp_model:
if len(temp_db_model) == len([m for m in temp_db_model if m in temp_model]):
db_to_source_list.append(db_model)
if db_to_source_list:
#take the longest DB model string that is contained in the source model
zgc_model_list.append(max(db_to_source_list,key=len))
elif source_to_db:
#take the shortest DB model string that contains the source model
zgc_model_list.append(min(source_to_db,key=len))
else:
zgc_model_list.append('未匹配上,请标准化')
basic_info['zgc_产品型号'] = zgc_model_list
basic_info['completed_产品型号'] = completed_model_list
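#flag each row as matched/unmatched and collect every product code whose DB model equals the matched value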
flag_list = []
match_code_list = []
for v in zgc_model_list:
if v == '未匹配上,请标准化':
flag_list.append('0')
match_code_list.append(['无'])
else:
flag_list.append('1')
match_code_list.append([code for model,code in zip(list(db_model_data['value']),list(db_model_data['productcode'])) if v == model])
basic_info['flag_match'] = flag_list
basic_info['match_code'] = match_code_list
return basic_info
def get_match_result(match_result):
'''
according to the match flag, split the data into matched and unmatched rows
'''
match_data = match_result[match_result['flag_match'] == '1'][['channel_product_id','product_name','zgc_品牌','zgc_产品型号','match_code','价格']]
unmatch_data = match_result[match_result['flag_match'] == '0'].reset_index()
return match_data,unmatch_data
def get_db_data_dict(category):
'''
according to the category, get the zgc data dict
'''
conn = pymssql.connect('123.56.115.207','zgcindex','jiayou2017+2018','ZI_DataBase')
cursor = conn.cursor()
#get category code
cursor.execute("select * from vw_productselect where SubCategoryName in ('"+category+"')")
data_source = list(cursor.fetchall())
zgc_category_code_list = pd.DataFrame(data_source,columns=[col[0] for col in cursor.description])['SubCategoryCode'].unique().tolist()
zgc_category_code ="'" + "','".join(zgc_category_code_list) + "'"
cursor.execute("select * from ShuJuZiDian_Cfg where subcategorycode in ("+zgc_category_code+")")
data_source = list(cursor.fetchall())
data_dict = pd.DataFrame(data_source,columns=[col[0] for col in cursor.description])
cursor.close()
conn.close()
return data_dict
def get_multiple_attribute_value(main_df,attr_df,data_dict,attrs,std_attr):
'''
try multiple candidate source attributes to get a value for a standard attribute
'''
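#candidate attrs are tried in order; 'product_name' means scanning the product name for a primitive value from the data dict,
#anything else is looked up in the product_attr table; '无数据' is used when nothing is found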
value_list = []
for i in range(len(main_df)):
pro_id = main_df['product_id'][i]
pro_name = main_df['product_name'][i]
value = ''
for attr in attrs:
if attr == 'product_name':
for unstd_v in data_dict[data_dict['subtitle'] == std_attr]['primitive'].tolist():
if unstd_v.upper() in pro_name.upper():
value = unstd_v
break
else:
v = attr_df[(attr_df['product_id'] == pro_id) & (attr_df['attr_second'] == attr)]['value'].str.cat()
if v != '':
value = v
break
if value:
value_list.append(value)
else:
value_list.append('无数据')
return value_list
def format_and_fill_data(source_data,unmatch_data,data_dict):
'''
fill standard attributes from the source data according to the attribute correspondence
'''
format_data = unmatch_data
product_data,product_attr_data,product_price_data,product_picture_data = source_data
#mapping: standard attribute -> candidate source attributes ('product_name' means parse from the product name); replace this dict for your category
temp_dict = {'标配纸盒容量':['纸盒'],
'产品类型':['product_name'],
'双面器':['双面功能','product_name'],
'双面输稿器':['输稿器','product_name'],
'最大复印尺寸':['纸张幅面','最大幅面','最大原稿尺寸','product_name'],
'产品系列':['product_name'],
'网络打印卡':['网络功能','接口类型'],
'复印分辨率':['复印分辨率'],
'复印速度':['复印速度'],
'打印分辨率':['复印分辨率'],
'质保时间':['服务'],
'连续复印':['product_name'],
'首页复印时间':['product_name']}
for std_attr in temp_dict.keys():
format_data[std_attr] = get_multiple_attribute_value(unmatch_data,product_attr_data,data_dict,temp_dict[std_attr],std_attr)
return format_data
def fill_data_by_crawl(format_data,data_dict,standard_attribute_list):
'''
fill missing attribute values by crawling
'''
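#for rows with a matched brand, crawl Zol with the keyword brand + completed model; rows without a brand get '爬取无数据'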
zgc_brand_list = format_data['zgc_品牌'].tolist()
zgc_model_list = format_data['completed_产品型号'].tolist()
#collect crawled values in a dict keyed by standard attribute instead of exec-created variables
crawl_values = {std_attr:[] for std_attr in standard_attribute_list}
for zgc_brand,zgc_model in zip(zgc_brand_list,zgc_model_list):
if zgc_brand == '未匹配上,请补全并标准化':
print('无品牌信息,无法爬取')
for std_attr in standard_attribute_list:
crawl_values[std_attr].append('爬取无数据')
else:
kw = zgc_brand.split('/')[0] + zgc_model
print(kw)
crawl_data_list = crawl_zol(kw,data_dict,standard_attribute_list)
for std_attr,crawl_data in zip(standard_attribute_list,crawl_data_list):
crawl_values[std_attr].append(crawl_data)
for std_attr in standard_attribute_list:
format_data['crawl_'+std_attr] = crawl_values[std_attr]
#keep the source value when present, otherwise use the crawled value
format_data['completed_'+std_attr] = [crawl_v if source_v == '无数据' else source_v for source_v,crawl_v in zip(list(format_data[std_attr]),list(format_data['crawl_'+std_attr]))]
'''
zgc_brand_list = format_data['zgc_brand'].tolist()
zgc_model_list = format_data['zgc_model'].tolist()
brand_name_list = format_data['brand_name'].tolist()
model_list = format_data['model'].tolist()
crawl_content_list = []
crawl_double_machine = []
for zgc_brand,zgc_model,brand_name,model in zip(zgc_brand_list,zgc_model_list,brand_name_list,model_list):
if zgc_brand == '未匹配上,请补全并标准化' or zgc_model == '未匹配上,请补全并标准化':
brand_name = re.sub("[A-Za-z0-9\!\%\[\]\,\\\\(\)]", "", brand_name).strip()
model = [num for num in re.findall(re.compile(r'\d+'), model) if len(num) > 2][0]
kw = brand_name + model
print(kw)
content, double_machine= crawl_zol(kw)
crawl_content_list.append(content)
crawl_double_machine.append(double_machine)
else:
kw = zgc_brand.split('/')[0] + zgc_model
print(kw)
content, double_machine= crawl_zol(kw)
crawl_content_list.append(content)
crawl_double_machine.append(double_machine)
format_data['crawl_标配纸盒容量'] = crawl_content_list
format_data['zgc_双面器'] = crawl_double_machine
'''
return format_data
def crawl_zol(kw,data_dict,standard_attribute_list):
'''
Site: Zol (detail.zol.com.cn) - crawl attribute values for the given keyword
'''
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
'Referer': 'http://detail.zol.com.cn/',
'DNT': '1',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
'TE': 'Trailers',
'Cookie':'ip_ck=78CD7v3zj7QuODcyOTc0LjE1NTM1ODc1NjQ%3D; zol_index_today_best_close1=today_yes; zol_userid=weixin_716d9jc1; zol_check=2040718347; zol_cipher=fd5cd1e006683322f25e2b9350b5ad1c; zol_sid=52743385; z_pro_city=s_provice%3Dsichuan%26s_city%3Dchengdu; zol_bind_weixin_716d9jc1=1; gr_user_id=4aedd91b-fbef-43ae-8857-e44d1849bdb3; userProvinceId=17; userCityId=386; userCountyId=0; userLocationId=21; realLocationId=21; userFidLocationId=21; lv=1564041560; vn=6; zol_vest_no=weixin_716d9jc1; z_day=izol106129=1&izol101693=1&rdetail=9; gr_session_id_9b437fe8881a7e19=b304517c-a53c-4945-8f7e-e4c67b4963e7; gr_session_id_9b437fe8881a7e19_b304517c-a53c-4945-8f7e-e4c67b4963e7=true; Hm_lvt_ae5edc2bc4fc71370807f6187f0a2dd0=1561707760,1562816362,1564019660,1564044365; visited_subcateId=0|212|48|892; visited_subcateProId=0-0|212-0|48-0|892-0; listSubcateId=0; Adshow=0; Hm_lpvt_ae5edc2bc4fc71370807f6187f0a2dd0=1564045129; visited_serachKw=S262NV.html%7CS262NV%7CSF-S262NV%7CSF-S601D%7CFC-5015AC%7CSF-S261NV; questionnaire_pv=1564012830'
}
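#search Zol for the keyword, open the first result's detail page, then its full parameter page, and scrape attribute/value pairs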
url = "http://detail.zol.com.cn/index.php?c=SearchList&keyword=" + kw #东芝2823am
res = requests.get(url, headers=headers)
html = etree.HTML(res.text)
try:
tag = html.xpath("//*[@class='list-item clearfix']//div[@class='pic-box SP']/a/@href")
basic_url = "http://detail.zol.com.cn"
combine_url = basic_url + tag[0]
except Exception:
#no search result for this keyword
print('抱歉,未找到该产品')
return ['爬取不到数据'] * len(standard_attribute_list)
detail = requests.get(combine_url, headers=headers)
html = etree.HTML(detail.text)
more = html.xpath("//a[@class='_j_MP_more more']/@href")
more_url = basic_url + more[0]
source = requests.get(more_url, headers=headers).text
source = source.replace('<br />','')
html = etree.HTML(source)
#get Zol attribute and value
Zol_data = pd.DataFrame()
attr_list = html.xpath("//span[contains(@id,'newPmName')]/text()")
attr_list = [v.strip() for v in attr_list]
value_list = html.xpath("//span[contains(@id,'newPmVal')]//text()")
value_list = value_list[0:len(attr_list)]
value_list = [v.strip().replace('\n','').replace('\r','') for v in value_list]
Zol_data['attr'] = attr_list
Zol_data['value'] = value_list
#get need data
get_data_list = []
for std_attr in standard_attribute_list:
get_value = ''
for attr,value in zip(attr_list,value_list):
if std_attr in attr or attr in std_attr:
get_value = value
get_data_list.append(get_value)
break
else:
standard_list = data_dict[data_dict['subtitle'] == std_attr]['stdvalue'].unique().tolist()
unstandrd_list = data_dict[data_dict['subtitle'] == std_attr]['primitive'].unique().tolist()
for std_v,unstd_v in zip(standard_list,unstandrd_list):
if unstd_v in value.upper() or std_v in value.upper():
get_value = value
get_data_list.append(get_value)
break
if get_value == '':
get_data_list.append("爬取不到数据")
#one crawled value per standard attribute (truncate defensively in case of duplicate matches)
return get_data_list[:len(standard_attribute_list)]
'''
standard_value_by_dict
'''
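#map a completed (non-standard) value to the standard value via the data dict's primitive -> stdvalue pairs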
def transform_by_dict(data_dict,target_data,target_column):
match_v = data_dict[data_dict['subtitle'] == target_column]['primitive'].apply(lambda x:''.join(x.split(' ')).upper()).tolist()
res = []
for v in target_data['completed_' + target_column].tolist():
v = ''.join(v.split(' ')).upper()
try:
res.append(data_dict[data_dict['subtitle'] == target_column]['stdvalue'].tolist()[match_v.index(v)])
except ValueError:
res.append('未匹配数据字典')
return res
def standard_value_by_dict(data_dict,format_crawled_data):
df = format_crawled_data[[v for v in format_crawled_data.columns.tolist() if 'complete' in v]]
for name in df.columns.tolist():
col_name = name.split('_',1)[1]
format_crawled_data['zgc_' + col_name] = transform_by_dict(data_dict,df,col_name)
return_col_list = ['product_id','product_name','channel_product_id','价格'] + [v for v in format_crawled_data.columns.tolist() if 'zgc' in v or 'completed' in v]
return format_crawled_data[return_col_list]
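#driver: run the pipeline for channel '安徽' (Anhui) and category '复印机' (copier)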
source_data = get_source_data('安徽','复印机')
basic_info = get_basic_info(source_data)
match_result = match_data_with_basic_info(basic_info,'复印机')
match_data,unmatch_data = get_match_result(match_result)
match_data.to_excel('match_data.xlsx')
data_dict = get_db_data_dict('复印机')
standard_attribute_list = [v for v in data_dict['subtitle'].apply(lambda x:x.strip()).unique().tolist() if v not in unmatch_data.columns.tolist()]
format_data = format_and_fill_data(source_data,unmatch_data,data_dict)
format_crawled_data = fill_data_by_crawl(format_data,data_dict,standard_attribute_list)
res = standard_value_by_dict(data_dict,format_crawled_data)
res.to_excel('res_data.xlsx')