Commit b1bdbbe0 authored by rico.liu's avatar rico.liu

update commit

parent e9e5bebd
......@@ -5,7 +5,7 @@ Created on Tue Sep 17 18:06:07 2019
@author: rico
Data ETL
Data clean
"""
import pymysql
......@@ -14,24 +14,72 @@ import pandas as pd
import re
import requests
from lxml import etree
from urllib.parse import quote
def get_db_data_all_categoryName():
def get_attribute_relation(channel,category):
'''
get zgc product category name list
'''
#channel = '河南'
#category = '一体机'
conn = pymssql.connect('123.56.115.207','zgcindex','jiayou2017+2018','ZI_DataBase')
cursor = conn.cursor()
cursor.execute("SELECT * FROM Product_Relation_Attribute_SubTitle where SourceSubCategory = '"+category+"' and Source = '"+channel+"'")
data_source = [v for v in cursor.fetchall()]
rel_data = pd.DataFrame(data_source,columns=[tuple[0] for tuple in cursor.description])
return rel_data
def get_zgc_name_and_code(rel_data):
    '''
    Resolve the ZGC (index DB) sub-category name and code for a relation table.

    rel_data : DataFrame with a 'ZI_SubCategoryCode' column; only the first
    row's code is consulted.
    Returns a (zgc_name, zgc_code) tuple.
    '''
    code_to_name = get_db_data_all_categoryInfo()
    first_code = rel_data['ZI_SubCategoryCode'].iloc[0]
    # The dict maps code -> [name]; take the single name out of the list.
    return code_to_name[first_code][0], first_code
def get_db_data_all_categoryInfo():
'''
get zgc product category name list and code list
'''
conn = pymssql.connect('123.56.115.207','zgcindex','jiayou2017+2018','ZI_DataBase')
cursor = conn.cursor()
cursor.execute("select SubCategoryName from vw_productselect where (state=1 or state=2 or state=4)")
cursor.execute("select SubCategoryName,SubCategoryCode from vw_productselect")
data_source = [v for v in cursor.fetchall()]
db_data = pd.DataFrame(data_source,columns=[tuple[0] for tuple in cursor.description])
cursor.close()
conn.close()
res = db_data['SubCategoryName'].unique().tolist()
res = db_data[['SubCategoryName','SubCategoryCode']].drop_duplicates().reset_index()
del res['index']
dict_res = res.set_index('SubCategoryCode').T.to_dict('list')
return res
return dict_res
def get_attribute_rel_data(zgc_category_code, rel_data):
    '''
    Fetch all attributes of a ZGC sub-category and merge in the channel
    attribute-name mapping.

    zgc_category_code : ZGC sub-category code, e.g. '0504'.
    rel_data : DataFrame holding 'ZI_SubTitle'/'Other_SubTitle' mapping columns.

    Returns the attribute rows that either have a channel mapping
    (ZI_SubTitle not null after the left merge) or are flagged important
    (ISimportant == 1).
    '''
    # NOTE(review): credentials are hard-coded in several helpers in this
    # file; they should be moved to configuration.
    conn = pymssql.connect('123.56.115.207', 'zgcindex', 'jiayou2017+2018', 'ZI_DataBase')
    try:
        cursor = conn.cursor()
        # Parameterized query instead of string concatenation (SQL-injection safe).
        cursor.execute(
            "select * from vw_relation_property where SubCategoryCode = %s",
            (zgc_category_code,))
        rows = cursor.fetchall()
        columns = [col[0] for col in cursor.description]
        cursor.close()
    finally:
        # Close the connection even if the query fails (original leaked it on error).
        conn.close()
    attribute_data = pd.DataFrame(rows, columns=columns)
    merge_data = pd.merge(attribute_data,
                          rel_data[['ZI_SubTitle', 'Other_SubTitle']],
                          how='left', left_on='SubTitle', right_on='ZI_SubTitle')
    # Keep attributes that are mapped to a channel name or marked important.
    merge_data = merge_data[merge_data['ZI_SubTitle'].notna() | (merge_data['ISimportant'] == 1)]
    return merge_data
def convert_db_data_to_df(cursor,tableName):
......@@ -53,7 +101,7 @@ def filter_data_by_product_id(df_id,df_data):
return res
def get_source_data(channel,category):
def get_source_data(channel,category,zgc_categoty_name):
'''
get source data
one channel
......@@ -86,7 +134,7 @@ def get_source_data(channel,category):
cursor.close()
conn.close()
product_filter_data = product_data[product_data['channel_product_classify'].str.contains(category)]
product_filter_data = product_data[product_data['channel_product_classify'].str.contains(category+"|"+zgc_categoty_name)]
product_filter_id = product_filter_data[['product_id']]
product_filter_attr_data = filter_data_by_product_id(product_filter_id,product_attr_data)
product_filter_price_data = filter_data_by_product_id(product_filter_id,product_price_data)
......@@ -95,36 +143,41 @@ def get_source_data(channel,category):
return product_filter_data,product_filter_attr_data,product_filter_price_data,product_filter_picture_data
def get_category_corresponding_relationship(channel):
    '''Placeholder — not implemented (presumably a channel-to-standard category lookup; confirm intent).'''
    pass
def convert_category_to_standard(v):
    '''Placeholder — not implemented.'''
    #df = get_category_corresponding_relationship(channel)
    pass
def get_attribute_corresponding_relationship(channel,category):
    '''
    Placeholder — not implemented.
    Intent per the original note: a hand-built dict mapping non-standard
    attribute names to standard ones, used to guide data formatting.
    '''
    pass
def get_attribute_value(main_df,attr_df,attr_name):
def get_attribute_value(main_df,attr_df,attribute_rel_data,attr_name):
'''
according to the main_df to get value with attr name in attr_df
'''
if attr_name == '价格':
attribute_value_list = [attr_df[attr_df['product_id'] == pro_id]['price'].iloc[0] for pro_id in list(main_df['product_id'])]
elif attr_name == '品牌':
attribute_value_list = []
for pro_id,pro_name in zip(list(main_df['product_id']),list(main_df['product_name'])):
v = attr_df[(attr_df['product_id'] == pro_id) & (attr_df['attr_second'].str.contains('品牌'))]['value'].str.cat()
if v:
attribute_value_list.append(v)
else:
attribute_value_list.append(pro_name)
else:
attr_name_list = [str(unstd_v).split(',')+[flag] for std_v,unstd_v,flag in zip(list(attribute_rel_data['SubTitle']),list(attribute_rel_data['Other_SubTitle']),list(attribute_rel_data['ISimportant'])) if attr_name == std_v][0]
attribute_value_list = []
for pro_id,pro_name in zip(list(main_df['product_id']),list(main_df['product_name'])):
for i in range(len(attr_name_list)-1):
if attr_name_list[i] == 'nan':
v = pro_name
break
else:
attribute_value_list = [attr_df[(attr_df['product_id'] == pro_id) & (attr_df['attr_second'] == attr_name)]['value'].str.cat() for pro_id in list(main_df['product_id'])]
v = attr_df[(attr_df['product_id'] == pro_id) & (attr_df['attr_second'].str.contains(attr_name_list[i]))]['value'].str.cat()
if v:
print(pro_id,pro_name,attr_name[i],v)
break
if v:
attribute_value_list.append(v)
else:
attribute_value_list.append(pro_name)
return attribute_value_list
def get_basic_info(source_data):
def get_basic_info(source_data,attribute_rel_data):
'''
get basic product info to match
'''
......@@ -141,10 +194,11 @@ def get_basic_info(source_data):
basic_info = pd.DataFrame()
basic_info['product_id'] = list(product_data['product_id'])
basic_info['product_name'] = list(product_data['product_name'])
basic_info['channel_product_classify'] = list(product_data['channel_product_classify'])
basic_info['channel_product_id'] = list(product_data['channel_product_id'])
basic_info['品牌'] = get_attribute_value(basic_info,product_attr_data,'品牌')
basic_info['产品型号'] = get_attribute_value(basic_info,product_attr_data,'型号')
basic_info['价格'] = get_attribute_value(basic_info,product_price_data,'价格')
basic_info['品牌'] = get_attribute_value(basic_info,product_attr_data,attribute_rel_data,'品牌')
basic_info['产品型号'] = get_attribute_value(basic_info,product_attr_data,attribute_rel_data,'产品型号')
basic_info['价格'] = get_attribute_value(basic_info,product_price_data,attribute_rel_data,'价格')
return basic_info
......@@ -156,25 +210,20 @@ def get_db_data_standard_by_attribute(db_attr_data,attr):
if attr == '品牌':
return db_attr_data['brandname'].unique().tolist()
elif attr == '产品型号匹配':
return db_attr_data[db_attr_data['subtitle'] == '产品型号'][['productcode','productname','brandname','value']]
return db_attr_data[(db_attr_data['subtitle'] == '产品型号' ) | (db_attr_data['subtitle'] == '型号' )][['productcode','productname','brandname','value']]
else:
return db_attr_data[db_attr_data['subtitle'] == attr]['value'].unique().tolist()
def get_db_data_standard_value(category):
def get_db_data_standard_value(zgc_category_code):
'''
according to the category to get stanard value from DB
'''
#get zgc category code
conn = pymssql.connect('123.56.115.207','zgcindex','jiayou2017+2018','ZI_DataBase')
cursor = conn.cursor()
cursor.execute("select * from vw_productselect where SubCategoryName in ('"+category+"')and (state=1 or state=2 or state=4)")
cursor.execute("select * from vw_productselect where SubCategoryCode = '"+zgc_category_code+"' and (state=1 or state=2 or state=4)")
data_source = [v for v in cursor.fetchall()]
db_data = pd.DataFrame(data_source,columns=[tuple[0] for tuple in cursor.description])
zgc_category_code_list = db_data['SubCategoryCode'].unique().tolist()
zgc_category_code ="'" + "','".join(zgc_category_code_list) + "'"
#get db data
sql_attr = '''select a.productcode,c.productname,b.subtitle,a.value from info_productdetail a
join info_product c
......@@ -196,12 +245,12 @@ def get_db_data_standard_value(category):
return db_data,db_attr_data
def match_data_with_basic_info(basic_info,category):
def match_data_with_basic_info(basic_info,zgc_category_code):
'''
according to the category,return different data
'''
#get db data
db_data,db_attr_data = get_db_data_standard_value(category)
db_data,db_attr_data = get_db_data_standard_value(zgc_category_code)
#match brand
db_brand_data = get_db_data_standard_by_attribute(db_attr_data,'品牌')
......@@ -209,11 +258,11 @@ def match_data_with_basic_info(basic_info,category):
zgc_brand_list = []
completed_brand_list = []
for brand,product_name in zip(list(basic_info['品牌']),list(basic_info['product_name'])):
for brand,product_name,classify in zip(list(basic_info['品牌']),list(basic_info['product_name']),list(basic_info['channel_product_classify'])):
brand_flag = 0
for db_brand in db_brand_data:
for db_brand_element in db_brand.split('/'):
if db_brand_element in product_name.upper():
if db_brand_element in product_name.upper() or db_brand_element in classify.upper():
brand_flag = 1
break
......@@ -252,9 +301,9 @@ def match_data_with_basic_info(basic_info,category):
for brand,model,product_name in zip(list(basic_info['zgc_品牌']),list(basic_info['产品型号']),list(basic_info['product_name'])):
#complete model
if model == 'null':
if model == product_name:#model == 'null'
product_name = product_name.upper().replace(brand.split('/')[-1].replace(' ',''),'')
model_list =re.findall(r"[A-Za-z0-9]+",product_name)
model_list =re.findall(r"[A-Za-z0-9-]+",product_name)
model_list = [v for v in model_list if len(v) >2]
model = ''.join(model_list)
......@@ -322,17 +371,12 @@ def get_match_result(match_result):
return match_data,unmatch_data
def get_db_data_dict(category):
def get_db_data_dict(zgc_category_code):
'''
according to the category get zgc data dict
'''
conn = pymssql.connect('123.56.115.207','zgcindex','jiayou2017+2018','ZI_DataBase')
cursor = conn.cursor()
#get category code
cursor.execute("select * from vw_productselect where SubCategoryName in ('"+category+"')")
data_source = [v for v in cursor.fetchall()]
zgc_category_code_list = pd.DataFrame(data_source,columns=[tuple[0] for tuple in cursor.description])['SubCategoryCode'].unique().tolist()
zgc_category_code ="'" + "','".join(zgc_category_code_list) + "'"
cursor.execute("select * from ShuJuZiDian_Cfg where subcategorycode in ("+zgc_category_code+")")
data_source = [v for v in cursor.fetchall()]
......@@ -341,27 +385,35 @@ def get_db_data_dict(category):
return data_dict
def get_multiple_attribute_value(main_df,attr_df,data_dict,attrs,std_attr):
def get_multiple_attribute_value(unmatch_data,product_attr_data,data_dict,attrs,std_attr):
'''
according to the multiple attribute to get value
'''
value_list = []
for i in range(len(main_df)):
pro_id = main_df['product_id'][i]
pro_name = main_df['product_name'][i]
for i in range(len(unmatch_data)):
pro_id = unmatch_data['product_id'][i]
pro_name = unmatch_data['product_name'][i]
value = ''
for attr in attrs:
if attr == 'product_name':
for unstd_v in data_dict[data_dict['subtitle'] == std_attr]['primitive'].tolist():
if unstd_v.upper() in pro_name.upper():
value = unstd_v
if isinstance(attr,list):
for attr_ in attr:
v = product_attr_data[(product_attr_data['product_id'] == pro_id) & (product_attr_data['attr_second'] == attr_)]['value'].str.cat()
if v != '':
value = v
break
if value:
break
else:
v = attr_df[(attr_df['product_id'] == pro_id) & (attr_df['attr_second'] == attr)]['value'].str.cat()
v = product_attr_data[(product_attr_data['product_id'] == pro_id) & (product_attr_data['attr_second'] == attr)]['value'].str.cat()
if v != '':
value = v
break
if value == '':
for unstd_v in data_dict[data_dict['subtitle'] == std_attr]['primitive'].tolist():
if unstd_v.upper() in pro_name.upper():
value = unstd_v
break
if value:
value_list.append(value)
else:
......@@ -370,14 +422,22 @@ def get_multiple_attribute_value(main_df,attr_df,data_dict,attrs,std_attr):
return value_list
def format_and_fill_data(source_data,unmatch_data,data_dict):
def format_and_fill_data(attribute_rel_data,source_data,unmatch_data,data_dict):
'''
according to the attribute corresponding relationship
'''
format_data = unmatch_data
product_data,product_attr_data,product_price_data,product_picture_data = source_data
#please cover your dict
#create attribute dict
attr_list = attribute_rel_data['SubTitle'].unique().tolist()
dic = dict()
for attr in attr_list:
dic[attr] = [v if "," not in str(v) else [v_ for v_ in v.split(",")] for v in attribute_rel_data[attribute_rel_data['SubTitle'] == attr]['Other_SubTitle'].tolist()]
'''
temp_dict = {'标配纸盒容量':['纸盒'],
'产品类型':['product_name'],
'双面器':['双面功能','product_name'],
......@@ -391,42 +451,68 @@ def format_and_fill_data(source_data,unmatch_data,data_dict):
'质保时间':['服务'],
'连续复印':['product_name'],
'首页复印时间':['product_name']}
'''
for std_attr in temp_dict.keys():
format_data[std_attr] = get_multiple_attribute_value(unmatch_data,product_attr_data,data_dict,temp_dict[std_attr],std_attr)
for std_attr in dic.keys():
if std_attr == '产品型号':
continue
del_std_attr = ''
if '(' in std_attr or '(' in std_attr or '+' in std_attr:
del_std_attr = std_attr.replace('(','').replace(')','').replace('(','').replace(')','').replace('+','')
else:
del_std_attr = std_attr
format_data[del_std_attr] = get_multiple_attribute_value(unmatch_data,product_attr_data,data_dict,dic[std_attr],std_attr)
return format_data
def fill_data_by_crawl(format_data,data_dict,standard_attribute_list):
def fill_data_by_crawl(format_data,data_dict,necessary_attrs,zgc_categoty_name,zgc_category_code):
'''
fill attribute value
'''
zgc_brand_list = format_data['zgc_品牌'].tolist()
zgc_model_list = format_data['completed_产品型号'].tolist()
source_model_list = format_data['产品型号'].tolist()
for std_attr in standard_attribute_list:
for std_attr in necessary_attrs:
if '(' in std_attr or '(' in std_attr or '+' in std_attr:
std_attr = std_attr.replace('(','').replace(')','').replace('(','').replace(')','').replace('+','')
exec("crawl_%s=[]"%std_attr)
for zgc_brand,zgc_model in zip(zgc_brand_list,zgc_model_list):
for zgc_brand,zgc_model,source_model in zip(zgc_brand_list,zgc_model_list,source_model_list):
if zgc_brand == '未匹配上,请补全并标准化':
print('无品牌信息,无法爬取')
for std_attr in standard_attribute_list:
exec("crawl_%s.append('爬取无数据')"%std_attr)
for std_attr in necessary_attrs:
if '(' in std_attr or '(' in std_attr or '+' in std_attr:
std_attr = std_attr.replace('(','').replace(')','').replace('(','').replace(')','').replace('+','')
exec("crawl_%s.append('无品牌信息,无法爬取')"%std_attr)
else:
kw = zgc_brand.split('/')[0] + zgc_model
print(kw)
crawl_data_list = crawl_zol(kw,data_dict,standard_attribute_list)
for std_attr,crwal_data in zip(standard_attribute_list,crawl_data_list):
source_model_list =re.findall(r"[A-Za-z0-9-+]+",source_model)
source_model = ' '.join(source_model_list)
kw_list = []
kw_zgc = zgc_brand.split('/')[0] + zgc_model
kw_source = zgc_brand.split('/')[0] + source_model
kw_list.append(kw_zgc)
kw_list.append(kw_source)
print(kw_list)
crawl_data_list = crawl_zol(kw_list,data_dict,necessary_attrs,zgc_categoty_name,zgc_category_code)
for std_attr,crwal_data in zip(necessary_attrs,crawl_data_list):
if '(' in std_attr or '(' in std_attr or '+' in std_attr:
std_attr = std_attr.replace('(','').replace(')','').replace('(','').replace(')','').replace('+','')
exec("crawl_%s.append('%s')"%(std_attr,crwal_data))
for std_attr in standard_attribute_list:
for std_attr in necessary_attrs:
if '(' in std_attr or '(' in std_attr or '+' in std_attr:
std_attr = std_attr.replace('(','').replace(')','').replace('(','').replace(')','').replace('+','')
exec("format_data['crawl_%s']=crawl_%s"%(std_attr,std_attr))
for std_attr in standard_attribute_list:
exec("format_data['completed_%s']=[crwal_data if source_data == '无数据' else source_data for source_data,crwal_data in zip(list(format_data['%s']),list(format_data['crawl_%s']))]"%(std_attr,std_attr,std_attr))
for std_attr in necessary_attrs:
if '(' in std_attr or '(' in std_attr or '+' in std_attr:
std_attr = std_attr.replace('(','').replace(')','').replace('(','').replace(')','').replace('+','')
exec("format_data['completed_%s']=[crwal_data if source_data == '无数据' or source_data == '-' or source_data == ':' or source_data == ':' or source_data == '/' else source_data for source_data,crwal_data in zip(list(format_data['%s']),list(format_data['crawl_%s']))]"%(std_attr,std_attr,std_attr))
'''
zgc_brand_list = format_data['zgc_brand'].tolist()
......@@ -458,11 +544,25 @@ def fill_data_by_crawl(format_data,data_dict,standard_attribute_list):
return format_data
def get_zol_attribute_relation(zgc_category_code):
    '''
    Fetch the ZOL attribute-name mapping rows for one ZGC sub-category.

    zgc_category_code : ZGC sub-category code.
    Returns a DataFrame of Product_Relation_Attribute_SubTitle rows whose
    Source is 'ZOL'.
    '''
    conn = pymssql.connect('123.56.115.207', 'zgcindex', 'jiayou2017+2018', 'ZI_DataBase')
    try:
        cursor = conn.cursor()
        # Parameterized query instead of string concatenation (SQL-injection safe).
        cursor.execute(
            "SELECT * FROM Product_Relation_Attribute_SubTitle "
            "where ZI_SubCategoryCode = %s and Source = 'ZOL'",
            (zgc_category_code,))
        rows = cursor.fetchall()
        columns = [col[0] for col in cursor.description]
        cursor.close()
    finally:
        # The original never closed cursor/connection — leaked on every call.
        conn.close()
    return pd.DataFrame(rows, columns=columns)
def crawl_zol(kw,data_dict,standard_attribute_list):
def crawl_zol(kw_list,data_dict,necessary_attrs,zgc_categoty_name,zgc_category_code):
'''
Site:Zol
'''
#kw = '明基EN6850'
#zgc_category_code = '0504'
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
......@@ -474,6 +574,12 @@ def crawl_zol(kw,data_dict,standard_attribute_list):
'TE': 'Trailers',
'Cookie':'ip_ck=78CD7v3zj7QuODcyOTc0LjE1NTM1ODc1NjQ%3D; zol_index_today_best_close1=today_yes; zol_userid=weixin_716d9jc1; zol_check=2040718347; zol_cipher=fd5cd1e006683322f25e2b9350b5ad1c; zol_sid=52743385; z_pro_city=s_provice%3Dsichuan%26s_city%3Dchengdu; zol_bind_weixin_716d9jc1=1; gr_user_id=4aedd91b-fbef-43ae-8857-e44d1849bdb3; userProvinceId=17; userCityId=386; userCountyId=0; userLocationId=21; realLocationId=21; userFidLocationId=21; lv=1564041560; vn=6; zol_vest_no=weixin_716d9jc1; z_day=izol106129=1&izol101693=1&rdetail=9; gr_session_id_9b437fe8881a7e19=b304517c-a53c-4945-8f7e-e4c67b4963e7; gr_session_id_9b437fe8881a7e19_b304517c-a53c-4945-8f7e-e4c67b4963e7=true; Hm_lvt_ae5edc2bc4fc71370807f6187f0a2dd0=1561707760,1562816362,1564019660,1564044365; visited_subcateId=0|212|48|892; visited_subcateProId=0-0|212-0|48-0|892-0; listSubcateId=0; Adshow=0; Hm_lpvt_ae5edc2bc4fc71370807f6187f0a2dd0=1564045129; visited_serachKw=S262NV.html%7CS262NV%7CSF-S262NV%7CSF-S601D%7CFC-5015AC%7CSF-S261NV; questionnaire_pv=1564012830'
}
for kw in kw_list:
kw = quote(kw,encoding='gbk')
#get zol english catrgory name
#get content
url = "http://detail.zol.com.cn/index.php?c=SearchList&keyword=" + kw #东芝2823am
res = requests.get(url, headers=headers)
html = etree.HTML(res.text)
......@@ -483,10 +589,18 @@ def crawl_zol(kw,data_dict,standard_attribute_list):
basic_url = "http://detail.zol.com.cn"
combine_url = basic_url + tag[0]
except:
for std_attr in standard_attribute_list:
combine_url = False
continue
if combine_url == False:
for std_attr in necessary_attrs:
if '(' in std_attr or '(' in std_attr or '+' in std_attr:
std_attr = std_attr.replace('(','').replace(')','').replace('(','').replace(')','').replace('+','')
exec("crawl_%s='爬取不到数据'"%std_attr)
res = []
for std_attr in standard_attribute_list:
for std_attr in necessary_attrs:
if '(' in std_attr or '(' in std_attr or '+' in std_attr:
std_attr = std_attr.replace('(','').replace(')','').replace('(','').replace(')','').replace('+','')
exec("res.append(crawl_%s)"%std_attr)
print('抱歉,未找到该产品')
return res
......@@ -505,16 +619,57 @@ def crawl_zol(kw,data_dict,standard_attribute_list):
Zol_data = pd.DataFrame()
attr_list = html.xpath("//span[contains(@id,'newPmName')]/text()")
attr_list = [v.strip() for v in attr_list]
value_list = html.xpath("//span[contains(@id,'newPmVal')]//text()")
value_list = value_list[0:len(attr_list)]
value_list = []
for attr in attr_list:
v = html.xpath("//span[contains(text(),'"+attr+"')]/../following-sibling::td[1]/span//text()")
if isinstance(v,list):
v = ''.join(v)
value_list.append(v)
else:
value_list.append(v)
value_list = [v.strip().replace('\n','').replace('\r','') for v in value_list]
Zol_data['attr'] = attr_list
Zol_data['value'] = value_list
#get zol relationship attribute
zol_rel_data = get_zol_attribute_relation(zgc_category_code)
#create zol relationship attribute dict
dic = dict()
for attr in necessary_attrs:
dic[attr] = [v if "," not in str(v) else [v_ for v_ in v.split(",")] for v in zol_rel_data[zol_rel_data['ZI_SubTitle'] == attr]['Other_SubTitle'].unique().tolist()]
#get need data
get_data_list = []
for std_attr in standard_attribute_list:
for std_attr in necessary_attrs:
get_value = ''
if dic[std_attr]:
for attr in dic[std_attr]:
if isinstance(attr,list):
for attr_child in attr:
for attr_c,value_c in zip(attr_list,value_list):
if attr_child == attr_c:
get_value = value_c
get_data_list.append(get_value)
break
if get_value:
break
if get_value:
break
else:
for attr_c,value_c in zip(attr_list,value_list):
if attr == attr_c:
get_value = value_c
get_data_list.append(get_value)
break
if get_value == '':
get_data_list.append("爬取不到数据")
'''
get_value = ''
for attr,value in zip(attr_list,value_list):
if std_attr in attr or attr in std_attr:
......@@ -533,21 +688,30 @@ def crawl_zol(kw,data_dict,standard_attribute_list):
if get_value == '':
data_dict
get_data_list.append("爬取不到数据")
for std_attr,value in zip(standard_attribute_list,get_data_list):
'''
for std_attr,value in zip(necessary_attrs,get_data_list):
if '(' in std_attr or '(' in std_attr or '+' in std_attr:
std_attr = std_attr.replace('(','').replace(')','').replace('(','').replace(')','').replace('+','')
exec("crawl_%s='%s'"%(std_attr,value))
res = []
for std_attr in standard_attribute_list:
for std_attr in necessary_attrs:
if '(' in std_attr or '(' in std_attr or '+' in std_attr:
std_attr = std_attr.replace('(','').replace(')','').replace('(','').replace(')','').replace('+','')
exec("res.append(crawl_%s)"%std_attr)
return res
'''
standard_value_by_dict
'''
def transform_by_dict(data_dict,target_data,targer_column):
data_dict['subtitle'] = data_dict['subtitle'].apply(lambda x:x.replace('(','').replace(')','').replace('(','').replace(')','').replace('+',''))
match_v = data_dict[data_dict['subtitle'] == targer_column]['primitive'].apply(lambda x:''.join(x.split(' ')).upper()).tolist()
res = []
......@@ -557,9 +721,11 @@ def transform_by_dict(data_dict,target_data,targer_column):
try:
res.append(data_dict[data_dict['subtitle'] == targer_column]['stdvalue'].tolist()[match_v.index(v)])
except:
res.append('未匹配数据字典')
res.append('未匹配,请添加数据字典')
return res
def standard_value_by_dict(data_dict,format_crawled_data):
df = format_crawled_data[[v for v in format_crawled_data.columns.tolist() if 'complete' in v]]
......@@ -573,17 +739,47 @@ def standard_value_by_dict(data_dict,format_crawled_data):
return format_crawled_data[return_col_list]
source_data = get_source_data('安徽','复印机')
basic_info = get_basic_info(source_data)
match_result = match_data_with_basic_info(basic_info,'复印机')
match_data,unmatch_data = get_match_result(match_result)
channel_category_name = '投影仪'
channel = '河南'
#获取参数项渠道对应关系
rel_data = get_attribute_relation(channel,channel_category_name)
#获取指数库类别名称、类别编号
zgc_categoty_name,zgc_category_code = get_zgc_name_and_code(rel_data)
#获取指数类别产品所有参数项及渠道对应关系
attribute_rel_data = get_attribute_rel_data(zgc_category_code,rel_data)
#获取源数据
source_data = get_source_data(channel,channel_category_name,zgc_categoty_name)
#获取基础品类信息
basic_info = get_basic_info(source_data,attribute_rel_data)
#与指数库内产品匹配并获取匹配结果
match_result = match_data_with_basic_info(basic_info,zgc_category_code)
match_data,unmatch_data = get_match_result(match_result)
match_data.to_excel('match_data.xlsx')
data_dict = get_db_data_dict('复印机')
standard_attribute_list = [v for v in data_dict['subtitle'].apply(lambda x:x.strip()).unique().tolist() if v not in unmatch_data.columns.tolist()]
format_data = format_and_fill_data(source_data,unmatch_data,data_dict)
format_crawled_data = fill_data_by_crawl(format_data,data_dict,standard_attribute_list)
res = standard_value_by_dict(data_dict,format_crawled_data)
#获取数据字典
data_dict = get_db_data_dict(zgc_category_code)
#获取必填参数项
necessary_attrs = attribute_rel_data[attribute_rel_data['ISimportant'] == 1]['SubTitle'].unique().tolist()
necessary_attrs = [v for v in necessary_attrs if v not in unmatch_data.columns.tolist()]
#补全并格式化数据
format_data = format_and_fill_data(attribute_rel_data,source_data,unmatch_data,data_dict)
#爬虫补全必填参数项
format_crawled_data = fill_data_by_crawl(format_data,data_dict,necessary_attrs,zgc_categoty_name,zgc_category_code)
#标准化
res = standard_value_by_dict(data_dict,format_crawled_data)
res.to_excel('res_data.xlsx')
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment