Commit b09a5e32 authored by sanlu

修正了错误品牌被匹配的问题。

parent 2f0e352a
......@@ -37,9 +37,14 @@ class CRAWLER:
获取ZOL参数对应关系
'''
try:
zol_rel_data = pd.DataFrame()
cursor = self.conn.cursor()
cursor.execute("SELECT * FROM Product_Relation_Attribute_SubTitle where ZI_SubCategoryCode = '"+zgc_category_code+"' and Source = 'ZOL'")
data_source = [v for v in cursor.fetchall()]
try:
data_source = [v for v in cursor.fetchall()]
except:
print(str(zgc_category_code) + " 该类别产品参数项无zol对应关系")
return zol_rel_data
zol_rel_data = pd.DataFrame(data_source,columns=[tuple[0] for tuple in cursor.description])
cursor.close()
......@@ -196,4 +201,4 @@ class CRAWLER:
if __name__ == '__main__':
crawler = CRAWLER('0506')
res = crawler.crawl_zol('东芝2823am')
\ No newline at end of file
res = crawler.crawl_zol('东芝2823am')
......@@ -32,15 +32,21 @@ def jiexi(filename):
product_data_dict[sku][word.strip('[@*]').rsplit('#')[1]] = word.strip('[@*]').rsplit('#')[0]
for word in re.findall(recommendRe,str[1]):
product_data_dict[sku][word.strip('[$*]').rsplit('#')[1]] = word.strip('[$*]').rsplit('#')[0]
str_for_name = str[1].replace('$','@')
standard_name = []
for word in re.findall(entityRe,str_for_name):
standard_name.append(word.strip('[@*]').rsplit('#')[0])
standard_name = ' '.join(standard_name)
product_data_dict[sku]['new_name'] = standard_name
return product_data_dict
def param_upload(filename):
now_time = time.strftime("%Y-%m-%d", time.localtime())
#filename = 'LXWL_2019-12-12_0101.txt.ann'
sql_ZIdatabase = sql_find('ZI_DataBase', False)
source = filename.split('/')[-1].split('.txt')[0].split('_')[0]
batch = filename.split('/')[-1].split('.txt')[0].split('_')[1]
subcategorycode = filename.split('/')[-1].split('.')[0].split('_')[2]
source = filename.split('/')[-1].split('.txt')[0].split('@')[0]
batch = filename.split('/')[-1].split('.txt')[0].split('@')[1]
subcategorycode = filename.split('/')[-1].split('.')[0].split('@')[2]
param_data_dict = jiexi(filename)
update_table = {
'batch':[],
......@@ -57,24 +63,39 @@ def param_upload(filename):
'subcategorycode':[]
}
for productid in param_data_dict.keys():
try:
sql_ZIdatabase.cursor.execute(f"select productName,paramAttributeImportant,paramAttributeMatch,paramAttributeStandard from Product_Parameter_Process where batch='{batch}' and source='{source}' and productId='{productid}'")
product_name_data = sql_ZIdatabase.cursor.fetchone()
if product_name_data == None:
#sql_ZIdatabase.cursor.execute(f"select productName,paramAttributeImportant,paramAttributeMatch,paramAttributeStandard from Product_Api_Data where batch='{batch}' and source='{source}' and productId='{productid}'")
sql_ZIdatabase.cursor.execute(f"select producrName from Product_Api_Data where batch='{batch}' and source='{source}' and productId='{productid}'")
product_name_data = sql_ZIdatabase.cursor.fetchone()
if product_name_data == None:
continue
for productparam in param_data_dict[productid].keys():
if productparam == 'new_name':
sql_ZIdatabase.cursor.execute(f"select brandName from Product_Api_Data where batch='{batch}' and source='{source}' and productId='{productid}'")
brand = sql_ZIdatabase.cursor.fetchone()[0]
if brand != '没有对应指数品牌':
brand = brand.replace('/',' ')
standard_name = ' '.join([brand,param_data_dict[productid][productparam]])
sql_ZIdatabase.cursor.execute(f"update Product_Api_Data set newname = '{standard_name}' where batch='{batch}' and source='{source}' and productId='{productid}'")
else:
sql_ZIdatabase.cursor.execute(f"update Product_Api_Data set newname = '{param_data_dict[productid][productparam]}' where batch='{batch}' and source='{source}' and productId='{productid}'")
continue
for productparam in param_data_dict[productid].keys():
update_table['batch'].append(batch)
update_table['source'].append(source)
update_table['productId'].append(productid)
update_table['productName'].append(product_name_data[0])
update_table['param'].append(productparam)
update_table['paramAttributeImportant'].append(product_name_data[1])
update_table['paramAttributeMatch'].append(product_name_data[2])
update_table['paramAttributeStandard'].append(product_name_data[3])
update_table['value'].append(param_data_dict[productid][productparam])
update_table['paramSource'].append('名称提取参数项')
update_table['state'].append('1')
update_table['subcategorycode'].append(subcategorycode)
sql_ZIdatabase.cursor.execute(f"select ISimportant,ispeijian,ISbiaozhunzhi from VW_Relation_Property where SubCategoryCode='{subcategorycode}' and SubTitle='{productparam}'")
product_param_symbol_data = sql_ZIdatabase.cursor.fetchone()
update_table['batch'].append(batch)
update_table['source'].append(source)
update_table['productId'].append(productid)
update_table['productName'].append(product_name_data[0])
update_table['param'].append(productparam)
update_table['paramAttributeImportant'].append(product_param_symbol_data[0])
update_table['paramAttributeMatch'].append(product_param_symbol_data[1])
update_table['paramAttributeStandard'].append(product_param_symbol_data[2])
update_table['value'].append(param_data_dict[productid][productparam])
update_table['paramSource'].append('模型参数项')
update_table['state'].append('1')
update_table['subcategorycode'].append(subcategorycode)
update_table_df = pd.DataFrame.from_dict(update_table)
#update_table_df.to_excel(f'update_data_{now_time}.xlsx')
update_table_df.to_sql('Product_Parameter_Process', sql_ZIdatabase.engine, if_exists='append', index=False)
\ No newline at end of file
update_table_df.to_excel(f'update_data_{now_time}.xlsx')
update_table_df.to_sql('Product_Parameter_Process', sql_ZIdatabase.engine, if_exists='append', index=False)
if __name__ == '__main__':
print(param_upload('JD@20200103@7914.txt.ann'))
No preview for this file type
......@@ -105,23 +105,47 @@ class Index(object):
return '[%s%s]' % (well_num, space_num)
def brand_table_create():
    """Build the brand lookup table with separate Chinese/English name columns.

    Reads ``BrandID`` and ``BrandName`` from ``ZI_BrandList`` and appends two
    derived columns, ``中文品牌`` and ``英文品牌``:

    * names containing ``错误品牌`` get the skip marker in both columns;
    * names containing ``/`` are split, first part Chinese, second English;
    * any other name is used as the Chinese brand, with the skip marker as
      the English brand.

    Returns:
        pandas.DataFrame: the fetched rows plus the two derived columns.
    """
    skip_marker = '该条跳过!'

    def split_brand(brandname):
        # Returns the (Chinese, English) pair for one raw brand name.
        if '错误品牌' in brandname:
            return skip_marker, skip_marker
        if '/' in brandname:
            pieces = brandname.split('/')
            return pieces[0], pieces[1]
        return brandname, skip_marker

    sql_ZIdatabase = sql_find('ZI_DataBase', False)
    sql_ZIdatabase.cursor.execute('select BrandID,BrandName from ZI_BrandList')
    rows = sql_ZIdatabase.cursor.fetchall()
    headers = [column[0] for column in sql_ZIdatabase.cursor.description]
    brand_table = pd.DataFrame(rows, columns=headers)

    pairs = [split_brand(name) for name in brand_table['BrandName']]
    brand_table['中文品牌'] = [chinese for chinese, _ in pairs]
    brand_table['英文品牌'] = [english for _, english in pairs]
    return brand_table
class tool():
def __init__(self):
self.peijian_table = pd.read_excel('是否需要配件.xlsx')
self.brand_table = pd.read_excel('品牌对应表.xlsx')
print('生成品牌表中。。。')
self.brand_table = brand_table_create()
print('生成品牌表完成。')
def judge_brand(self, brand, brandcode_original):
if brandcode_original == '没有对应指数品牌':
brandcode_original = str(brandcode_original).zfill(5)[-5:]
#print(brandcode_original)
#print(self.brand_table[self.brand_table['BrandID']==brandcode_original]['BrandName'].tolist())
if brandcode_original == '应指数品牌' or '错误品牌' in self.brand_table[self.brand_table['BrandID']==brandcode_original]['BrandName'].tolist()[0]:
BRANDID = '没有对应指数品牌'
for ID,Chinese_brand,English_brand in zip(self.brand_table['ID'], self.brand_table['中文品牌'], self.brand_table['英文品牌']):
for ID,Chinese_brand,English_brand in zip(self.brand_table['BrandID'], self.brand_table['中文品牌'], self.brand_table['英文品牌']):
if brand == Chinese_brand:
BRANDID = str(ID).zfill(5)
return BRANDID
elif BN(brand) == English_brand:
BRANDID = str(ID).zfill(5)
return BRANDID
else:
BRANDID = str(brandcode_original).zfill(5)[-5:]
BRANDID = brandcode_original
return BRANDID
def judge_peijian(self, data_table):
......@@ -250,8 +274,5 @@ def param_load(product_id, xml_string):
return data_dict
if __name__ == '__main__':
sqlserver = sql_find('ZI_BAK', True)
sqlserver.cursor.execute("select * from ZI_Price_Quote where productcode = '0506003750007'")
print(sqlserver.cursor.fetchall())
a = brand_table_create()
'错误品牌' in a[a['BrandID']=='08358']['BrandName'].tolist()[0]
......@@ -154,7 +154,7 @@ class LSTMNER:
return param_dict
if __name__ == '__main__':
ner = LSTMNER('0507')
ner = LSTMNER('0101')
while 1:
a = input('请输入产品名称:')
#print(ner.class_dict)
......
......@@ -31,12 +31,12 @@ def param_extract_function(data_table,channel):
now_time = time.strftime("%Y-%m-%d", time.localtime())
sql_ZIdatabase = sql_find('ZI_DataBase', False)
product_table = pd.DataFrame(data_api_table, columns=['SKU', '指数品牌','指数品牌编码','产品名称','指数子类','指数子类编码','来源'])
product_table.columns = ['productId', 'brandName','brandCode','producrName','categoryCodeName','categoryCode','source']
product_table = pd.DataFrame(data_table, columns=['SKU', '指数品牌','指数品牌编码','产品名称','指数子类','指数子类编码','产品价格','url','品牌'])
product_table.columns = ['productId', 'brandName','brandCode','producrName','categoryCodeName','categoryCode','price','url','original_brandName']
product_len = len(product_table['productId'])
product_table['batch'] = [now_time for i in range(product_len)]
product_table['state'] = ['0' for i in range(product_len)]
product_table.to_sql('Product_Api_Data', sql_ZIdatabase.engine, if_exists='append', index=False)
product_table['source'] = [channel for i in range(product_len)]
if os.path.isfile(f'{channel}_参数对应表_{now_time}.xlsx'):
merge_data = pd.read_excel(f'{channel}_参数对应表_{now_time}.xlsx', converters={'ZI_SubCategoryCode':str})
......@@ -141,7 +141,7 @@ def param_extract_function(data_table,channel):
update_table['paramAttributeMatch'].append('0')##
update_table['paramAttributeStandard'].append('0')##
update_table['value'].append(name_param_dict[param_key_name])##
update_table['paramSource'].append('名称提取参数项')##
update_table['paramSource'].append('模型参数项')##
update_table['state'].append('0')##
update_table['subcategorycode'].append(categorycode)#
......@@ -254,6 +254,7 @@ def param_extract_function(data_table,channel):
update_table_df = pd.DataFrame.from_dict(update_table)
update_table_df.to_excel('update_data_test.xlsx')
update_table_df.to_sql('Product_Parameter_Process', sql_ZIdatabase.engine, if_exists='append', index=False)
product_table.to_sql('Product_Api_Data', sql_ZIdatabase.engine, if_exists='append', index=False)
return update_table_df
class data_fetch():
......@@ -276,6 +277,7 @@ class data_fetch():
self.sql_ZIdatabase_local = sql_find('ZI_DataBase', localhost)
self.model = model
self.source = source
self.channel = source.split('_')[1]
self.index = Index()
def find_brandcode(self, normalbrand, english_brand):
......@@ -298,17 +300,25 @@ class data_fetch():
if brandcode != '没有对应指数品牌':
return brandcode
brandcode = self.find_brandcode(normalbrand, english_brand)
brandcode = self.tool.judge_brand(brand, brandcode)
brandcode = self.tool.judge_brand(normalbrand, brandcode)
if brandcode != '没有对应指数品牌':
return brandcode
brandcode = self.tool.judge_brand(english_brand, brandcode)
return brandcode
def get_LXWL_sku(self):
self.sql_LXWL.cursor.execute("select sku from product_all where productcode is null")
try:
self.sql_LXWL.cursor.execute("select sku from product_all where productcode is null and state='1' and (ziying='1' or ziying is null) and (dingzhi='1' or ziying is null)")
except pymssql.ProgrammingError:
self.sql_LXWL.cursor.execute("select sku from product_all where productcode is null and state='1'")
sku_data = self.sql_LXWL.cursor.fetchall()
sku_list = []
#print(len(sku_data))
for sku in sku_data:
sku = sku[0]
sku_list.append(sku)
sku_set = list(set(sku_list))
#print(len(sku_set))
return sku_set
def get_JD_sku(self):
......@@ -330,11 +340,15 @@ class data_fetch():
symbol = 0
#sql_ZIdatabase.cursor.execute(f'select * from Product_Relation_Attribute_SubTitle where SourceSubCategory={data[3]}')
self.sql_ZIdatabase_local.cursor.execute(f"select ZI_SubCategoryCode,ZI_SubTitle from Product_Relation_Attribute_SubTitle where SourceSubCategory='{data[3]}'")
self.sql_ZIdatabase_local.cursor.execute(f"select ZI_SubCategoryCode,ZI_SubTitle from Product_Relation_Attribute_SubTitle where SourceSubCategory='{data[3]}' and Source='{self.channel}'")
try:
ZI_SubCategoryCode = self.sql_ZIdatabase_local.cursor.fetchone()[0]
except TypeError:
ZI_SubCategoryCode = '没有匹配的指数子类编码'
try:
self.sql_ZIdatabase_local.cursor.execute(f"select ZI_SubCategoryCode,ZI_SubTitle from Product_Relation_Attribute_SubTitle where SourceSubCategory='{data[3]}'")
ZI_SubCategoryCode = self.sql_ZIdatabase_local.cursor.fetchone()[0]
except TypeError:
ZI_SubCategoryCode = '没有匹配的指数子类编码'
symbol = 1
data.append(ZI_SubCategoryCode)
......@@ -439,6 +453,7 @@ class data_fetch():
def run(self):
if self.model == 'LXWL_model':
sku_list = self.get_LXWL_sku()
#sku_list = ['1486456']
elif self.model == 'JD_model':
sku_list = self.get_JD_sku()
data_dict = {}
......@@ -465,11 +480,12 @@ class data_fetch():
#'匹配的productcode':data[12],
'来源':f'{self.source}'
}
except:
except OSError:
print(f'{sku_list[i]}出错。')
continue
try:
print(self.index(i, len(sku_list)-1), end=f'% 共{len(sku_list)}款产品,目前第{i+1}款。')
#pass
except:
pass
#print(data_dict)
......@@ -492,7 +508,7 @@ class data_fetch():
return data_table
if __name__ == '__main__':
channel = 'LXWL'
channel = 'OFS'
data_api = data_fetch(model='LXWL_model', localhost=False, source=f'ZH_{channel}')
data_api_table = data_api.run()
param_extract_function(data_api_table,channel)
\ No newline at end of file
#param_extract_function(data_api_table,channel)
\ No newline at end of file
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 25 13:58:07 2019
@author: rico
"""
import pandas as pd
import pymssql
import os
from collections import Counter
import datetime
'''
test data load
'''
def get_test_data():
    """Load a fixed sample of ``Product_Api_Data`` rows for testing.

    Runs a hard-coded query (batch '20191224', source 'JD', state 4) and
    returns all columns of the rows it fetches.

    Returns:
        pandas.DataFrame: one row per fetched record; column names are taken
        from the cursor description.
    """
    # NOTE(review): the server address and credentials are hard-coded in
    # source; consider loading them from configuration or the environment.
    conn = pymssql.connect('123.56.115.207','zgcindex','jiayou2017+2018','ZI_DataBase')
    try:
        cursor = conn.cursor()
        try:
            cursor.execute("select * from Product_Api_Data where batch = '20191224' and source = 'JD' and state = 4")
            rows = cursor.fetchall()
            column_names = [column[0] for column in cursor.description]
        finally:
            # Release the cursor even when the query fails.
            cursor.close()
    finally:
        # Release the connection even when the query fails.
        conn.close()
    return pd.DataFrame(rows, columns=column_names)
#df = get_test_data()
def get_params_df(df):
    """Fetch the parameter rows to be matched for the products in *df*.

    Selects rows of ``Product_Parameter_Process`` whose ``productId`` occurs
    in *df*, whose ``batch`` and ``source`` equal those of the first row of
    *df*, and that have ``state = 2`` and ``paramAttributeMatch = 1``. The
    result is then merged with the ``brandName``/``brandCode`` columns of
    *df* on ``productId``.

    Args:
        df: DataFrame with at least the columns ``productId``, ``batch``,
            ``source``, ``brandName`` and ``brandCode``.

    Returns:
        pandas.DataFrame: the fetched parameter rows enriched with brand
        information.

    Raises:
        ValueError: if *df* has no rows (there is nothing to query for).
    """
    if df.empty:
        raise ValueError('df must contain at least one product row')

    productid_list = df['productId'].unique().tolist()
    # Positional access: the first row is wanted regardless of index labels
    # (a filtered frame may not have a label 0).
    batch = str(df['batch'].iloc[0])
    source = str(df['source'].iloc[0])

    # One placeholder per id; the driver does the quoting, so ids containing
    # quote characters cannot break the statement.
    placeholders = ','.join(['%s'] * len(productid_list))
    query = (
        "select * from Product_Parameter_Process "
        f"where productId in ({placeholders}) "
        "and batch = %s and source = %s and state = 2 and paramAttributeMatch = 1"
    )
    query_args = tuple(productid_list) + (batch, source)

    # NOTE(review): the server address and credentials are hard-coded in
    # source; consider loading them from configuration or the environment.
    conn = pymssql.connect('123.56.115.207','zgcindex','jiayou2017+2018','ZI_DataBase')
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(query, query_args)
            rows = cursor.fetchall()
            column_names = [column[0] for column in cursor.description]
        finally:
            cursor.close()
    finally:
        conn.close()

    params_df = pd.DataFrame(rows, columns=column_names)
    # Complete the brand info from the input frame.
    return pd.merge(params_df, df[['productId','brandName','brandCode']], on='productId')
def get_db_data(zgc_category_code, max_retries=3):
    """Return the database attribute data for one sub-category, cached per day.

    A CSV named ``<category>_<YYYY-MM-DD>_db_data.csv`` in the current working
    directory serves as a cache: cache files of other dates are deleted, a
    cache file for today is read and returned, and otherwise the data is
    queried from the database, written to the cache file and returned.

    Args:
        zgc_category_code: sub-category code as a string (e.g. ``'0101'``).
        max_retries: how many times to retry after an ``OSError`` before
            giving up and re-raising it.

    Returns:
        pandas.DataFrame: product attribute rows for the category.

    Raises:
        OSError: if every attempt, including the retries, failed with it.
    """
    retries_left = max_retries
    while True:
        try:
            return _load_db_data(zgc_category_code)
        except OSError:
            # Bounded loop rather than unbounded recursion, so a persistent
            # failure surfaces the real error.
            if retries_left <= 0:
                raise
            retries_left -= 1
            print('链接失败,重新链接')


def _load_db_data(zgc_category_code):
    """Make one attempt to load the category data from the cache or database."""
    path = os.getcwd()  # current working directory
    # Only the top-level file names are needed, so take the first os.walk
    # entry instead of walking the whole directory tree.
    file_list = next(os.walk(path))[2]
    date = datetime.datetime.now().strftime('%Y-%m-%d')
    db_file_name = zgc_category_code +'_'+date+'_db_data.csv'
    file_path = path+'/'+db_file_name

    # Delete cache files whose date part differs from today.
    for file_name in file_list:
        if '_db_data.csv' in file_name and date != file_name.split('_')[1]:
            print('删除过期文件')
            os.remove(path+'/'+file_name)

    if db_file_name in file_list:
        print('获取本地已存数据库数据')
        db_attr_data = pd.read_csv(file_path,converters={'productcode':str,'SubCategoryCode':str,'BrandCode':str})
        db_attr_data = db_attr_data.rename(columns={"productcode": "ProductCode","productname":"ProductName","subtitle":"SubTitle","state":"State","value":"Value"})
        return db_attr_data

    # NOTE(review): the server address and credentials are hard-coded in
    # source; consider loading them from configuration or the environment.
    conn = pymssql.connect('123.56.115.207','zgcindex','jiayou2017+2018','ZI_DataBase')
    try:
        cursor = conn.cursor()
        try:
            print('获取最新线上数据库数据')
            # Pass the code as a string parameter so it is compared as text;
            # interpolated bare, '0101' would be read as the number 101.
            cursor.execute("select ProductCode,CfgID,Value from Info_ProductDetail where LEFT(ProductCode,4) = %s", (zgc_category_code,))
            params = cursor.fetchall()
            params = pd.DataFrame(params, columns=[column[0] for column in cursor.description])

            # The ids come from the query result above, not from the caller.
            CfgID_list = params['CfgID'].apply(str).unique().tolist()
            CfgID_list_string = "','".join(CfgID_list)
            cursor.execute(f"select SubTitleID,SubTitle from VW_Relation_Property where SubTitleID in ('{CfgID_list_string}')")
            CfgID_name = cursor.fetchall()
            CfgID_name = pd.DataFrame(CfgID_name, columns=[column[0] for column in cursor.description])

            cursor.execute("select ProductCode,ProductName,State,RIGHT(BrandCode,5) as BrandCode from Info_Product where SubCategoryCode = %s", (zgc_category_code,))
            pro_info = cursor.fetchall()
            pro_info = pd.DataFrame(pro_info, columns=[column[0] for column in cursor.description])
        finally:
            cursor.close()
    finally:
        conn.close()

    res = pd.merge(params,CfgID_name,left_on = 'CfgID',right_on = 'SubTitleID')
    res = pd.merge(res,pro_info,on = 'ProductCode')
    res.to_csv(file_path)
    return res
def counter(arr):
    """Return ``(element, count)`` pairs for *arr*, most frequent first.

    *arr* is consumed only once, so one-shot iterators are counted correctly.
    """
    # most_common() without an argument already returns every distinct
    # element, so no separate distinct-count pass is needed.
    return Counter(arr).most_common()
def match_with_db(params_df):
    """Match each product's extracted parameters against database products.

    For every sub-category in *params_df* the database attribute data is
    loaded via ``get_db_data`` and narrowed to the product's brand. A database
    product matches when every (param, value) pair of the product is found
    for it.

    Args:
        params_df: DataFrame with the columns ``productId``, ``productName``,
            ``source``, ``batch``, ``subcategorycode``, ``brandCode``,
            ``param`` and ``value``.

    Returns:
        pandas.DataFrame: one row per distinct ``productId`` (in order of
        first appearance) with the columns ``productId``, ``productName``,
        ``source``, ``batch``, ``match_code`` and ``match_name``; the last
        two hold lists of matched product codes and names.
    """
    # First row of each product supplies its name/source/batch.
    first_rows = params_df.drop_duplicates(subset='productId', keep='first')
    match_res = pd.DataFrame()
    match_res['productId'] = first_rows['productId'].tolist()
    match_res['productName'] = first_rows['productName'].tolist()
    match_res['source'] = first_rows['source'].tolist()
    match_res['batch'] = first_rows['batch'].tolist()

    # Results are keyed by productId: the loop below visits products
    # category by category, which need not be the row order of match_res.
    matched_codes = {}
    matched_names = {}

    for category_code in params_df['subcategorycode'].unique().tolist():
        print('开始匹配'+category_code+'数据')
        single_df = params_df[params_df['subcategorycode'] == category_code]
        brand_list = single_df['brandCode'].unique().tolist()
        db_attr_data = get_db_data(category_code)
        db_attr_data = db_attr_data[db_attr_data['BrandCode'].isin(brand_list)]

        for productid in single_df['productId'].unique().tolist():
            df = single_df[single_df['productId'] == productid]
            df_db = db_attr_data[db_attr_data['BrandCode'] == list(df['brandCode'])[0]]
            codes, names = _match_single_product(df, df_db)
            matched_codes.setdefault(productid, []).extend(codes)
            matched_names.setdefault(productid, []).extend(names)

    match_res['match_code'] = [matched_codes.get(pid, []) for pid in match_res['productId']]
    match_res['match_name'] = [matched_names.get(pid, []) for pid in match_res['productId']]
    return match_res


def _match_single_product(df, df_db):
    """Return (codes, names) of database products matching every row of *df*."""
    combine_code_list = []
    for param, value in zip(df['param'], df['value']):
        hits = df_db[(df_db['SubTitle'] == param) & (df_db['Value'] == value)]['ProductCode'].unique().tolist()
        if not hits:
            # One parameter without any candidate rules out a full match.
            return [], []
        combine_code_list += hits

    # A product code matches when it was hit once per parameter row.
    required = len(df['param'])
    codes = [code for code, hit_count in Counter(combine_code_list).most_common() if hit_count == required]
    names = [list(df_db[df_db['ProductCode'] == code]['ProductName'])[0] for code in codes]
    return codes, names
def match(df):
    """Entry point: load the parameter rows for *df* and match them to the database.

    Args:
        df: product DataFrame passed on to ``get_params_df``.

    Returns:
        The result of ``match_with_db`` for the loaded parameter rows.
    """
    return match_with_db(get_params_df(df))
def find_brandcode(self, normalbrand, english_brand):
    """Look up a brand code in ``self.brand_table`` by standardized brand name.

    Args:
        normalbrand: name compared against the ``标准化品牌`` column first.
        english_brand: name used for a second lookup when *normalbrand*
            matches more than one row.

    Returns:
        The single ``BrandCode`` matching *normalbrand*; when *normalbrand*
        is ambiguous, the first ``BrandCode`` matching *english_brand*; or
        ``'没有对应指数品牌'`` when the relevant lookup finds nothing.
    """
    not_found = '没有对应指数品牌'
    table = self.brand_table

    codes = table.loc[table['标准化品牌'] == normalbrand, 'BrandCode'].tolist()
    if not codes:
        return not_found
    if len(codes) == 1:
        return codes[0]

    # Several rows share the normalized name: disambiguate via the English name.
    fallback = table.loc[table['标准化品牌'] == english_brand, 'BrandCode'].tolist()
    return fallback[0] if fallback else not_found
def brandcode_search(self, brand):
    """Resolve a raw brand string to a brand code, trying several spellings.

    The lookups are tried in order and the first code that is not
    ``'没有对应指数品牌'`` is returned:

    1. ``self.find_brandcode`` with the upper-cased raw brand;
    2. ``self.find_brandcode`` with the normalized and the letters-only
       upper-cased brand, passed through ``self.tool.judge_brand`` with the
       normalized brand;
    3. ``self.tool.judge_brand`` with the letters-only upper-cased brand.

    Args:
        brand: brand name as a string.

    Returns:
        The brand code found, or whatever the final ``judge_brand`` call
        returns when the earlier steps found nothing.
    """
    # This module does not import ``re`` at file level, so import it here.
    import re

    not_found = '没有对应指数品牌'
    # NOTE(review): ``BN`` is not defined or imported in this module —
    # presumably the project's brand-normalization helper; confirm that it
    # is in scope wherever this function is used.
    normalbrand = BN(brand)
    english_brand = re.sub(r'[^A-Za-z]', '', brand).upper()

    # The placeholder can never match a brand, so an ambiguous first lookup
    # counts as not found.
    brandcode = self.find_brandcode(brand.upper(), 'XXXXXXXXXXXXX')
    if brandcode != not_found:
        return brandcode

    brandcode = self.find_brandcode(normalbrand, english_brand)
    brandcode = self.tool.judge_brand(normalbrand, brandcode)
    if brandcode != not_found:
        return brandcode

    return self.tool.judge_brand(english_brand, brandcode)
\ No newline at end of file
No preview for this file type
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment