Commit 26f3b891 authored by zhou

Update main_merge.py

parent cf74fcc1
......@@ -47,7 +47,10 @@ def param_extract_function(data_table,channel):
product_table = pd.DataFrame(data_table, columns=['SKU', '指数品牌','指数品牌编码','产品名称','指数子类','指数子类编码','产品价格','url','品牌','newname'])
product_table.columns = ['productId', 'brandName','brandCode','producrName','categoryCodeName','categoryCode','price','url','original_brandName','newname']
product_len = len(product_table['productId'])
product_table['batch'] = [now_time for i in range(product_len)]
if channel in ['JD', 'SN', 'GM']:
product_table['batch'] = [f'{now_time}1' for i in range(product_len)]
else:
product_table['batch'] = [f'{now_time}3' for i in range(product_len)]
product_table['state'] = ['0' for i in range(product_len)]
product_table['source'] = [channel for i in range(product_len)]
......@@ -88,7 +91,10 @@ def param_extract_function(data_table,channel):
'subcategorycode':[]
}
m = 0
if channel in ['JD', 'SN', 'GM']:
now_time = f'{now_time}1'
else:
now_time = f'{now_time}3'
for categorycode in category_list:
sql_ZIdatabase.cursor.execute(f"select SubTitle from VW_Relation_Property where SubCategoryCode='{categorycode}' and (Isimportant = '1' or Ispeijian = '1')")
param_standard_lyst = [x[0] for x in sql_ZIdatabase.cursor.fetchall()]
......@@ -115,15 +121,20 @@ def param_extract_function(data_table,channel):
crawler = CRAWLER(categorycode)
try:
model = LSTMNER(categorycode)
for SKU,name,params,producttype in zip(data_categorycode['SKU'],data_categorycode['产品名称'],data_categorycode['参数项'],data_categorycode['产品型号']):
for SKU,name,params,producttype,productbrand in zip(data_categorycode['SKU'],data_categorycode['产品名称'],data_categorycode['参数项'],data_categorycode['产品型号'],data_categorycode['指数品牌']): #爬取添加品牌
m += 1
print(m,end='\r')
product_param_dict = {}#原始参数项对应
if '/' in productbrand:#爬取添加品牌
thebrand = productbrand.split('/')[0]#爬取添加品牌
else:#爬取添加品牌
thebrand = productbrand#爬取添加品牌
try:
params = eval(params)
except TypeError:
pass
for param_key in params.keys():
for param_key in list(params.keys()):
if '\t' in param_key or ' ' in param_key:
params[param_key.replace('\t','').replace(' ','')] = params.pop(param_key) #去除参数项的特殊符号
for param_key_2 in params.keys():
......@@ -138,7 +149,11 @@ def param_extract_function(data_table,channel):
name_param_dict = model.param_extract(name)#名称提取参数项
if categorycode in crawl_category_list:
try:
crawl_param_dict = crawler.crawl_zol(producttype)
if thebrand != '没有对应指数品牌':#爬取添加品牌
producttype_new = f"{thebrand} {producttype}"#爬取添加品牌
crawl_param_dict = crawler.crawl_zol(producttype_new)#爬取添加品牌
else:
crawl_param_dict = crawler.crawl_zol(producttype)#爬取添加品牌
except TypeError:
crawl_param_dict = {}
else:
......@@ -204,15 +219,19 @@ def param_extract_function(data_table,channel):
model.clean()
except FileNotFoundError:
for SKU,name,params,producttype in zip(data_categorycode['SKU'],data_categorycode['产品名称'],data_categorycode['参数项'],data_categorycode['产品型号']):
for SKU,name,params,producttype,productbrand in zip(data_categorycode['SKU'],data_categorycode['产品名称'],data_categorycode['参数项'],data_categorycode['产品型号'],data_categorycode['指数品牌']): #爬取添加品牌
m += 1
print(m,end='\r')
product_param_dict = {}#原始参数项对应
if '/' in productbrand:#爬取添加品牌
thebrand = productbrand.split('/')[0]#爬取添加品牌
else:#爬取添加品牌
thebrand = productbrand#爬取添加品牌
try:
params = eval(params)
except TypeError:
pass
for param_key in params.keys():
for param_key in list(params.keys()):
if '\t' in param_key or ' ' in param_key:
params[param_key.replace('\t','').replace(' ','')] = params.pop(param_key) #去除参数项的特殊符号
for param_key_2 in params.keys():
......@@ -226,7 +245,11 @@ def param_extract_function(data_table,channel):
product_param_dict['产品型号'] = producttype
if categorycode in crawl_category_list:
try:
crawl_param_dict = crawler.crawl_zol(producttype)
if thebrand != '没有对应指数品牌':#爬取添加品牌
producttype_new = f"{thebrand} {producttype}"#爬取添加品牌
crawl_param_dict = crawler.crawl_zol(producttype_new)#爬取添加品牌
else:
crawl_param_dict = crawler.crawl_zol(producttype)#爬取添加品牌
except TypeError:
crawl_param_dict = {}
else:
......@@ -277,6 +300,9 @@ def param_extract_function(data_table,channel):
update_table_df.to_sql('Product_Parameter_Process', sql_ZIdatabase.engine, if_exists='append', index=False)
product_table.to_sql('Product_Api_Data', sql_ZIdatabase.engine, if_exists='append', index=False)
update_sku_list = set(list(product_table['productId']))
if channel == '重庆':
channel_sql = sql_find(database=f'chongqing_scrapy_db',localhost=False)
else:
channel_sql = sql_find(database=f'ZH_{channel}',localhost=False)
for sku in update_sku_list:
channel_sql.cursor.execute(f"update product_all set state='9' where sku='{sku}' and state='8'")
......@@ -287,6 +313,7 @@ def return_error(table):
sql_zi_zh = sql_find('zi_zh', False)
NowTime = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime())
#table = pd.read_excel('api_data/ZH_SN_no_category_data_2019-12-27.xlsx', dtype = {'SKU' : str})
table = table[['SKU','产品名称','url','来源']]
l = len(table)
create_time_list = []
......@@ -300,6 +327,23 @@ def return_error(table):
table.columns = ['sku','name','url','source','remark','create_time']
table.to_sql('API_returnErrorData', sql_zi_zh.engine, if_exists='append', index=False)
def return_remark_error(table):
    """Append rejected rows, together with their remark, to ``API_returnErrorData``.

    Args:
        table: DataFrame containing at least the columns ``SKU``, ``产品名称``,
            ``url``, ``来源`` and ``remark``.

    The five columns are selected, a ``create_time`` column holding the
    current local time (``%Y-%m-%d %H:%M:%S``) is added, the columns are
    renamed to ``sku, name, url, source, remark, create_time`` and the rows
    are appended to the table through ``sql_zi_zh.engine``.
    """
    sql_zi_zh = sql_find('zi_zh', False)
    created_at = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
    # Work on an explicit copy: adding a column to a bare selection of the
    # caller's frame can raise pandas' SettingWithCopyWarning.
    error_rows = table[['SKU', '产品名称', 'url', '来源', 'remark']].copy()
    # A scalar is broadcast to every row, so no per-row list is needed.
    error_rows['create_time'] = created_at
    error_rows.columns = ['sku', 'name', 'url', 'source', 'remark', 'create_time']
    error_rows.to_sql('API_returnErrorData', sql_zi_zh.engine, if_exists='append', index=False)
def return_multi_channel(table):
sql_zi_zh = sql_find('zi_zh', False)
NowTime = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime())
......@@ -423,7 +467,7 @@ class data_fetch():
if ZI_SubCategory_data[1] is None:
ZI_SubCategoryCode = '没有匹配的指数子类编码'
else:
ZI_SubCategoryCode = ZI_SubCategory_data[1]
ZI_SubCategoryCode = ZI_SubCategory_data[1].zfill(4)
data.append(ZI_SubCategoryCode)
if ZI_SubCategory_data[0] is None:
ZIname = '没有匹配的指数子类编码'
......@@ -588,7 +632,7 @@ class data_fetch():
sql_zi_zh.cursor(f"insert into filestep(period,step,initfilename,filepath) values('{StyleTime}','3','{self.source}_data_category_data_{StyleTime}.xlsx','/root/program/newProductCheck/online_progrom/code/data/{StyleTime}_step3_needs_check')")#存入数据库,页面获取下载。
return data_table
'''
class myThread_crawl(threading.Thread):
def __init__(self, channel):
threading.Thread.__init__(self)
......@@ -605,6 +649,23 @@ class myThread_crawl(threading.Thread):
else:
param_extract_function_crawl(crawl_table, 'LXWL')
print("退出线程:" + self.channel)
'''
class myThread_crawl():
    """Run the crawl-data pipeline for one channel at construction time."""

    def __init__(self, channel):
        """Fetch crawl data for ``channel`` and pass it to parameter extraction.

        Args:
            channel: channel identifier; ``'JD'``, ``'SN'`` and ``'GM'`` are
                forwarded unchanged, every other value is processed as
                ``'LXWL'``.

        Nothing is extracted when ``crawl_data_fetch.run()`` returns a bool.
        """
        self.channel = channel
        self.data_get = crawl_data_fetch(channel=self.channel)
        print("开始:" + self.channel)
        crawl_table = self.data_get.run()
        if not isinstance(crawl_table, bool):
            extract_channel = self.channel if self.channel in ['JD', 'SN', 'GM'] else 'LXWL'
            param_extract_function_crawl(crawl_table, extract_channel)
        # NOTE(review): the exit message is assumed to be printed on every
        # path; confirm against the properly indented file.
        print("退出:" + self.channel)
def param_extract_function_crawl(data_table,channel):
f_crawl = open('crawl_categorycode.txt', 'r', encoding='utf-8')
......@@ -620,7 +681,7 @@ def param_extract_function_crawl(data_table,channel):
product_table = pd.DataFrame(data_table, columns=['SKU', '指数品牌','指数品牌编码','产品名称','指数子类','指数子类编码','产品价格','url','品牌','来源'])
product_table.columns = ['productId', 'brandName','brandCode','producrName','categoryCodeName','categoryCode','price','url','original_brandName','source']
product_len = len(product_table['productId'])
product_table['batch'] = [now_time for i in range(product_len)]
product_table['batch'] = [f'{now_time}2' for i in range(product_len)]
product_table['state'] = ['0' for i in range(product_len)]
if os.path.isfile(f'{channel}_参数对应表_{now_time}.xlsx'):
......@@ -687,15 +748,20 @@ def param_extract_function_crawl(data_table,channel):
crawler = CRAWLER(categorycode)
try:
model = LSTMNER(categorycode)
for SKU,name,params,producttype,product_channel in zip(data_categorycode['SKU'],data_categorycode['产品名称'],data_categorycode['参数项'],data_categorycode['产品型号'],data_categorycode['来源']):
for SKU,name,params,producttype,product_channel,productbrand in zip(data_categorycode['SKU'],data_categorycode['产品名称'],data_categorycode['参数项'],data_categorycode['产品型号'],data_categorycode['来源'],data_categorycode['指数品牌']):#爬取添加品牌
m += 1
print(m,end='\r')
product_param_dict = {}#原始参数项对应
if '/' in productbrand:#爬取添加品牌
thebrand = productbrand.split('/')[0]#爬取添加品牌
else:#爬取添加品牌
thebrand = productbrand#爬取添加品牌
try:
params = eval(params)
except TypeError:
pass
for param_key in params.keys():
for param_key in list(params.keys()):
if '\t' in param_key or ' ' in param_key:
params[param_key.replace('\t','').replace(' ','')] = params.pop(param_key) #去除参数项的特殊符号
for param_key_2 in params.keys():
......@@ -710,7 +776,11 @@ def param_extract_function_crawl(data_table,channel):
name_param_dict = model.param_extract(name)#名称提取参数项
if categorycode in crawl_category_list:
try:
crawl_param_dict = crawler.crawl_zol(producttype)
if thebrand != '没有对应指数品牌':#爬取添加品牌
producttype_new = f"{thebrand} {producttype}"#爬取添加品牌
crawl_param_dict = crawler.crawl_zol(producttype_new)#爬取添加品牌
else:
crawl_param_dict = crawler.crawl_zol(producttype)#爬取添加品牌
except TypeError:
crawl_param_dict = {}
else:
......@@ -776,15 +846,19 @@ def param_extract_function_crawl(data_table,channel):
model.clean()
except FileNotFoundError:
for SKU,name,params,producttype in zip(data_categorycode['SKU'],data_categorycode['产品名称'],data_categorycode['参数项'],data_categorycode['产品型号'],data_categorycode['来源']):
for SKU,name,params,producttype,product_channel,productbrand in zip(data_categorycode['SKU'],data_categorycode['产品名称'],data_categorycode['参数项'],data_categorycode['产品型号'],data_categorycode['来源'],data_categorycode['指数品牌']):#爬取添加品牌
m += 1
print(m,end='\r')
product_param_dict = {}#原始参数项对应
if '/' in productbrand:#爬取添加品牌
thebrand = productbrand.split('/')[0]#爬取添加品牌
else:#爬取添加品牌
thebrand = productbrand#爬取添加品牌
try:
params = eval(params)
except TypeError:
pass
for param_key in params.keys():
for param_key in list(params.keys()):
if '\t' in param_key or ' ' in param_key:
params[param_key.replace('\t','').replace(' ','')] = params.pop(param_key) #去除参数项的特殊符号
for param_key_2 in params.keys():
......@@ -798,7 +872,11 @@ def param_extract_function_crawl(data_table,channel):
product_param_dict['产品型号'] = producttype
if categorycode in crawl_category_list:
try:
crawl_param_dict = crawler.crawl_zol(producttype)
if thebrand != '没有对应指数品牌':#爬取添加品牌
producttype_new = f"{thebrand} {producttype}"#爬取添加品牌
crawl_param_dict = crawler.crawl_zol(producttype_new)#爬取添加品牌
else:
crawl_param_dict = crawler.crawl_zol(producttype)#爬取添加品牌
except TypeError:
crawl_param_dict = {}
else:
......@@ -899,16 +977,19 @@ class crawl_data_fetch():
def get_crawl_sku(self):
    """Return the product ids and SKUs of crawl rows that still need processing.

    Reads ``product_all`` through ``self.sql_crawl.cursor``. For the channels
    ``JD``, ``SN`` and ``GM`` only rows whose ``source`` equals
    ``self.channel`` are selected; for any other channel, rows whose source is
    none of those three are selected.

    Returns:
        A tuple ``(product_id_list, sku_list)`` of two parallel lists, in the
        order the rows were returned by the cursor.
    """
    # NOTE(review): the last clause reads "dingzhi='1' or ziying is null";
    # "dingzhi is null" may have been intended - confirm before changing.
    base_query = (
        "select product_id,sku from product_all where productcode is null and state='1'"
        " and (ziying='1' or ziying is null) and (dingzhi='1' or ziying is null)"
    )
    if self.channel in ['JD', 'SN', 'GM']:
        query = base_query + f" and source='{self.channel}'"
    else:
        query = base_query + " and (source <> 'JD' and source <> 'GM' and source <>'SN')"
    self.sql_crawl.cursor.execute(query)
    rows = self.sql_crawl.cursor.fetchall()
    # Keep both lists index-aligned: callers look up data by product id and
    # check duplicates by SKU at the same position.
    product_id_list = [row[0] for row in rows]
    sku_list = [row[1] for row in rows]
    return product_id_list, sku_list
def get_data_crawl(self, product_id):
self.sql_crawl.cursor.execute(f"select channel_sku,brand,product_name,channel_product_classify,create_time,channel_id,id,channel_product_id from product where product_id='{product_id}'")
......@@ -963,11 +1044,16 @@ class crawl_data_fetch():
return data
def run(self):
sku_list = self.get_crawl_sku()
product_id_list,sku_list = self.get_crawl_sku()
data_dict = {}
for i in range(len(sku_list)):
try:
data = self.get_data_crawl(sku_list[i])
sku = sku_list[i]
self.sql_crawl.cursor.execute(f"select count(sku) from product_all where sku='{sku}'")
sku_count = self.sql_crawl.cursor.fetchone()[0]
if sku_count > 1:
continue
data = self.get_data_crawl(product_id_list[i])
data_dict[data[0]] = {
'品牌':data[1],
'指数品牌':data[12],
......@@ -999,13 +1085,14 @@ class crawl_data_fetch():
return False
def crawl_data_run():
os.chdir(r'/root/program/newProductCheck/online_progrom/code/API_data')
Get_new()
check_and_match()#张楷部分。
thread_JD = myThread_crawl('JD')
thread_GM = myThread_crawl('GM')
thread_SN = myThread_crawl('SN')
thread_OTHERS = myThread_crawl('OTHERS')
'''
thread_JD.start()
thread_GM.start()
thread_SN.start()
......@@ -1015,7 +1102,7 @@ def crawl_data_run():
thread_GM.join()
thread_SN.join()
thread_OTHERS.join()
'''
if __name__ == '__main__':
'''
thread_DL = myThread('DL')
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment