Commit 26f3b891 authored by zhou

Update main_merge.py

parent cf74fcc1
...@@ -47,7 +47,10 @@ def param_extract_function(data_table,channel): ...@@ -47,7 +47,10 @@ def param_extract_function(data_table,channel):
product_table = pd.DataFrame(data_table, columns=['SKU', '指数品牌','指数品牌编码','产品名称','指数子类','指数子类编码','产品价格','url','品牌','newname']) product_table = pd.DataFrame(data_table, columns=['SKU', '指数品牌','指数品牌编码','产品名称','指数子类','指数子类编码','产品价格','url','品牌','newname'])
product_table.columns = ['productId', 'brandName','brandCode','producrName','categoryCodeName','categoryCode','price','url','original_brandName','newname'] product_table.columns = ['productId', 'brandName','brandCode','producrName','categoryCodeName','categoryCode','price','url','original_brandName','newname']
product_len = len(product_table['productId']) product_len = len(product_table['productId'])
product_table['batch'] = [now_time for i in range(product_len)] if channel in ['JD', 'SN', 'GM']:
product_table['batch'] = [f'{now_time}1' for i in range(product_len)]
else:
product_table['batch'] = [f'{now_time}3' for i in range(product_len)]
product_table['state'] = ['0' for i in range(product_len)] product_table['state'] = ['0' for i in range(product_len)]
product_table['source'] = [channel for i in range(product_len)] product_table['source'] = [channel for i in range(product_len)]
...@@ -88,7 +91,10 @@ def param_extract_function(data_table,channel): ...@@ -88,7 +91,10 @@ def param_extract_function(data_table,channel):
'subcategorycode':[] 'subcategorycode':[]
} }
m = 0 m = 0
now_time = f'{now_time}1' if channel in ['JD', 'SN', 'GM']:
now_time = f'{now_time}1'
else:
now_time = f'{now_time}3'
for categorycode in category_list: for categorycode in category_list:
sql_ZIdatabase.cursor.execute(f"select SubTitle from VW_Relation_Property where SubCategoryCode='{categorycode}' and (Isimportant = '1' or Ispeijian = '1')") sql_ZIdatabase.cursor.execute(f"select SubTitle from VW_Relation_Property where SubCategoryCode='{categorycode}' and (Isimportant = '1' or Ispeijian = '1')")
param_standard_lyst = [x[0] for x in sql_ZIdatabase.cursor.fetchall()] param_standard_lyst = [x[0] for x in sql_ZIdatabase.cursor.fetchall()]
...@@ -115,15 +121,20 @@ def param_extract_function(data_table,channel): ...@@ -115,15 +121,20 @@ def param_extract_function(data_table,channel):
crawler = CRAWLER(categorycode) crawler = CRAWLER(categorycode)
try: try:
model = LSTMNER(categorycode) model = LSTMNER(categorycode)
for SKU,name,params,producttype in zip(data_categorycode['SKU'],data_categorycode['产品名称'],data_categorycode['参数项'],data_categorycode['产品型号']): for SKU,name,params,producttype,productbrand in zip(data_categorycode['SKU'],data_categorycode['产品名称'],data_categorycode['参数项'],data_categorycode['产品型号'],data_categorycode['指数品牌']): #爬取添加品牌
m += 1 m += 1
print(m,end='\r') print(m,end='\r')
product_param_dict = {}#原始参数项对应 product_param_dict = {}#原始参数项对应
if '/' in productbrand:#爬取添加品牌
thebrand = productbrand.split('/')[0]#爬取添加品牌
else:#爬取添加品牌
thebrand = productbrand#爬取添加品牌
try: try:
params = eval(params) params = eval(params)
except TypeError: except TypeError:
pass pass
for param_key in params.keys():
for param_key in list(params.keys()):
if '\t' in param_key or ' ' in param_key: if '\t' in param_key or ' ' in param_key:
params[param_key.replace('\t','').replace(' ','')] = params.pop(param_key) #去除参数项的特殊符号 params[param_key.replace('\t','').replace(' ','')] = params.pop(param_key) #去除参数项的特殊符号
for param_key_2 in params.keys(): for param_key_2 in params.keys():
...@@ -138,7 +149,11 @@ def param_extract_function(data_table,channel): ...@@ -138,7 +149,11 @@ def param_extract_function(data_table,channel):
name_param_dict = model.param_extract(name)#名称提取参数项 name_param_dict = model.param_extract(name)#名称提取参数项
if categorycode in crawl_category_list: if categorycode in crawl_category_list:
try: try:
crawl_param_dict = crawler.crawl_zol(producttype) if thebrand != '没有对应指数品牌':#爬取添加品牌
producttype_new = f"{thebrand} {producttype}"#爬取添加品牌
crawl_param_dict = crawler.crawl_zol(producttype_new)#爬取添加品牌
else:
crawl_param_dict = crawler.crawl_zol(producttype)#爬取添加品牌
except TypeError: except TypeError:
crawl_param_dict = {} crawl_param_dict = {}
else: else:
...@@ -204,15 +219,19 @@ def param_extract_function(data_table,channel): ...@@ -204,15 +219,19 @@ def param_extract_function(data_table,channel):
model.clean() model.clean()
except FileNotFoundError: except FileNotFoundError:
for SKU,name,params,producttype in zip(data_categorycode['SKU'],data_categorycode['产品名称'],data_categorycode['参数项'],data_categorycode['产品型号']): for SKU,name,params,producttype,productbrand in zip(data_categorycode['SKU'],data_categorycode['产品名称'],data_categorycode['参数项'],data_categorycode['产品型号'],data_categorycode['指数品牌']): #爬取添加品牌
m += 1 m += 1
print(m,end='\r') print(m,end='\r')
product_param_dict = {}#原始参数项对应 product_param_dict = {}#原始参数项对应
if '/' in productbrand:#爬取添加品牌
thebrand = productbrand.split('/')[0]#爬取添加品牌
else:#爬取添加品牌
thebrand = productbrand#爬取添加品牌
try: try:
params = eval(params) params = eval(params)
except TypeError: except TypeError:
pass pass
for param_key in params.keys(): for param_key in list(params.keys()):
if '\t' in param_key or ' ' in param_key: if '\t' in param_key or ' ' in param_key:
params[param_key.replace('\t','').replace(' ','')] = params.pop(param_key) #去除参数项的特殊符号 params[param_key.replace('\t','').replace(' ','')] = params.pop(param_key) #去除参数项的特殊符号
for param_key_2 in params.keys(): for param_key_2 in params.keys():
...@@ -226,7 +245,11 @@ def param_extract_function(data_table,channel): ...@@ -226,7 +245,11 @@ def param_extract_function(data_table,channel):
product_param_dict['产品型号'] = producttype product_param_dict['产品型号'] = producttype
if categorycode in crawl_category_list: if categorycode in crawl_category_list:
try: try:
crawl_param_dict = crawler.crawl_zol(producttype) if thebrand != '没有对应指数品牌':#爬取添加品牌
producttype_new = f"{thebrand} {producttype}"#爬取添加品牌
crawl_param_dict = crawler.crawl_zol(producttype_new)#爬取添加品牌
else:
crawl_param_dict = crawler.crawl_zol(producttype)#爬取添加品牌
except TypeError: except TypeError:
crawl_param_dict = {} crawl_param_dict = {}
else: else:
...@@ -277,7 +300,10 @@ def param_extract_function(data_table,channel): ...@@ -277,7 +300,10 @@ def param_extract_function(data_table,channel):
update_table_df.to_sql('Product_Parameter_Process', sql_ZIdatabase.engine, if_exists='append', index=False) update_table_df.to_sql('Product_Parameter_Process', sql_ZIdatabase.engine, if_exists='append', index=False)
product_table.to_sql('Product_Api_Data', sql_ZIdatabase.engine, if_exists='append', index=False) product_table.to_sql('Product_Api_Data', sql_ZIdatabase.engine, if_exists='append', index=False)
update_sku_list = set(list(product_table['productId'])) update_sku_list = set(list(product_table['productId']))
channel_sql = sql_find(database=f'ZH_{channel}',localhost=False) if channel == '重庆':
channel_sql = sql_find(database=f'chongqing_scrapy_db',localhost=False)
else:
channel_sql = sql_find(database=f'ZH_{channel}',localhost=False)
for sku in update_sku_list: for sku in update_sku_list:
channel_sql.cursor.execute(f"update product_all set state='9' where sku='{sku}' and state='8'") channel_sql.cursor.execute(f"update product_all set state='9' where sku='{sku}' and state='8'")
print('结束!') print('结束!')
...@@ -287,6 +313,7 @@ def return_error(table): ...@@ -287,6 +313,7 @@ def return_error(table):
sql_zi_zh = sql_find('zi_zh', False) sql_zi_zh = sql_find('zi_zh', False)
NowTime = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime()) NowTime = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime())
#table = pd.read_excel('api_data/ZH_SN_no_category_data_2019-12-27.xlsx', dtype = {'SKU' : str}) #table = pd.read_excel('api_data/ZH_SN_no_category_data_2019-12-27.xlsx', dtype = {'SKU' : str})
table = table[['SKU','产品名称','url','来源']] table = table[['SKU','产品名称','url','来源']]
l = len(table) l = len(table)
create_time_list = [] create_time_list = []
...@@ -299,6 +326,23 @@ def return_error(table): ...@@ -299,6 +326,23 @@ def return_error(table):
table.columns = ['sku','name','url','source','remark','create_time'] table.columns = ['sku','name','url','source','remark','create_time']
table.to_sql('API_returnErrorData', sql_zi_zh.engine, if_exists='append', index=False) table.to_sql('API_returnErrorData', sql_zi_zh.engine, if_exists='append', index=False)
def return_remark_error(table):
    """Append error rows that already carry a ``remark`` to ``API_returnErrorData``.

    The columns ``SKU``, ``产品名称``, ``url``, ``来源`` and ``remark`` are
    selected from ``table``, a ``create_time`` column holding the current
    local time is added, the columns are renamed to the database names and
    the rows are appended to the ``API_returnErrorData`` table.

    Args:
        table: DataFrame containing at least the five columns listed above.
            It is not modified; the function works on its own copy.

    Returns:
        None. The only effect is the rows written through the engine of the
        connection object returned by ``sql_find('zi_zh', False)``.
    """
    sql_zi_zh = sql_find('zi_zh', False)
    now_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
    # Take an explicit copy of the column subset: adding a column to the bare
    # selection is the pattern pandas reports with SettingWithCopyWarning.
    error_table = table[['SKU', '产品名称', 'url', '来源', 'remark']].copy()
    # A scalar is broadcast to every row, so no per-row list has to be built.
    error_table['create_time'] = now_time
    error_table.columns = ['sku', 'name', 'url', 'source', 'remark', 'create_time']
    error_table.to_sql('API_returnErrorData', sql_zi_zh.engine, if_exists='append', index=False)
def return_multi_channel(table): def return_multi_channel(table):
sql_zi_zh = sql_find('zi_zh', False) sql_zi_zh = sql_find('zi_zh', False)
...@@ -423,7 +467,7 @@ class data_fetch(): ...@@ -423,7 +467,7 @@ class data_fetch():
if ZI_SubCategory_data[1] is None: if ZI_SubCategory_data[1] is None:
ZI_SubCategoryCode = '没有匹配的指数子类编码' ZI_SubCategoryCode = '没有匹配的指数子类编码'
else: else:
ZI_SubCategoryCode = ZI_SubCategory_data[1] ZI_SubCategoryCode = ZI_SubCategory_data[1].zfill(4)
data.append(ZI_SubCategoryCode) data.append(ZI_SubCategoryCode)
if ZI_SubCategory_data[0] is None: if ZI_SubCategory_data[0] is None:
ZIname = '没有匹配的指数子类编码' ZIname = '没有匹配的指数子类编码'
...@@ -588,7 +632,7 @@ class data_fetch(): ...@@ -588,7 +632,7 @@ class data_fetch():
sql_zi_zh.cursor(f"insert into filestep(period,step,initfilename,filepath) values('{StyleTime}','3','{self.source}_data_category_data_{StyleTime}.xlsx','/root/program/newProductCheck/online_progrom/code/data/{StyleTime}_step3_needs_check')")#存入数据库,页面获取下载。 sql_zi_zh.cursor(f"insert into filestep(period,step,initfilename,filepath) values('{StyleTime}','3','{self.source}_data_category_data_{StyleTime}.xlsx','/root/program/newProductCheck/online_progrom/code/data/{StyleTime}_step3_needs_check')")#存入数据库,页面获取下载。
return data_table return data_table
'''
class myThread_crawl(threading.Thread): class myThread_crawl(threading.Thread):
def __init__(self, channel): def __init__(self, channel):
threading.Thread.__init__(self) threading.Thread.__init__(self)
...@@ -605,6 +649,23 @@ class myThread_crawl(threading.Thread): ...@@ -605,6 +649,23 @@ class myThread_crawl(threading.Thread):
else: else:
param_extract_function_crawl(crawl_table, 'LXWL') param_extract_function_crawl(crawl_table, 'LXWL')
print("退出线程:" + self.channel) print("退出线程:" + self.channel)
'''
class myThread_crawl():
    """Run the crawl-data pipeline for one channel while being constructed.

    Instantiating the class fetches the crawl table for ``channel`` through
    ``crawl_data_fetch(...).run()`` and, unless that call returned a bool,
    passes the table to ``param_extract_function_crawl``. The channels
    ``JD``, ``SN`` and ``GM`` are forwarded under their own name; every other
    channel is forwarded as ``'LXWL'``.

    Attributes set on the instance:
        channel: the channel name given to the constructor.
        data_get: the ``crawl_data_fetch`` object created for that channel.
    """

    def __init__(self, channel):
        self.channel = channel
        self.data_get = crawl_data_fetch(channel=self.channel)
        print("开始:" + self.channel)
        fetched = self.data_get.run()
        # A bool result from run() means there is no table to hand on.
        if not isinstance(fetched, bool):
            own_channels = ('JD', 'SN', 'GM')
            target = self.channel if self.channel in own_channels else 'LXWL'
            param_extract_function_crawl(fetched, target)
        print("退出:" + self.channel)
def param_extract_function_crawl(data_table,channel): def param_extract_function_crawl(data_table,channel):
f_crawl = open('crawl_categorycode.txt', 'r', encoding='utf-8') f_crawl = open('crawl_categorycode.txt', 'r', encoding='utf-8')
...@@ -620,7 +681,7 @@ def param_extract_function_crawl(data_table,channel): ...@@ -620,7 +681,7 @@ def param_extract_function_crawl(data_table,channel):
product_table = pd.DataFrame(data_table, columns=['SKU', '指数品牌','指数品牌编码','产品名称','指数子类','指数子类编码','产品价格','url','品牌','来源']) product_table = pd.DataFrame(data_table, columns=['SKU', '指数品牌','指数品牌编码','产品名称','指数子类','指数子类编码','产品价格','url','品牌','来源'])
product_table.columns = ['productId', 'brandName','brandCode','producrName','categoryCodeName','categoryCode','price','url','original_brandName','source'] product_table.columns = ['productId', 'brandName','brandCode','producrName','categoryCodeName','categoryCode','price','url','original_brandName','source']
product_len = len(product_table['productId']) product_len = len(product_table['productId'])
product_table['batch'] = [now_time for i in range(product_len)] product_table['batch'] = [f'{now_time}2' for i in range(product_len)]
product_table['state'] = ['0' for i in range(product_len)] product_table['state'] = ['0' for i in range(product_len)]
if os.path.isfile(f'{channel}_参数对应表_{now_time}.xlsx'): if os.path.isfile(f'{channel}_参数对应表_{now_time}.xlsx'):
...@@ -687,15 +748,20 @@ def param_extract_function_crawl(data_table,channel): ...@@ -687,15 +748,20 @@ def param_extract_function_crawl(data_table,channel):
crawler = CRAWLER(categorycode) crawler = CRAWLER(categorycode)
try: try:
model = LSTMNER(categorycode) model = LSTMNER(categorycode)
for SKU,name,params,producttype,product_channel in zip(data_categorycode['SKU'],data_categorycode['产品名称'],data_categorycode['参数项'],data_categorycode['产品型号'],data_categorycode['来源']): for SKU,name,params,producttype,product_channel,productbrand in zip(data_categorycode['SKU'],data_categorycode['产品名称'],data_categorycode['参数项'],data_categorycode['产品型号'],data_categorycode['来源'],data_categorycode['指数品牌']):#爬取添加品牌
m += 1 m += 1
print(m,end='\r') print(m,end='\r')
product_param_dict = {}#原始参数项对应 product_param_dict = {}#原始参数项对应
if '/' in productbrand:#爬取添加品牌
thebrand = productbrand.split('/')[0]#爬取添加品牌
else:#爬取添加品牌
thebrand = productbrand#爬取添加品牌
try: try:
params = eval(params) params = eval(params)
except TypeError: except TypeError:
pass pass
for param_key in params.keys():
for param_key in list(params.keys()):
if '\t' in param_key or ' ' in param_key: if '\t' in param_key or ' ' in param_key:
params[param_key.replace('\t','').replace(' ','')] = params.pop(param_key) #去除参数项的特殊符号 params[param_key.replace('\t','').replace(' ','')] = params.pop(param_key) #去除参数项的特殊符号
for param_key_2 in params.keys(): for param_key_2 in params.keys():
...@@ -710,7 +776,11 @@ def param_extract_function_crawl(data_table,channel): ...@@ -710,7 +776,11 @@ def param_extract_function_crawl(data_table,channel):
name_param_dict = model.param_extract(name)#名称提取参数项 name_param_dict = model.param_extract(name)#名称提取参数项
if categorycode in crawl_category_list: if categorycode in crawl_category_list:
try: try:
crawl_param_dict = crawler.crawl_zol(producttype) if thebrand != '没有对应指数品牌':#爬取添加品牌
producttype_new = f"{thebrand} {producttype}"#爬取添加品牌
crawl_param_dict = crawler.crawl_zol(producttype_new)#爬取添加品牌
else:
crawl_param_dict = crawler.crawl_zol(producttype)#爬取添加品牌
except TypeError: except TypeError:
crawl_param_dict = {} crawl_param_dict = {}
else: else:
...@@ -776,15 +846,19 @@ def param_extract_function_crawl(data_table,channel): ...@@ -776,15 +846,19 @@ def param_extract_function_crawl(data_table,channel):
model.clean() model.clean()
except FileNotFoundError: except FileNotFoundError:
for SKU,name,params,producttype in zip(data_categorycode['SKU'],data_categorycode['产品名称'],data_categorycode['参数项'],data_categorycode['产品型号'],data_categorycode['来源']): for SKU,name,params,producttype,product_channel,productbrand in zip(data_categorycode['SKU'],data_categorycode['产品名称'],data_categorycode['参数项'],data_categorycode['产品型号'],data_categorycode['来源'],data_categorycode['指数品牌']):#爬取添加品牌
m += 1 m += 1
print(m,end='\r') print(m,end='\r')
product_param_dict = {}#原始参数项对应 product_param_dict = {}#原始参数项对应
if '/' in productbrand:#爬取添加品牌
thebrand = productbrand.split('/')[0]#爬取添加品牌
else:#爬取添加品牌
thebrand = productbrand#爬取添加品牌
try: try:
params = eval(params) params = eval(params)
except TypeError: except TypeError:
pass pass
for param_key in params.keys(): for param_key in list(params.keys()):
if '\t' in param_key or ' ' in param_key: if '\t' in param_key or ' ' in param_key:
params[param_key.replace('\t','').replace(' ','')] = params.pop(param_key) #去除参数项的特殊符号 params[param_key.replace('\t','').replace(' ','')] = params.pop(param_key) #去除参数项的特殊符号
for param_key_2 in params.keys(): for param_key_2 in params.keys():
...@@ -798,7 +872,11 @@ def param_extract_function_crawl(data_table,channel): ...@@ -798,7 +872,11 @@ def param_extract_function_crawl(data_table,channel):
product_param_dict['产品型号'] = producttype product_param_dict['产品型号'] = producttype
if categorycode in crawl_category_list: if categorycode in crawl_category_list:
try: try:
crawl_param_dict = crawler.crawl_zol(producttype) if thebrand != '没有对应指数品牌':#爬取添加品牌
producttype_new = f"{thebrand} {producttype}"#爬取添加品牌
crawl_param_dict = crawler.crawl_zol(producttype_new)#爬取添加品牌
else:
crawl_param_dict = crawler.crawl_zol(producttype)#爬取添加品牌
except TypeError: except TypeError:
crawl_param_dict = {} crawl_param_dict = {}
else: else:
...@@ -899,16 +977,19 @@ class crawl_data_fetch(): ...@@ -899,16 +977,19 @@ class crawl_data_fetch():
def get_crawl_sku(self): def get_crawl_sku(self):
if self.channel in ['JD','SN','GM']: if self.channel in ['JD','SN','GM']:
self.sql_crawl.cursor.execute(f"select product_id from product_all where productcode is null and state='1' and (ziying='1' or ziying is null) and (dingzhi='1' or ziying is null) and source='{self.channel}'") self.sql_crawl.cursor.execute(f"select product_id,sku from product_all where productcode is null and state='1' and (ziying='1' or ziying is null) and (dingzhi='1' or ziying is null) and source='{self.channel}'")
else: else:
self.sql_crawl.cursor.execute(f"select product_id from product_all where productcode is null and state='1' and (ziying='1' or ziying is null) and (dingzhi='1' or ziying is null) and (source <> 'JD' and source <> 'GM' and source <>'SN')") self.sql_crawl.cursor.execute(f"select product_id,sku from product_all where productcode is null and state='1' and (ziying='1' or ziying is null) and (dingzhi='1' or ziying is null) and (source <> 'JD' and source <> 'GM' and source <>'SN')")
sku_data = self.sql_crawl.cursor.fetchall() sku_data = self.sql_crawl.cursor.fetchall()
product_id_list = []
sku_list = [] sku_list = []
for sku in sku_data: for element in sku_data:
sku = sku[0] product_id = element[0]
sku = element[1]
product_id_list.append(product_id)
sku_list.append(sku) sku_list.append(sku)
sku_set = list(set(sku_list))
return sku_set return product_id_list,sku_list
def get_data_crawl(self, product_id): def get_data_crawl(self, product_id):
self.sql_crawl.cursor.execute(f"select channel_sku,brand,product_name,channel_product_classify,create_time,channel_id,id,channel_product_id from product where product_id='{product_id}'") self.sql_crawl.cursor.execute(f"select channel_sku,brand,product_name,channel_product_classify,create_time,channel_id,id,channel_product_id from product where product_id='{product_id}'")
...@@ -963,11 +1044,16 @@ class crawl_data_fetch(): ...@@ -963,11 +1044,16 @@ class crawl_data_fetch():
return data return data
def run(self): def run(self):
sku_list = self.get_crawl_sku() product_id_list,sku_list = self.get_crawl_sku()
data_dict = {} data_dict = {}
for i in range(len(sku_list)): for i in range(len(sku_list)):
try: try:
data = self.get_data_crawl(sku_list[i]) sku = sku_list[i]
self.sql_crawl.cursor.execute(f"select count(sku) from product_all where sku='{sku}'")
sku_count = self.sql_crawl.cursor.fetchone()[0]
if sku_count > 1:
continue
data = self.get_data_crawl(product_id_list[i])
data_dict[data[0]] = { data_dict[data[0]] = {
'品牌':data[1], '品牌':data[1],
'指数品牌':data[12], '指数品牌':data[12],
...@@ -999,13 +1085,14 @@ class crawl_data_fetch(): ...@@ -999,13 +1085,14 @@ class crawl_data_fetch():
return False return False
def crawl_data_run(): def crawl_data_run():
os.chdir(r'/root/program/newProductCheck/online_progrom/code/API_data')
Get_new() Get_new()
check_and_match()#张楷部分。 check_and_match()#张楷部分。
thread_JD = myThread_crawl('JD') thread_JD = myThread_crawl('JD')
thread_GM = myThread_crawl('GM') thread_GM = myThread_crawl('GM')
thread_SN = myThread_crawl('SN') thread_SN = myThread_crawl('SN')
thread_OTHERS = myThread_crawl('OTHERS') thread_OTHERS = myThread_crawl('OTHERS')
'''
thread_JD.start() thread_JD.start()
thread_GM.start() thread_GM.start()
thread_SN.start() thread_SN.start()
...@@ -1015,7 +1102,7 @@ def crawl_data_run(): ...@@ -1015,7 +1102,7 @@ def crawl_data_run():
thread_GM.join() thread_GM.join()
thread_SN.join() thread_SN.join()
thread_OTHERS.join() thread_OTHERS.join()
'''
if __name__ == '__main__': if __name__ == '__main__':
''' '''
thread_DL = myThread('DL') thread_DL = myThread('DL')
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment