Update main_merge.py

26f3b891 · zhou · cf74fcc1 · 26f3b891
Commit 26f3b891 authored Mar 18, 2020 by zhou
Show whitespace changes
Inline Side-by-side

Showing with 115 additions and 28 deletions

main_merge.py main_merge.py +115 -28

No files found.
--- a/main_merge.py
+++ b/main_merge.py
@@ -47,7 +47,10 @@ def param_extract_function(data_table,channel):
    product_table = pd.DataFrame(data_table, columns=['SKU', '指数品牌','指数品牌编码','产品名称','指数子类','指数子类编码','产品价格','url','品牌','newname'])
    product_table.columns = ['productId', 'brandName','brandCode','producrName','categoryCodeName','categoryCode','price','url','original_brandName','newname']
    product_len = len(product_table['productId'])
-    product_table['batch'] = [now_time for i in range(product_len)]
+    if channel in ['JD', 'SN', 'GM']:
+        product_table['batch'] = [f'{now_time}1' for i in range(product_len)]
+    else:
+        product_table['batch'] = [f'{now_time}3' for i in range(product_len)]
    product_table['state'] = ['0' for i in range(product_len)]
    product_table['source'] = [channel for i in range(product_len)]

@@ -88,7 +91,10 @@ def param_extract_function(data_table,channel):
                    'subcategorycode':[]
                    }
    m = 0
+    if channel in ['JD', 'SN', 'GM']:
        now_time = f'{now_time}1'
+    else:
+        now_time = f'{now_time}3'
    for categorycode in category_list:
        sql_ZIdatabase.cursor.execute(f"select SubTitle from VW_Relation_Property where SubCategoryCode='{categorycode}' and (Isimportant = '1' or Ispeijian = '1')")
        param_standard_lyst = [x[0] for x in sql_ZIdatabase.cursor.fetchall()]
@@ -115,15 +121,20 @@ def param_extract_function(data_table,channel):
        crawler = CRAWLER(categorycode)
        try:
            model = LSTMNER(categorycode)
-            for SKU,name,params,producttype in zip(data_categorycode['SKU'],data_categorycode['产品名称'],data_categorycode['参数项'],data_categorycode['产品型号']):
+            for SKU,name,params,producttype,productbrand in zip(data_categorycode['SKU'],data_categorycode['产品名称'],data_categorycode['参数项'],data_categorycode['产品型号'],data_categorycode['指数品牌']):  #爬取添加品牌
                m += 1
                print(m,end='\r')
                product_param_dict = {}#原始参数项对应
+                if '/' in productbrand:#爬取添加品牌
+                    thebrand = productbrand.split('/')[0]#爬取添加品牌
+                else:#爬取添加品牌
+                    thebrand = productbrand#爬取添加品牌
                try:
                    params = eval(params)
                except TypeError:
                    pass
-                for param_key in params.keys():
+                
+                for param_key in list(params.keys()):
                    if '\t' in param_key or ' ' in param_key:
                        params[param_key.replace('\t','').replace(' ','')] = params.pop(param_key)        #去除参数项的特殊符号
                for param_key_2 in params.keys():
@@ -138,7 +149,11 @@ def param_extract_function(data_table,channel):
                name_param_dict = model.param_extract(name)#名称提取参数项
                if categorycode in crawl_category_list:
                    try:
-                        crawl_param_dict = crawler.crawl_zol(producttype)
+                        if thebrand != '没有对应指数品牌':#爬取添加品牌
+                            producttype_new = f"{thebrand} {producttype}"#爬取添加品牌
+                            crawl_param_dict = crawler.crawl_zol(producttype_new)#爬取添加品牌
+                        else:
+                            crawl_param_dict = crawler.crawl_zol(producttype)#爬取添加品牌
                    except TypeError:
                        crawl_param_dict = {}
                else:
@@ -204,15 +219,19 @@ def param_extract_function(data_table,channel):
            model.clean()

        except FileNotFoundError:
-            for SKU,name,params,producttype in zip(data_categorycode['SKU'],data_categorycode['产品名称'],data_categorycode['参数项'],data_categorycode['产品型号']):
+            for SKU,name,params,producttype,productbrand in zip(data_categorycode['SKU'],data_categorycode['产品名称'],data_categorycode['参数项'],data_categorycode['产品型号'],data_categorycode['指数品牌']):     #爬取添加品牌
                m += 1
                print(m,end='\r')
                product_param_dict = {}#原始参数项对应
+                if '/' in productbrand:#爬取添加品牌
+                    thebrand = productbrand.split('/')[0]#爬取添加品牌
+                else:#爬取添加品牌
+                    thebrand = productbrand#爬取添加品牌
                try:
                    params = eval(params)
                except TypeError:
                    pass
-                for param_key in params.keys():
+                for param_key in list(params.keys()):
                    if '\t' in param_key or ' ' in param_key:
                        params[param_key.replace('\t','').replace(' ','')] = params.pop(param_key)        #去除参数项的特殊符号
                for param_key_2 in params.keys():
@@ -226,7 +245,11 @@ def param_extract_function(data_table,channel):
                    product_param_dict['产品型号'] = producttype
                if categorycode in crawl_category_list:
                    try:
-                        crawl_param_dict = crawler.crawl_zol(producttype)
+                        if thebrand != '没有对应指数品牌':#爬取添加品牌
+                            producttype_new = f"{thebrand} {producttype}"#爬取添加品牌
+                            crawl_param_dict = crawler.crawl_zol(producttype_new)#爬取添加品牌
+                        else:
+                            crawl_param_dict = crawler.crawl_zol(producttype)#爬取添加品牌
                    except TypeError:
                        crawl_param_dict = {}
                else:
@@ -277,6 +300,9 @@ def param_extract_function(data_table,channel):
    update_table_df.to_sql('Product_Parameter_Process', sql_ZIdatabase.engine, if_exists='append', index=False)
    product_table.to_sql('Product_Api_Data', sql_ZIdatabase.engine, if_exists='append', index=False)
    update_sku_list = set(list(product_table['productId']))
+    if channel == '重庆':
+        channel_sql = sql_find(database=f'chongqing_scrapy_db',localhost=False)
+    else:
        channel_sql = sql_find(database=f'ZH_{channel}',localhost=False)
    for sku in update_sku_list:
        channel_sql.cursor.execute(f"update product_all set state='9' where sku='{sku}' and state='8'")
@@ -287,6 +313,7 @@ def return_error(table):
    sql_zi_zh = sql_find('zi_zh', False)
    NowTime = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime())
    #table = pd.read_excel('api_data/ZH_SN_no_category_data_2019-12-27.xlsx', dtype = {'SKU' : str})
+    
    table = table[['SKU','产品名称','url','来源']]
    l = len(table)
    create_time_list = []
@@ -300,6 +327,23 @@ def return_error(table):
    table.columns = ['sku','name','url','source','remark','create_time']
    table.to_sql('API_returnErrorData', sql_zi_zh.engine, if_exists='append', index=False)
    
+def return_remark_error(table):  # Append rows of `table` to API_returnErrorData; unlike return_error (which selects four columns), the caller must already supply a 'remark' column.
+    sql_zi_zh = sql_find('zi_zh', False)  # sql_find is defined outside this view; only the returned object's .engine is used here
+    NowTime = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime())  # one local-time stamp, reused for every row
+    #table = pd.read_excel('api_data/ZH_SN_no_category_data_2019-12-27.xlsx', dtype = {'SKU' : str})
+    
+    table = table[['SKU','产品名称','url','来源','remark']]  # keep only the five columns that get written out, in this order
+    l = len(table)  # row count, used to size the timestamp column
+    create_time_list = []
+
+    for i in range(l):  # build one NowTime entry per row
+        create_time_list.append(NowTime)
+
+    table['create_time'] = create_time_list  # NOTE(review): column assignment on a column-subset frame may trigger pandas' SettingWithCopyWarning - confirm
+
+    table.columns = ['sku','name','url','source','remark','create_time']  # positional rename of the six columns to the names used for the insert
+    table.to_sql('API_returnErrorData', sql_zi_zh.engine, if_exists='append', index=False)  # append rows to the existing table; the DataFrame index is not written
+
 def return_multi_channel(table):
    sql_zi_zh = sql_find('zi_zh', False)
    NowTime = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime())
@@ -423,7 +467,7 @@ class data_fetch():
        if ZI_SubCategory_data[1] is None:
            ZI_SubCategoryCode = '没有匹配的指数子类编码'
        else:
-            ZI_SubCategoryCode = ZI_SubCategory_data[1]
+            ZI_SubCategoryCode = ZI_SubCategory_data[1].zfill(4)
        data.append(ZI_SubCategoryCode)
        if ZI_SubCategory_data[0] is None:
            ZIname = '没有匹配的指数子类编码'
@@ -588,7 +632,7 @@ class data_fetch():
        sql_zi_zh.cursor(f"insert into filestep(period,step,initfilename,filepath) values('{StyleTime}','3','{self.source}_data_category_data_{StyleTime}.xlsx','/root/program/newProductCheck/online_progrom/code/data/{StyleTime}_step3_needs_check')")#存入数据库，页面获取下载。
        
        return data_table
-
+'''
 class myThread_crawl(threading.Thread):
    def __init__(self, channel):
        threading.Thread.__init__(self)
@@ -605,6 +649,23 @@ class myThread_crawl(threading.Thread):
            else:
                param_extract_function_crawl(crawl_table, 'LXWL')
        print("退出线程：" + self.channel)
+'''
+
+class myThread_crawl():  # Plain-class replacement for the threading.Thread version that this commit wraps in a ''' string; all work runs inside __init__.
+    def __init__(self, channel):  # constructing an instance fetches and processes the crawl data for `channel` synchronously
+        self.channel = channel
+        self.data_get = crawl_data_fetch(channel = self.channel)
+        print ("开始：" + self.channel)  # progress message printed before the fetch
+        crawl_table = self.data_get.run()
+        if isinstance(crawl_table,bool):  # a bool result means there is no table to process, so extraction is skipped
+            pass
+        else:
+            if self.channel in ['JD','SN','GM']:  # these three channels are processed under their own name
+                param_extract_function_crawl(crawl_table, self.channel)
+            else:
+                param_extract_function_crawl(crawl_table, 'LXWL')  # every other channel is processed under the 'LXWL' label
+        print("退出：" + self.channel)  # progress message printed after processing
+        return None  # __init__ must return None; this line is explicit but redundant

 def param_extract_function_crawl(data_table,channel):
    f_crawl = open('crawl_categorycode.txt', 'r', encoding='utf-8')
@@ -620,7 +681,7 @@ def param_extract_function_crawl(data_table,channel):
    product_table = pd.DataFrame(data_table, columns=['SKU', '指数品牌','指数品牌编码','产品名称','指数子类','指数子类编码','产品价格','url','品牌','来源'])
    product_table.columns = ['productId', 'brandName','brandCode','producrName','categoryCodeName','categoryCode','price','url','original_brandName','source']
    product_len = len(product_table['productId'])
-    product_table['batch'] = [now_time for i in range(product_len)]
+    product_table['batch'] = [f'{now_time}2' for i in range(product_len)]
    product_table['state'] = ['0' for i in range(product_len)]

    if os.path.isfile(f'{channel}_参数对应表_{now_time}.xlsx'):
@@ -687,15 +748,20 @@ def param_extract_function_crawl(data_table,channel):
        crawler = CRAWLER(categorycode)
        try:
            model = LSTMNER(categorycode)
-            for SKU,name,params,producttype,product_channel in zip(data_categorycode['SKU'],data_categorycode['产品名称'],data_categorycode['参数项'],data_categorycode['产品型号'],data_categorycode['来源']):
+            for SKU,name,params,producttype,product_channel,productbrand in zip(data_categorycode['SKU'],data_categorycode['产品名称'],data_categorycode['参数项'],data_categorycode['产品型号'],data_categorycode['来源'],data_categorycode['指数品牌']):#爬取添加品牌
                m += 1
                print(m,end='\r')
                product_param_dict = {}#原始参数项对应
+                if '/' in productbrand:#爬取添加品牌
+                    thebrand = productbrand.split('/')[0]#爬取添加品牌
+                else:#爬取添加品牌
+                    thebrand = productbrand#爬取添加品牌
                try:
                    params = eval(params)
                except TypeError:
                    pass
-                for param_key in params.keys():
+
+                for param_key in list(params.keys()):
                    if '\t' in param_key or ' ' in param_key:
                        params[param_key.replace('\t','').replace(' ','')] = params.pop(param_key)        #去除参数项的特殊符号
                for param_key_2 in params.keys():
@@ -710,7 +776,11 @@ def param_extract_function_crawl(data_table,channel):
                name_param_dict = model.param_extract(name)#名称提取参数项
                if categorycode in crawl_category_list:
                    try:
-                        crawl_param_dict = crawler.crawl_zol(producttype)
+                        if thebrand != '没有对应指数品牌':#爬取添加品牌
+                            producttype_new = f"{thebrand} {producttype}"#爬取添加品牌
+                            crawl_param_dict = crawler.crawl_zol(producttype_new)#爬取添加品牌
+                        else:
+                            crawl_param_dict = crawler.crawl_zol(producttype)#爬取添加品牌
                    except TypeError:
                        crawl_param_dict = {}
                else:
@@ -776,15 +846,19 @@ def param_extract_function_crawl(data_table,channel):
            model.clean()

        except FileNotFoundError:
-            for SKU,name,params,producttype in zip(data_categorycode['SKU'],data_categorycode['产品名称'],data_categorycode['参数项'],data_categorycode['产品型号'],data_categorycode['来源']):
+            for SKU,name,params,producttype,product_channel,productbrand in zip(data_categorycode['SKU'],data_categorycode['产品名称'],data_categorycode['参数项'],data_categorycode['产品型号'],data_categorycode['来源'],data_categorycode['指数品牌']):#爬取添加品牌
                m += 1
                print(m,end='\r')
                product_param_dict = {}#原始参数项对应
+                if '/' in productbrand:#爬取添加品牌
+                    thebrand = productbrand.split('/')[0]#爬取添加品牌
+                else:#爬取添加品牌
+                    thebrand = productbrand#爬取添加品牌
                try:
                    params = eval(params)
                except TypeError:
                    pass
-                for param_key in params.keys():
+                for param_key in list(params.keys()):
                    if '\t' in param_key or ' ' in param_key:
                        params[param_key.replace('\t','').replace(' ','')] = params.pop(param_key)        #去除参数项的特殊符号
                for param_key_2 in params.keys():
@@ -798,7 +872,11 @@ def param_extract_function_crawl(data_table,channel):
                    product_param_dict['产品型号'] = producttype
                if categorycode in crawl_category_list:
                    try:
-                        crawl_param_dict = crawler.crawl_zol(producttype)
+                        if thebrand != '没有对应指数品牌':#爬取添加品牌
+                            producttype_new = f"{thebrand} {producttype}"#爬取添加品牌
+                            crawl_param_dict = crawler.crawl_zol(producttype_new)#爬取添加品牌
+                        else:
+                            crawl_param_dict = crawler.crawl_zol(producttype)#爬取添加品牌
                    except TypeError:
                        crawl_param_dict = {}
                else:
@@ -899,16 +977,19 @@ class crawl_data_fetch():

    def get_crawl_sku(self):
        if self.channel in ['JD','SN','GM']:
-            self.sql_crawl.cursor.execute(f"select product_id from product_all where productcode is null and state='1' and (ziying='1' or ziying is null) and (dingzhi='1' or ziying is null) and source='{self.channel}'")
+            self.sql_crawl.cursor.execute(f"select product_id,sku from product_all where productcode is null and state='1' and (ziying='1' or ziying is null) and (dingzhi='1' or ziying is null) and source='{self.channel}'")
        else:
-            self.sql_crawl.cursor.execute(f"select product_id from product_all where productcode is null and state='1' and (ziying='1' or ziying is null) and (dingzhi='1' or ziying is null) and (source <> 'JD' and source <> 'GM' and source <>'SN')")
+            self.sql_crawl.cursor.execute(f"select product_id,sku from product_all where productcode is null and state='1' and (ziying='1' or ziying is null) and (dingzhi='1' or ziying is null) and (source <> 'JD' and source <> 'GM' and source <>'SN')")
        sku_data = self.sql_crawl.cursor.fetchall()
+        product_id_list = []
        sku_list = []
-        for sku in sku_data:
-            sku = sku[0]
+        for element in sku_data:
+            product_id = element[0]
+            sku = element[1]
+            product_id_list.append(product_id)
            sku_list.append(sku)
-        sku_set = list(set(sku_list))
-        return sku_set
+        
+        return product_id_list,sku_list
    
    def get_data_crawl(self, product_id):
        self.sql_crawl.cursor.execute(f"select channel_sku,brand,product_name,channel_product_classify,create_time,channel_id,id,channel_product_id from product where product_id='{product_id}'")
@@ -963,11 +1044,16 @@ class crawl_data_fetch():
        return data

    def run(self):
-        sku_list = self.get_crawl_sku()
+        product_id_list,sku_list = self.get_crawl_sku()
        data_dict = {}
        for i in range(len(sku_list)):
            try:
-                data = self.get_data_crawl(sku_list[i])
+                sku = sku_list[i]
+                self.sql_crawl.cursor.execute(f"select count(sku) from product_all where sku='{sku}'")
+                sku_count = self.sql_crawl.cursor.fetchone()[0]
+                if sku_count > 1:
+                    continue
+                data = self.get_data_crawl(product_id_list[i])
                data_dict[data[0]] = {
                    '品牌':data[1],
                    '指数品牌':data[12],
@@ -999,13 +1085,14 @@ class crawl_data_fetch():
            return False

 def crawl_data_run():
+    os.chdir(r'/root/program/newProductCheck/online_progrom/code/API_data')
    Get_new()
    check_and_match()#张楷部分。
    thread_JD = myThread_crawl('JD')
    thread_GM = myThread_crawl('GM')
    thread_SN = myThread_crawl('SN')
    thread_OTHERS = myThread_crawl('OTHERS')
-
+'''
    thread_JD.start()
    thread_GM.start()
    thread_SN.start()
@@ -1015,7 +1102,7 @@ def crawl_data_run():
    thread_GM.join()
    thread_SN.join()
    thread_OTHERS.join()
-
+'''
 if __name__ == '__main__':
    '''
    thread_DL = myThread('DL')