Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Z
ZHOUXINGYU_project
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
zhou
ZHOUXINGYU_project
Commits
a283447e
Commit
a283447e
authored
Jan 19, 2020
by
sanlu
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
爬取结束后,直接调用crawl_data_run()即可
parent
6cc425dc
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
298 additions
and
0 deletions
+298
-0
API_ALL.py
API_ALL.py
+295
-0
main_merge.py
main_merge.py
+3
-0
No files found.
API_ALL.py
0 → 100644
View file @
a283447e
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 19 10:30:51 2020
@author: Administrator
"""
import
pymssql
import
pandas
as
pd
from
lxml
import
etree
import
re
import
requests
import
json
import
time
import
datetime
import
os
def Get_new():
    """Copy not-yet-processed rows from ``product`` into ``product_all``.

    Selects every ``product`` row whose ``isdo`` flag is NULL, marks it as
    processed (``isdo='1'``) and inserts the relevant columns into
    ``product_all``.  Runs against the ``reverse_data`` database with
    autocommit enabled, so each statement commits immediately.

    Returns:
        None
    """
    # NOTE(review): credentials are hard-coded; consider moving them to
    # configuration / environment variables.
    conn = pymssql.connect(
        host='123.56.115.207',
        user='zgcprice3311',
        password='admin@2018@)!*',
        database='reverse_data',
        autocommit=True,
    )
    cur = conn.cursor()
    try:
        cur.execute("select * from product where isdo is null ")
        rows = cur.fetchall()
        # `col` instead of the original `tuple`, which shadowed the builtin.
        df = pd.DataFrame(rows, columns=[col[0] for col in cur.description])
        for _, dt in df.iterrows():
            dt_id = dt.product_id
            dt_sku = dt.channel_sku
            # Replace ASCII single quotes in the name, as the original did;
            # kept so stored data stays byte-compatible with existing rows.
            dt_name = dt.product_name.replace('\'', '‘')
            dt_sub = dt.channel_product_classify
            # Bug fix: strip('') stripped nothing; strip() trims whitespace.
            dt_brand = dt['brand'].strip()
            dt_url = dt.channel_product_id
            dt_frm = dt.channel_id
            # Parameterized statements instead of f-string SQL
            # (injection-safe and quote-safe; pymssql uses %s placeholders).
            cur.execute(
                "update product set isdo='1' where product_id = %s",
                (dt_id,),
            )
            cur.execute(
                "insert into product_all"
                "(sku,name,brand,category,url,source,product_id) "
                "values (%s,%s,%s,%s,%s,%s,%s)",
                (dt_sku, dt_name, dt_brand, dt_sub, dt_url, dt_frm, dt_id),
            )
    finally:
        # Close cursor before connection (the original closed the
        # connection first, leaking the cursor if close() raised).
        cur.close()
        conn.close()
    return None
def get_reponse(session, url, headers):
    """GET *url* with *session*, retrying on failure.

    Makes up to 10 attempts (one initial try plus 9 retries), each with a
    5-second timeout.  Any exception raised by the request, or a non-200
    status code, counts as a failure and triggers the next attempt.  The
    original version silently returned ``None`` when the first response
    was non-200; this version consistently returns ``-1`` on any failure,
    which is what callers test for (``r == -1``).

    Args:
        session: object with a ``get(url, headers=..., timeout=...)``
            method (normally a ``requests.Session``).
        url: target URL.
        headers: HTTP header dict passed through to ``session.get``.

    Returns:
        The response object on HTTP 200, or ``-1`` when every attempt
        fails (kept as -1 for backward compatibility with callers).
    """
    for attempt in range(10):
        if attempt:
            # Same progress message the original printed on each retry.
            print('请求超时,第%s次重复请求' % attempt)
        try:
            response = session.get(url, headers=headers, timeout=5)
        except Exception:
            # Narrowed from a bare except: still covers request errors but
            # no longer swallows KeyboardInterrupt/SystemExit.
            continue
        if response.status_code == 200:
            return response
    return -1
def check_and_match():
    """Scrape availability flags for unchecked ``product_all`` rows, then
    match their SKUs to internal product codes.

    Phase 1 -- for every ``product_all`` row whose ``state`` is NULL,
    fetch the product page from its source channel (JD / GM / SN,
    identified by the ``source`` column) and write back three flags:

    * ``state``   : '1' in stock, '0' out of stock / unpublished,
                    '5' scraping raised an exception
    * ``ziying``  : '1' first-party ("自营") listing, '0' third-party,
                    '5' scraping raised an exception
    * ``dingzhi`` : '0' product name contains a blacklisted keyword
                    (custom-made / military-related terms), '1' otherwise,
                    '5' scraping raised an exception

    Rows from any other source are flagged '1'/'1'/'1' without scraping.

    Phase 2 -- for JD/SN/GM rows, look up the internal product code in
    ``ZI_DataBase..productcode_sku`` and store it on ``product_all`` when
    the match is a clean 13-character code or explicitly multi-matched.

    Returns nothing; all results are persisted via autocommitted SQL.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'}
    session = requests.Session()
    # NOTE(review): hard-coded credentials; consider moving to config.
    conn = pymssql.connect(host='123.56.115.207', user='zgcprice3311', password='admin@2018@)!*', database='reverse_data', autocommit=True)
    cur = conn.cursor()
    cur.execute("select * from product_all where state is null ")
    ress = [item for item in cur.fetchall()]
    # NOTE(review): `tuple` shadows the builtin inside this comprehension.
    all_data = pd.DataFrame(ress, columns=[tuple[0] for tuple in cur.description])
    for i in range(len(all_data)):
        # Sentinel defaults: stay "未成功处理" ("not processed") if no
        # branch below assigns a real value.
        state_in = '未成功处理'
        ziying_in = '未成功处理'
        dingzhi_in = '未成功处理'
        dh = all_data.loc[i]
        dh_id = dh.id
        dh_url = dh.url
        dh_frm = dh.source
        dh_sku = dh.sku
        if 'JD' in dh_frm or 'SN' in dh_frm or 'GM' in dh_frm:
            if "JD" in str(dh_frm):
                # --- JD (jd.com) ---
                try:
                    try_ = session.get(dh_url, headers=headers)
                    sku = dh_sku
                    # Price endpoint: p == '-1.00' means not purchasable.
                    url = "https://p.3.cn/prices/mgets?skuIds=" + str(sku)
                    r = session.get(url, headers=headers).json()
                    jd_price = r[0]['p']
                    if jd_price == '-1.00':
                        state_in = '0'
                    else:
                        state_in = '1'
                    r_ = session.get(dh_url, headers=headers)
                    html = etree.HTML(r_.text)
                    ziying = html.xpath("//div[@class='name goodshop EDropdown']/em/text()")
                    if "自营" in str(ziying):
                        ziying_in = '1'
                        name = html.xpath("//div[@class='sku-name']/text()")
                        # Keyword blacklist: custom-made / military terms.
                        if ("定制" in str(name)) or ("防弹" in str(name)) or ("射击" in str(name)) or ("订制" in str(name)) or ("卫星" in str(name)) or ("靶" in str(name)) or ("企业定制" in str(name)) or ("军迷" in str(name)) or ("携行具" in str(name)):
                            dingzhi_in = '0'
                        else:
                            dingzhi_in = '1'
                        # Stock endpoint (delivery area hard-coded).
                        url = "https://c0.3.cn/stock?skuId=" + str(sku) + "&area=1_2901_2906_0&cat=9987,653,655"
                        r = get_reponse(session, url, headers)
                        if r == -1:
                            # NOTE(review): assigns a fresh local `state`,
                            # not `state_in` -- the failure is never written
                            # back; almost certainly a typo for `state_in`.
                            state = '0'
                        else:
                            r.encoding = 'gbk'
                            is_purchase = json.loads(r.text)
                            try:
                                # "无货" == out of stock.
                                if "无货" in is_purchase['stock']['stockDesc'] or "无货" in is_purchase['stock']['StockStateName']:
                                    state_in = '0'
                                else:
                                    state_in = '1'
                            except:
                                # Fallback for the flat payload shape.
                                if "无货" in is_purchase['StockStateName']:
                                    state_in = '0'
                                else:
                                    state_in = '1'
                    else:
                        ziying_in = '0'
                except:
                    # '5' == scraping failed for this row (bare except
                    # swallows everything, including coding errors).
                    state_in = '5'
                    ziying_in = '5'
                    dingzhi_in = '5'
            elif "GM" in str(dh_frm):
                # --- GM (gome.com.cn) ---
                try:
                    r = session.get(dh_url, headers=headers)
                    html = etree.HTML(r.text)
                    content = html.xpath("//script[contains(text(),'gomePrice')]/text()")[0]
                    ziying = html.xpath("//span[@class='identify']/text()")
                    if len(ziying) == 1:
                        ziying_in = '1'
                        name = html.xpath("//*[@id='gm-prd-main']/div[1]/h1/text()")
                        if ("定制" in str(name)) or ("防弹" in str(name)) or ("射击" in str(name)) or ("订制" in str(name)) or ("卫星" in str(name)) or ("靶" in str(name)) or ("企业定制" in str(name)):
                            dingzhi_in = '0'
                        else:
                            dingzhi_in = '1'
                        # NOTE(review): `sku` is only assigned in the JD
                        # branch; on a GM row it holds a stale value from an
                        # earlier JD iteration (or raises NameError, caught
                        # by the except below). Probably meant dh_sku.
                        url = "https://ss.gome.com.cn/item/v1/d/m/store/unite/" + str(sku) + "/N/11010200/110102002/1/null/flag/item/allStores?callback=allStores"
                        r = session.get(url, headers=headers)
                        # Strip the JSONP wrapper "allStores( ... )".
                        content = r.text.replace('allStores(', '')
                        content = content.replace(')', '')
                        content = json.loads(content)
                        wuhuo = content['result']['stock']['status']
                        if wuhuo == False:
                            state_in = '0'
                        else:
                            state_in = '1'
                    else:
                        ziying_in = '0'
                except:
                    state_in = '5'
                    ziying_in = '5'
                    dingzhi_in = '5'
            elif "SN" in str(dh_frm):
                # --- SN (suning.com) ---
                try:
                    r = session.get(dh_url, headers=headers)
                    html = etree.HTML(r.text)
                    daaa = r.text
                    str2 = html.xpath("//input[@id='curPartNumber']/@value")[0]
                    ziying1 = html.xpath("//div[@class='proinfo-title']/h1/span/i/text()")
                    ziying2 = html.xpath("//h1[@id='itemDisplayName']/span/text()")
                    # NOTE(review): `youhuo_` and `daohuo` below are
                    # computed but never used.
                    youhuo_ = re.findall("id=\"ie7_onsale\">(.*?)<i", daaa)
                    if "自营" in ziying1 or "自营" in ziying2:
                        ziying_in = '1'
                        daohuo = html.xpath("//a[@id='tellMe']/span/text()")
                        url_json = f'https://product.suning.com/pds-web/ajax/itemUniqueInfo_{str(str2)}_0000000000.html'
                        response_json = session.get(url_json, headers=headers)
                        json_data = json.loads(response_json.text)
                        itemDetail = json_data["itemDetail"]
                        try:
                            isPublished = itemDetail["isPublished"]
                        except:
                            # Missing key -> treat as unpublished.
                            isPublished = '0'
                        product_name = itemDetail["cmmdtyTitle"]
                        if isPublished == '1':
                            state_in = '1'
                            if ("定制" in str(product_name)) or ("防弹" in str(product_name)) or ("射击" in str(product_name)) \
                                or ("订制" in str(product_name)) or ("卫星" in str(product_name)) \
                                or ("靶" in str(product_name)) or ("企业定制" in str(product_name)) \
                                or ("军迷" in str(product_name)) or ("携行具" in str(product_name)):
                                dingzhi_in = '0'
                            else:
                                dingzhi_in = '1'
                        else:
                            state_in = '0'
                    else:
                        ziying_in = '0'
                except:
                    state_in = '5'
                    ziying_in = '5'
                    dingzhi_in = '5'
        else:
            # Sources other than JD/SN/GM are assumed in-stock,
            # first-party, non-custom.
            state_in = '1'
            ziying_in = '1'
            dingzhi_in = '1'
        # NOTE(review): string-built SQL; values here are internal flags
        # and a row id, but parameterized queries would still be safer.
        cur.execute(f"update product_all set state='{state_in}',ziying='{ziying_in}',dingzhi = '{dingzhi_in}' where id = '{dh_id}' ")
    conn.close()
    cur.close()
    # --- Phase 2: match each SKU against productcode_sku ---
    # `cur` reads/writes reverse_data; `cursor` reads ZI_DataBase.
    conn = pymssql.connect(host='123.56.115.207', user='zgcprice3311', password='admin@2018@)!*', database='reverse_data', autocommit=True)
    cur = conn.cursor()
    # NOTE(review): rebinding `conn` here leaks the reverse_data connection
    # object -- only its cursor `cur` stays reachable; neither connection
    # is closed before the function returns.
    conn = pymssql.connect(host='123.56.115.207', user='zgcprice3311', password='admin@2018@)!*', database='ZI_DataBase', autocommit=True)
    cursor = conn.cursor()
    match_data_sql = "select * from product_all where source in ('JD','SN','GM')"
    cur.execute(match_data_sql)
    resss = [item for item in cur.fetchall()]
    match_data = pd.DataFrame(resss, columns=[tuple[0] for tuple in cur.description])
    for j in range(len(match_data)):
        dg = match_data.loc[j]
        dg_id = dg.id
        dg_sku = dg.sku
        dg_frm = dg.source
        if dg_frm == 'SN':
            # Suning SKUs are sometimes stored with a "0000000000/" prefix;
            # try the bare SKU first, then the prefixed form.
            dg_skuu = '0000000000/' + dg_sku
            check_sql = f"select productcode from productcode_sku where frm = '{dg_frm}' and sku='{dg_sku}'"
            cursor.execute(check_sql)
            out_data = cursor.fetchall()
            if len(out_data) == 0:
                check_sql_1 = f"select productcode from productcode_sku where frm = '{dg_frm}' and sku= '{dg_skuu}'"
                cursor.execute(check_sql_1)
                out_data = cursor.fetchall()
            else:
                # NOTE(review): dead assignment -- unconditionally
                # overwritten by the if/elif/else just below.
                out_code = out_data
            if len(out_data) == 0:
                out_code = '未匹配上'
            elif len(out_data) > 1:
                out_code = '一个SKU匹配多个编码'
            else:
                out_code = out_data[0][0]
        else:
            check_sql = f"select productcode from productcode_sku where frm = '{dg_frm}' and sku='{dg_sku}'"
            cursor.execute(check_sql)
            out_data = cursor.fetchall()
            if len(out_data) == 0:
                out_code = '未匹配上'
            elif len(out_data) > 1:
                out_code = '一个SKU匹配多个编码'
            else:
                out_code = out_data[0][0]
        # Persist only a 13-character code or the explicit multi-match
        # marker; "未匹配上" (no match) is intentionally not written back.
        if len(out_code) == 13 or out_code == '一个SKU匹配多个编码':
            update_sql = f"update product_all set productcode='{out_code}' where id = '{dg_id}'"
            cur.execute(update_sql)
        else:
            pass
    print('完成')
\ No newline at end of file
main_merge.py
View file @
a283447e
...
@@ -16,6 +16,7 @@ from lstm_predict import LSTMNER
...
@@ -16,6 +16,7 @@ from lstm_predict import LSTMNER
import
os
import
os
from
ZOL_Crawler
import
CRAWLER
from
ZOL_Crawler
import
CRAWLER
import
threading
import
threading
from
API_ALL
import
Get_new
,
check_and_match
exitFlag
=
0
exitFlag
=
0
...
@@ -998,6 +999,8 @@ class crawl_data_fetch():
...
@@ -998,6 +999,8 @@ class crawl_data_fetch():
return
False
return
False
def
crawl_data_run
():
def
crawl_data_run
():
Get_new
()
check_and_match
()
#张楷部分。
thread_JD
=
myThread_crawl
(
'JD'
)
thread_JD
=
myThread_crawl
(
'JD'
)
thread_GM
=
myThread_crawl
(
'GM'
)
thread_GM
=
myThread_crawl
(
'GM'
)
thread_SN
=
myThread_crawl
(
'SN'
)
thread_SN
=
myThread_crawl
(
'SN'
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment