Commit a283447e authored by sanlu

爬取结束后,直接调用crawl_data_run()即可

parent 6cc425dc
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 19 10:30:51 2020
@author: Administrator
"""
import pymssql
import pandas as pd
from lxml import etree
import re
import requests
import json
import time
import datetime
import os
def Get_new():
    """Copy not-yet-processed rows from ``product`` into ``product_all``.

    Selects every ``product`` row whose ``isdo`` is NULL, marks it with
    ``isdo='1'`` and inserts a matching row into ``product_all``
    (sku, name, brand, category, url, source, product_id).

    Returns:
        None
    """
    # NOTE(review): the credentials are hard-coded in source; consider moving
    # them to configuration or environment variables.
    conn = pymssql.connect(host='123.56.115.207', user='zgcprice3311',
                           password='admin@2018@)!*', database='reverse_data',
                           autocommit=True)
    try:
        cur = conn.cursor()
        try:
            cur.execute("select * from product where isdo is null ")
            columns = [col[0] for col in cur.description]
            # Materialise the result set first: the same cursor is reused for
            # the UPDATE/INSERT statements issued inside the loop.
            pending = [dict(zip(columns, record)) for record in cur.fetchall()]

            for row in pending:
                product_id = row['product_id']

                name = row['product_name']
                if isinstance(name, str):
                    # The apostrophe is no longer a quoting hazard (the
                    # statements are parameterized); the substitution is kept
                    # so new rows look like the rows already stored.
                    name = name.replace('\'', '‘')

                brand = row['brand']
                if isinstance(brand, str):
                    # The original called strip(''), which removes nothing.
                    brand = brand.strip()

                # Flag the source row as handled.
                # NOTE(review): with autocommit on, a failure of the INSERT
                # below leaves the row flagged but not copied.
                cur.execute("update product set isdo='1' where product_id = %s",
                            (product_id,))
                cur.execute(
                    "insert into product_all"
                    "(sku,name,brand,category,url,source,product_id) "
                    "values (%s,%s,%s,%s,%s,%s,%s)",
                    (row['channel_sku'], name, brand,
                     row['channel_product_classify'],
                     row['channel_product_id'], row['channel_id'],
                     product_id))
        finally:
            cur.close()
    finally:
        # Close the cursor before the connection, and do both even on error.
        conn.close()
    return None
def get_reponse(session, url, headers):
    """GET ``url`` with a 5 second timeout, retrying when the request raises.

    Args:
        session: object exposing ``get(url, headers=..., timeout=...)``
            (a ``requests.Session`` in this file).
        url: address to fetch.
        headers: HTTP headers passed through to ``session.get``.

    Returns:
        The response object when a request answers with status 200,
        otherwise ``-1``. A first answer with a non-200 status returns ``-1``
        immediately; only a raised error triggers the retry loop, which makes
        up to 9 further attempts.
    """
    try:
        response = session.get(url, headers=headers, timeout=5)
    except Exception:
        # Only ordinary errors (timeouts, connection failures, ...) are
        # retried; KeyboardInterrupt/SystemExit now propagate so the caller
        # can actually stop a long run.
        pass
    else:
        if response.status_code == 200:
            return response
        return -1

    for attempt in range(1, 10):
        print('请求超时,第%s次重复请求' % attempt)
        try:
            response = session.get(url, headers=headers, timeout=5)
        except Exception:
            continue
        if response.status_code == 200:
            return response
    return -1
# Placeholder written when a site check never reached a decision.
_UNPROCESSED = '未成功处理'
# Marker written to all three columns when a site check raised.
_CHECK_FAILED = ('5', '5', '5')
# Product-name fragments that flag an item as "custom" (dingzhi = '0').
_CUSTOM_KEYWORDS = ("定制", "防弹", "射击", "订制", "卫星", "靶", "企业定制",
                    "军迷", "携行具")
# NOTE(review): the GM check historically uses a shorter keyword list
# (without "军迷" and "携行具"); kept as-is, confirm whether that is intended.
_CUSTOM_KEYWORDS_GM = ("定制", "防弹", "射击", "订制", "卫星", "靶", "企业定制")
_MULTIPLE_CODES = '一个SKU匹配多个编码'
_NO_CODE = '未匹配上'


def _connect_mssql(database):
    """Open an autocommit connection to ``database`` on the shared server."""
    # NOTE(review): the credentials are hard-coded in source; consider moving
    # them to configuration or environment variables.
    return pymssql.connect(host='123.56.115.207', user='zgcprice3311',
                           password='admin@2018@)!*', database=database,
                           autocommit=True)


def _fetch_dict_rows(cursor):
    """Return the cursor's pending result set as a list of column->value dicts."""
    columns = [col[0] for col in cursor.description]
    return [dict(zip(columns, record)) for record in cursor.fetchall()]


def _is_custom(name, keywords):
    """Return '0' when str(name) contains any keyword, else '1'."""
    text = str(name)
    return '0' if any(word in text for word in keywords) else '1'


def _check_jd(session, headers, page_url, sku):
    """Return (state, ziying, dingzhi) for a JD product."""
    state = dingzhi = _UNPROCESSED
    try:
        # The original fetched the product page once before the price call
        # and discarded the result; kept in case it primes session cookies.
        session.get(page_url, headers=headers)
        price_url = "https://p.3.cn/prices/mgets?skuIds=" + str(sku)
        prices = session.get(price_url, headers=headers).json()
        state = '0' if prices[0]['p'] == '-1.00' else '1'

        page = session.get(page_url, headers=headers)
        html = etree.HTML(page.text)
        shop = html.xpath("//div[@class='name goodshop EDropdown']/em/text()")
        if "自营" not in str(shop):
            return state, '0', dingzhi

        name = html.xpath("//div[@class='sku-name']/text()")
        dingzhi = _is_custom(name, _CUSTOM_KEYWORDS)

        stock_url = ("https://c0.3.cn/stock?skuId=" + str(sku)
                     + "&area=1_2901_2906_0&cat=9987,653,655")
        stock_resp = get_reponse(session, stock_url, headers)
        if stock_resp == -1:
            # Fix: the original assigned an unused variable `state` here, so
            # a failed stock request never changed the stored state.
            state = '0'
        else:
            stock_resp.encoding = 'gbk'
            stock_info = json.loads(stock_resp.text)
            try:
                sold_out = ("无货" in stock_info['stock']['stockDesc']
                            or "无货" in stock_info['stock']['StockStateName'])
            except (KeyError, TypeError):
                # Fallback used by the original when the nested 'stock'
                # layout is not present in the payload.
                sold_out = "无货" in stock_info['StockStateName']
            state = '0' if sold_out else '1'
        return state, '1', dingzhi
    except Exception:
        return _CHECK_FAILED


def _check_gm(session, headers, page_url, sku):
    """Return (state, ziying, dingzhi) for a GM product."""
    try:
        page = session.get(page_url, headers=headers)
        html = etree.HTML(page.text)
        # A page without the gomePrice script is treated as a failed check
        # (the original indexed [0] and relied on the resulting IndexError).
        if not html.xpath("//script[contains(text(),'gomePrice')]/text()"):
            return _CHECK_FAILED
        badge = html.xpath("//span[@class='identify']/text()")
        if len(badge) != 1:
            return _UNPROCESSED, '0', _UNPROCESSED

        name = html.xpath("//*[@id='gm-prd-main']/div[1]/h1/text()")
        dingzhi = _is_custom(name, _CUSTOM_KEYWORDS_GM)

        # Fix: the original used `sku`, a variable only assigned in the JD
        # branch (undefined or stale here); the row's own SKU is used instead.
        stock_url = ("https://ss.gome.com.cn/item/v1/d/m/store/unite/"
                     + str(sku)
                     + "/N/11010200/110102002/1/null/flag/item/allStores?callback=allStores")
        stock_resp = session.get(stock_url, headers=headers)
        # Strip the JSONP wrapper the same way the original did.
        payload = stock_resp.text.replace('allStores(', '').replace(')', '')
        in_stock = json.loads(payload)['result']['stock']['status']
        # `== False` kept deliberately: only False/0 count as out of stock.
        state = '0' if in_stock == False else '1'  # noqa: E712
        return state, '1', dingzhi
    except Exception:
        return _CHECK_FAILED


def _check_sn(session, headers, page_url):
    """Return (state, ziying, dingzhi) for an SN product."""
    try:
        page = session.get(page_url, headers=headers)
        html = etree.HTML(page.text)
        part_number = html.xpath("//input[@id='curPartNumber']/@value")[0]
        badge_a = html.xpath("//div[@class='proinfo-title']/h1/span/i/text()")
        badge_b = html.xpath("//h1[@id='itemDisplayName']/span/text()")
        if "自营" not in badge_a and "自营" not in badge_b:
            return _UNPROCESSED, '0', _UNPROCESSED

        info_url = f'https://product.suning.com/pds-web/ajax/itemUniqueInfo_{str(part_number)}_0000000000.html'
        info_resp = session.get(info_url, headers=headers)
        item_detail = json.loads(info_resp.text)["itemDetail"]
        is_published = item_detail.get("isPublished", '0')
        product_name = item_detail["cmmdtyTitle"]
        if is_published != '1':
            return '0', '1', _UNPROCESSED
        return '1', '1', _is_custom(product_name, _CUSTOM_KEYWORDS)
    except Exception:
        return _CHECK_FAILED


def _refresh_availability(session, headers):
    """Fill state/ziying/dingzhi for product_all rows whose state is NULL."""
    conn = _connect_mssql('reverse_data')
    try:
        cur = conn.cursor()
        cur.execute("select * from product_all where state is null ")
        for row in _fetch_dict_rows(cur):
            source = str(row['source'])
            if "JD" in source:
                result = _check_jd(session, headers, row['url'], row['sku'])
            elif "GM" in source:
                result = _check_gm(session, headers, row['url'], row['sku'])
            elif "SN" in source:
                result = _check_sn(session, headers, row['url'])
            else:
                # Sources other than JD/GM/SN are not checked online.
                result = ('1', '1', '1')
            state_in, ziying_in, dingzhi_in = result
            cur.execute(
                "update product_all set state=%s,ziying=%s,dingzhi = %s "
                "where id = %s",
                (state_in, ziying_in, dingzhi_in, row['id']))
    finally:
        conn.close()


def _lookup_codes(cursor, frm, sku):
    """Return all productcode rows registered for (frm, sku)."""
    cursor.execute(
        "select productcode from productcode_sku where frm = %s and sku=%s",
        (frm, sku))
    return cursor.fetchall()


def _match_product_codes():
    """Write the matching productcode onto JD/SN/GM rows of product_all."""
    # The original rebound a single `conn` name for both databases, so the
    # first connection leaked and neither was closed.
    rev_conn = _connect_mssql('reverse_data')
    try:
        zi_conn = _connect_mssql('ZI_DataBase')
        try:
            cur = rev_conn.cursor()
            cursor = zi_conn.cursor()
            cur.execute(
                "select * from product_all where source in ('JD','SN','GM')")
            for row in _fetch_dict_rows(cur):
                frm = row['source']
                sku = row['sku']
                codes = _lookup_codes(cursor, frm, sku)
                if frm == 'SN' and len(codes) == 0:
                    # SN SKUs may be stored with a shop-id prefix.
                    codes = _lookup_codes(cursor, frm,
                                          '0000000000/' + str(sku))

                if len(codes) == 0:
                    out_code = _NO_CODE
                elif len(codes) > 1:
                    out_code = _MULTIPLE_CODES
                else:
                    out_code = codes[0][0]

                # Only a 13-character code or the "multiple codes" marker is
                # written back; a non-string code (e.g. NULL) is skipped
                # instead of raising on len().
                if isinstance(out_code, str) and (
                        len(out_code) == 13 or out_code == _MULTIPLE_CODES):
                    cur.execute(
                        "update product_all set productcode=%s where id = %s",
                        (out_code, row['id']))
        finally:
            zi_conn.close()
    finally:
        rev_conn.close()


def check_and_match():
    """Check product_all rows online, then match them to product codes.

    Step 1: for every ``product_all`` row with ``state`` NULL, query the
    JD / GM / SN site and store three flags on the row:

    * ``state``   - '1' in stock / published, '0' otherwise
    * ``ziying``  - '1' when the page carries the self-operated badge,
      '0' otherwise
    * ``dingzhi`` - '0' when the product name contains a "custom" keyword,
      '1' otherwise

    All three are '5' when the check raised, and '未成功处理' where a check
    did not reach a decision. Rows from any other source get '1' for all
    three.

    Step 2: for every ``product_all`` row whose source is JD, SN or GM, look
    its SKU up in ``ZI_DataBase.productcode_sku`` and store the product code
    (or the "multiple codes" marker) in ``productcode``.

    Prints '完成' when both steps are done. Returns None.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'}
    session = requests.Session()
    try:
        _refresh_availability(session, headers)
    finally:
        session.close()
    _match_product_codes()
    print('完成')
\ No newline at end of file
......@@ -16,6 +16,7 @@ from lstm_predict import LSTMNER
import os
from ZOL_Crawler import CRAWLER
import threading
from API_ALL import Get_new,check_and_match
exitFlag = 0
......@@ -998,6 +999,8 @@ class crawl_data_fetch():
return False
def crawl_data_run():
Get_new()
check_and_match()#张楷部分。
thread_JD = myThread_crawl('JD')
thread_GM = myThread_crawl('GM')
thread_SN = myThread_crawl('SN')
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment