Commit 44ee201c authored by huangziyu

1

parent 3cd569a4
# -*- coding: utf-8 -*-
import redis
import time
import requests import requests
import json import json
import pymysql import pymysql
import re import re
# Redis connection settings and the shared client used by this script.
# NOTE(review): the settings are named `redis_dev_*` while the client is named
# `redis_con_pro` — confirm which environment this host actually belongs to.
redis_dev_path = '172.17.148.70'
redis_dev_port = 6379
redis_dev_db = 6
# Host and port are passed positionally; decode_responses=True asks redis-py to
# return replies as str rather than bytes.
redis_con_pro = redis.Redis(redis_dev_path, redis_dev_port, db=redis_dev_db, decode_responses=True)
mysql_zdzs_proxy_host = '59.110.219.171' mysql_zdzs_proxy_host = '59.110.219.171'
mysql_zdzs_proxy_user = 'zgcindex' mysql_zdzs_proxy_user = 'zgcindex'
mysql_zdzs_proxy_password = 'zgcprice2019' mysql_zdzs_proxy_password = 'zgcprice2019'
...@@ -35,15 +45,33 @@ def mysql_zdzs_proxy(sql=None): ...@@ -35,15 +45,33 @@ def mysql_zdzs_proxy(sql=None):
from urllib.parse import quote from urllib.parse import quote
def get_time_str():
    """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    # time.localtime() with no argument already uses the current time, so the
    # explicit time.time() call is unnecessary.
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
# Strip symbols: keep only CJK ideographs, ASCII letters and digits.
def remove_the_symbol(content=None):
    """Return *content* upper-cased with every symbol character removed.

    Only characters in the ranges U+4E00-U+9FA5, A-Z, a-z and 0-9 survive;
    everything else (punctuation, whitespace, dashes, ...) is dropped.

    Args:
        content: The string to clean, or None.

    Returns:
        The cleaned, upper-cased string, or None when *content* is None.
    """
    if content is None:
        return content
    # `+` instead of `*`: avoids collecting an empty match at every position
    # that holds a symbol, while joining to exactly the same result.
    kept = re.findall(r'[\u4e00-\u9fa5A-Za-z0-9]+', content.upper())
    return ''.join(kept)
def remove_html_tags(text):
    """Return *text* with every HTML tag removed.

    A tag is any run starting with '<', followed by one or more characters
    other than '>', and ending with '>'. Text between tags is left untouched.

    Args:
        text: The string to clean, or None.

    Returns:
        The string without tags, or None when *text* is None (mirrors the
        None pass-through of remove_the_symbol).
    """
    if text is None:
        return text
    # Use a regular expression to delete HTML tags.
    clean_text = re.sub(r'<[^>]+>', '', text)
    return clean_text
url_data = mysql_zdzs_proxy( JD_SEARCH_REQUEST_COUNT_MAX_TEST = {}
"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '100' and (id like '%4' or id like '%5' or id like '%6' or id like '%7' ) limit 10000") while True:
for row in url_data: count_key = remove_the_symbol(get_time_str()[0:10])
time.sleep(5)
url_data = mysql_zdzs_proxy(
"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '100' and (id like '%1' or id like '%2' or id like '%3' ) limit 10000")
# "SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '100' and (id like '%4' or id like '%5' or id like '%6' or id like '%7' ) limit 1000")
for row in url_data:
print(JD_SEARCH_REQUEST_COUNT_MAX_TEST)
url = row['url'] url = row['url']
keyword = url.replace('https://search.jd.com/search?keyword=', "").replace('&enc=utf-8', "") keyword = url.replace('https://search.jd.com/search?keyword=', "").replace('&enc=utf-8', "")
print(keyword) print(keyword)
...@@ -60,10 +88,13 @@ for row in url_data: ...@@ -60,10 +88,13 @@ for row in url_data:
'Host': 'api.m.jd.com', 'Host': 'api.m.jd.com',
'Connection': 'keep-alive' 'Connection': 'keep-alive'
} }
try:
response = requests.request("GET", url, headers=headers, data=payload) response = requests.request("GET", url, headers=headers, data=payload, timeout=5)
except:
continue
if response.text == "": if response.text == "":
print("需要刷新") print("需要刷新")
time.sleep(1)
continue continue
response_json = json.loads(response.text) response_json = json.loads(response.text)
print(response_json) print(response_json)
...@@ -89,3 +120,7 @@ for row in url_data: ...@@ -89,3 +120,7 @@ for row in url_data:
""" """
if response_json != {}: if response_json != {}:
mysql_zdzs_proxy(insert_sql) mysql_zdzs_proxy(insert_sql)
redis_con_pro.sadd("JD_SEARCH_REQUEST_COUNT_MAX_TEST:" + count_key, row['id'])
if count_key not in JD_SEARCH_REQUEST_COUNT_MAX_TEST:
JD_SEARCH_REQUEST_COUNT_MAX_TEST[count_key] = 0
JD_SEARCH_REQUEST_COUNT_MAX_TEST[count_key] += 1
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment