Commit 44ee201c authored by huangziyu's avatar huangziyu

1

parent 3cd569a4
# -*- coding: utf-8 -*-
import json
import os
import re
import time

import pymysql
import redis
import requests
# Redis connection; the polling loop below records processed row ids in it.
redis_dev_path = '172.17.148.70'
redis_dev_port = 6379
redis_dev_db = 6
redis_con_pro = redis.Redis(redis_dev_path, redis_dev_port, db=redis_dev_db, decode_responses=True)

# MySQL proxy account settings.
# NOTE(review): these credentials are committed in plain text; consider loading
# them from the environment or a secrets store and rotating the password.
mysql_zdzs_proxy_host = '59.110.219.171'
mysql_zdzs_proxy_user = 'zgcindex'
mysql_zdzs_proxy_password = 'zgcprice2019'
...@@ -35,57 +45,82 @@ def mysql_zdzs_proxy(sql=None): ...@@ -35,57 +45,82 @@ def mysql_zdzs_proxy(sql=None):
from urllib.parse import quote from urllib.parse import quote
def get_time_str():
    """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    # strftime() uses the current local time when no time tuple is supplied,
    # so the explicit localtime(time()) round trip is unnecessary.
    return time.strftime('%Y-%m-%d %H:%M:%S')
# Strip symbols from a string.
def remove_the_symbol(content=None):
    """Upper-case ``content`` and keep only letters, digits and CJK characters.

    Every character that is not an ASCII letter, an ASCII digit, or in the
    range U+4E00..U+9FA5 is dropped after upper-casing.

    Args:
        content: The string to clean, or ``None``.

    Returns:
        The cleaned string, or ``None`` when ``content`` is ``None``.
    """
    if content is None:
        return content
    # A single substitution removes the unwanted characters directly; the
    # result equals joining every run matched by the allowed character class.
    return re.sub('[^\u4e00-\u9fa5A-Za-z0-9]', '', content.upper())
def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` tag removed.

    Args:
        text: The string to clean, or ``None``.

    Returns:
        The string without tags. ``None`` is returned unchanged, mirroring
        ``remove_the_symbol``, instead of raising ``TypeError`` in ``re.sub``.
    """
    if text is None:
        return text
    # Delete HTML tags with a regular expression: anything from "<" up to the
    # next ">" (at least one character in between) is treated as a tag.
    return re.sub(r'<[^>]+>', '', text)
# In-process tally of handled rows, keyed by the day string (YYYYMMDD).
JD_SEARCH_REQUEST_COUNT_MAX_TEST = {}

# NOTE(review): a logged-in JD session cookie used to be hard-coded in the
# request headers. Session cookies expire and must not live in source control,
# so the value is now taken from the JD_SEARCH_COOKIE environment variable.
JD_SEARCH_COOKIE = os.environ.get('JD_SEARCH_COOKIE', '')
if not JD_SEARCH_COOKIE:
    raise RuntimeError('Set the JD_SEARCH_COOKIE environment variable to a valid JD session cookie')

# Rows that have no entry in DMP_SEARCH_DATA yet, restricted to ids ending in
# 1, 2 or 3. (A previous revision selected ids ending in 4-7 instead.)
PENDING_ROWS_SQL = "SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '100' and (id like '%1' or id like '%2' or id like '%3' ) limit 10000"

# The headers never change between requests, so build them once.
JD_SEARCH_HEADERS = {
    'origin': 'https://search.jd.com',
    'Cookie': JD_SEARCH_COOKIE,
    'User-Agent': 'Apifox/1.0.0 (https://apifox.com)',
    'Accept': '*/*',
    'Host': 'api.m.jd.com',
    'Connection': 'keep-alive',
}


def _extract_sku_list(sku_entries):
    """Convert the raw entries of the '291' response section into result rows."""
    data_list = []
    for sku_info in sku_entries or []:
        # Entries without a sku_id cannot be turned into an item URL; skip them.
        if not isinstance(sku_info, dict) or 'sku_id' not in sku_info:
            continue
        shop_link = sku_info.get('shop_link')
        shop_name = ''
        if isinstance(shop_link, dict) and 'shop_name' in shop_link:
            shop_name = shop_link['shop_name']
        sku_id = str(sku_info['sku_id'])
        data_list.append(
            {
                'is_self_operate_div': '',
                'item_url': 'https://item.jd.com/' + sku_id + '.html',
                'sku_id': sku_id,
                'shop_name': shop_name,
                'sku_name': remove_html_tags(sku_info.get('ad_title') or ''),
                'sku_price': sku_info.get('sku_price'),
            }
        )
    return data_list


while True:
    # Day string without separators, e.g. "20240315"; used as the counter key.
    count_key = remove_the_symbol(get_time_str()[0:10])
    time.sleep(5)
    # NOTE(review): the return type of mysql_zdzs_proxy is not visible here;
    # "or []" only guards against a None/empty result.
    url_data = mysql_zdzs_proxy(PENDING_ROWS_SQL) or []
    for row in url_data:
        print(JD_SEARCH_REQUEST_COUNT_MAX_TEST)
        url = row['url']
        keyword = url.replace('https://search.jd.com/search?keyword=', "").replace('&enc=utf-8', "")
        print(keyword)
        # The keyword is embedded in a JSON string inside the "body" query
        # parameter: JSON-escape it first (quotes, backslashes), then
        # percent-encode it. [1:-1] strips the quotes json.dumps adds.
        keyword = quote(json.dumps(keyword, ensure_ascii=False)[1:-1])

        url = f"https://api.m.jd.com/api?functionId=pc_search_adv_Search&appid=search-pc-java&client=pc&clientVersion=1.0.0&uuid=143920055.1664177649338438720918.1664177649.1710402971.1710474746.407&loginType=3&t=1710474777240&body=%7B%22area%22:%221%22,%22enc%22:%22utf-8%22,%22keyword%22:%22{keyword}%22,%22adType%22:7,%22page%22:%221%22,%22ad_ids%22:%22291:19%22,%22xtest%22:%22new_search%22%7D&x-api-eid-token=jdd036KK7GVN2A5VXFOZVHAPY25WX7HXJAN62PDVL6JU6EOWVUQJMGEAWKRGEDHIE367RY5TVUPVMRDVFP3CN276H7UOWKYAAAAMOIA6VIPAAAAAACT7MJYE5DSZYVQX"

        try:
            response = requests.get(url, headers=JD_SEARCH_HEADERS, timeout=5)
        except requests.RequestException as exc:
            # Network errors and timeouts are expected; skip the row and let a
            # later pass retry it. Anything else (e.g. Ctrl-C) propagates.
            print("request failed:", exc)
            continue
        if response.text == "":
            print("需要刷新")
            time.sleep(1)
            continue
        try:
            response_json = json.loads(response.text)
        except ValueError as exc:
            # A non-JSON body must not kill the worker; skip and retry later.
            print("invalid JSON response:", exc)
            time.sleep(1)
            continue
        print(response_json)

        # Store a result only when the '291' section is present; otherwise the
        # row stays pending and is picked up again by a later query.
        if isinstance(response_json, dict) and '291' in response_json:
            dataList = _extract_sku_list(response_json['291'])
            # Escape quotes AND backslashes: json.dumps emits backslash escapes
            # that MySQL would otherwise consume, corrupting the stored JSON.
            # NOTE(review): assumes the server's default backslash-escape mode
            # (NO_BACKSLASH_ESCAPES not set) - confirm.
            result_sql = pymysql.converters.escape_string(json.dumps(dataList, ensure_ascii=False))
            mall_id_sql = pymysql.converters.escape_string(str(row['mallId']))
            row_id = row['id']
            insert_sql = f"""
                INSERT INTO DMP_SEARCH_DATA(id,result,mallId)
                VALUES ({row_id},'{result_sql}','{mall_id_sql}')
            """
            mysql_zdzs_proxy(insert_sql)

        # NOTE(review): the original indentation was not recoverable; this
        # bookkeeping is assumed to run once per row that yielded a parsable
        # response - confirm against the committed file.
        redis_con_pro.sadd("JD_SEARCH_REQUEST_COUNT_MAX_TEST:" + count_key, row['id'])
        JD_SEARCH_REQUEST_COUNT_MAX_TEST[count_key] = JD_SEARCH_REQUEST_COUNT_MAX_TEST.get(count_key, 0) + 1
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment