Commit c0fd8dac authored by huangiyu

Merge remote-tracking branch 'origin/master'

parents 01133cc1 23028eaa
...@@ -396,12 +396,14 @@ def get_cookie(): ...@@ -396,12 +396,14 @@ def get_cookie():
# query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE data_batch ='202401251550270001' and ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' order by docID desc limit 10000" # query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE data_batch ='202401251550270001' and ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' order by docID desc limit 10000"
# query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and data_batch ='202403051815460001' order by id" \ # query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and data_batch ='202403051815460001' order by id" \
# f" limit 100000" # f" limit 100000"
query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '110' limit 100000" query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '110' and (id like '%5' or id like '%6' or id like '%7' or id like '%8' or id like '%9' ) limit 100000"
query_spider = mysql_zdzs_proxy(query_spider_sql) query_spider = mysql_zdzs_proxy(query_spider_sql)
index = 0 index = 0
len_ = len(query_spider) len_ = len(query_spider)
print(f"len={len_},index={index}") print(f"len={len_},index={index}")
for row in query_spider: for row in query_spider:
# if str(row['id'])[-1] not in ["0","1","2","3","4"]:
# continue
index += 1 index += 1
print(f"len={len_},index={index}") print(f"len={len_},index={index}")
# while True: # while True:
......
...@@ -42,7 +42,7 @@ def remove_html_tags(text): ...@@ -42,7 +42,7 @@ def remove_html_tags(text):
url_data = mysql_zdzs_proxy( url_data = mysql_zdzs_proxy(
"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '110' and (id like '%0' or id like '%1' or id like '%2' or id like '%3' ) limit 10000") "SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '110' and (id like '%4' or id like '%5' or id like '%6' or id like '%7' ) limit 10000")
for row in url_data: for row in url_data:
url = row['url'] url = row['url']
keyword = url.replace('https://search.jd.com/search?keyword=', "").replace('&enc=utf-8', "") keyword = url.replace('https://search.jd.com/search?keyword=', "").replace('&enc=utf-8', "")
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment