Commit 4ea23759 authored by huangziyu

1

parent 56457489
......@@ -80,7 +80,7 @@ def get_cookie():
count = count + 1
print(f"需要登录")
time.sleep(1)
query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '84' and data_batch ='202404180000000004' limit 30000"
query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '84' and data_batch ='202404180000000002' limit 30000"
query_spider = mysql_zdzs_proxy(query_spider_sql)
index = 0
len_ = len(query_spider)
......@@ -121,33 +121,37 @@ def get_cookie():
Response = etree.HTML(text)
dataList = []
for x in Response.xpath("//div[@id='J_goodsList']/ul/li"):
is_self_operate_div = ''
if x.xpath(".//div[@class='p-name p-name-type-2']"):
shop_name = x.xpath(".//span[@class='J_im_icon']/a/@title")[0] if x.xpath(
".//span[@class='J_im_icon']/a/@title") else ''
dataList.append(
{
'is_self_operate_div': is_self_operate_div,
'item_url': 'https://item.jd.com/' + str(x.xpath("./@data-sku")[0]) + '.html',
'sku_id': str(x.xpath("./@data-sku")[0]),
'shop_name': shop_name,
'sku_name': '‘'.join(
''.join(x.xpath(".//div[@class='p-name p-name-type-2']//em//text()")).split(
"'")).strip(),
'sku_price': x.xpath(".//div[@class='p-price']//i//text()")[0]
}
)
insert_sql = f"""
INSERT INTO DMP_SEARCH_DATA(id,result,mallId)
VALUES ({row['id']},'{json.dumps(dataList, ensure_ascii=False).replace("'", "''")}','{row['mallId']}')
"""
mysql_zdzs_proxy(insert_sql)
try:
for x in Response.xpath("//div[@id='J_goodsList']/ul/li"):
is_self_operate_div = ''
if x.xpath(".//div[@class='p-name p-name-type-2']"):
shop_name = x.xpath(".//span[@class='J_im_icon']/a/@title")[0] if x.xpath(
".//span[@class='J_im_icon']/a/@title") else ''
dataList.append(
{
'is_self_operate_div': is_self_operate_div,
'item_url': 'https://item.jd.com/' + str(x.xpath("./@data-sku")[0]) + '.html',
'sku_id': str(x.xpath("./@data-sku")[0]),
'shop_name': shop_name,
'sku_name': '‘'.join(
''.join(x.xpath(".//div[@class='p-name p-name-type-2']//em//text()")).split(
"'")).strip(),
'sku_price': x.xpath(".//div[@class='p-price']//i//text()")[0]
}
)
insert_sql = f"""
INSERT INTO DMP_SEARCH_DATA(id,result,mallId)
VALUES ({row['id']},'{json.dumps(dataList, ensure_ascii=False).replace("'", "''")}','{row['mallId']}')
"""
mysql_zdzs_proxy(insert_sql)
except Exception as e:
print(e)
print("保存代码异常")
continue
print("保存成功")
pass
# 登录成功
......
......@@ -2,6 +2,7 @@
【京东搜索采集自动验证版Windows10部署文档】https://www.tapd.cn/36769433/prong/stories/view/1136769433001003855
"""
import time
from selenium.webdriver.common.keys import Keys
from urllib import request
import cv2
import numpy as np
......@@ -68,16 +69,19 @@ def mysql_zdzs_proxy(sql=None):
def get_cookie(username=None,password=None,driver=None):
# driver.maximize_window()
driver.get('https://passport.jd.com/uc/login')
time.sleep(0.7)
print(f"清空文本框开始")
driver.find_element(by=By.ID, value='loginname').clear()
time.sleep(0.7)
driver.find_element(by=By.ID, value='nloginpwd').clear()
time.sleep(1.7)
print(f"清空文本框结束")
# 获取输入框中的现有内容的长度
content_length = len(driver.find_element(by=By.ID, value='loginname').get_attribute('value'))
# 逐个删除所有字符
for _ in range(content_length):
driver.find_element(by=By.ID, value='loginname').send_keys(Keys.BACK_SPACE)
content_length = len(driver.find_element(by=By.ID, value='nloginpwd').get_attribute('value'))
# 逐个删除所有字符
for _ in range(content_length):
driver.find_element(by=By.ID, value='nloginpwd').send_keys(Keys.BACK_SPACE)
print(f"清空文本框成功")
driver.find_element(by=By.ID, value='loginname').send_keys(username)
driver.find_element(by=By.ID, value='nloginpwd').send_keys(password)
# 登录
print(f"点击登录开始")
driver.find_element(by=By.ID, value='loginsubmit').click()
......@@ -87,7 +91,7 @@ def get_cookie(username=None,password=None,driver=None):
count = count + 1
print(f"需要登录")
time.sleep(1)
query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '84' and data_batch ='202404180000000004' and id like '%0' limit 30000"
query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '84' and data_batch ='202404180000000002' and id like '%0' limit 30000"
query_spider = mysql_zdzs_proxy(query_spider_sql)
index = 0
len_ = len(query_spider)
......@@ -124,33 +128,37 @@ def get_cookie(username=None,password=None,driver=None):
Response = etree.HTML(text)
dataList = []
for x in Response.xpath("//div[@id='J_goodsList']/ul/li"):
is_self_operate_div = ''
if x.xpath(".//div[@class='p-name p-name-type-2']"):
shop_name = x.xpath(".//span[@class='J_im_icon']/a/@title")[0] if x.xpath(
".//span[@class='J_im_icon']/a/@title") else ''
dataList.append(
{
'is_self_operate_div': is_self_operate_div,
'item_url': 'https://item.jd.com/' + str(x.xpath("./@data-sku")[0]) + '.html',
'sku_id': str(x.xpath("./@data-sku")[0]),
'shop_name': shop_name,
'sku_name': '‘'.join(
''.join(x.xpath(".//div[@class='p-name p-name-type-2']//em//text()")).split(
"'")).strip(),
'sku_price': x.xpath(".//div[@class='p-price']//i//text()")[0]
}
)
insert_sql = f"""
INSERT INTO DMP_SEARCH_DATA(id,result,mallId)
VALUES ({row['id']},'{json.dumps(dataList, ensure_ascii=False).replace("'", "''")}','{row['mallId']}')
"""
mysql_zdzs_proxy(insert_sql)
try:
for x in Response.xpath("//div[@id='J_goodsList']/ul/li"):
is_self_operate_div = ''
if x.xpath(".//div[@class='p-name p-name-type-2']"):
shop_name = x.xpath(".//span[@class='J_im_icon']/a/@title")[0] if x.xpath(
".//span[@class='J_im_icon']/a/@title") else ''
dataList.append(
{
'is_self_operate_div': is_self_operate_div,
'item_url': 'https://item.jd.com/' + str(x.xpath("./@data-sku")[0]) + '.html',
'sku_id': str(x.xpath("./@data-sku")[0]),
'shop_name': shop_name,
'sku_name': '‘'.join(
''.join(x.xpath(".//div[@class='p-name p-name-type-2']//em//text()")).split(
"'")).strip(),
'sku_price': x.xpath(".//div[@class='p-price']//i//text()")[0]
}
)
insert_sql = f"""
INSERT INTO DMP_SEARCH_DATA(id,result,mallId)
VALUES ({row['id']},'{json.dumps(dataList, ensure_ascii=False).replace("'", "''")}','{row['mallId']}')
"""
mysql_zdzs_proxy(insert_sql)
except Exception as e:
print(e)
print("保存代码异常")
continue
print("保存成功")
pass
return
......@@ -164,7 +172,8 @@ if __name__ == "__main__":
option.add_experimental_option('excludeSwitches', ['enable-automation'])
driver = webdriver.Chrome(options=option)
option.add_argument("--disable-blink-features=AutomationControlled")
driver.set_window_size(700, 600)
driver.set_window_size(1200, 600)
# get_cookie(username=config.account_list[0]['username'],password=config.account_list[0]['password'],driver=driver)
while True:
for account in config.account_list:
username = account['username']
......
......@@ -2,6 +2,7 @@
【京东搜索采集自动验证版Windows10部署文档】https://www.tapd.cn/36769433/prong/stories/view/1136769433001003855
"""
import time
from selenium.webdriver.common.keys import Keys
from urllib import request
import cv2
import numpy as np
......@@ -68,16 +69,19 @@ def mysql_zdzs_proxy(sql=None):
def get_cookie(username=None,password=None,driver=None):
# driver.maximize_window()
driver.get('https://passport.jd.com/uc/login')
time.sleep(0.7)
print(f"清空文本框开始")
driver.find_element(by=By.ID, value='loginname').clear()
time.sleep(0.7)
driver.find_element(by=By.ID, value='nloginpwd').clear()
time.sleep(1.7)
print(f"清空文本框结束")
# 获取输入框中的现有内容的长度
content_length = len(driver.find_element(by=By.ID, value='loginname').get_attribute('value'))
# 逐个删除所有字符
for _ in range(content_length):
driver.find_element(by=By.ID, value='loginname').send_keys(Keys.BACK_SPACE)
content_length = len(driver.find_element(by=By.ID, value='nloginpwd').get_attribute('value'))
# 逐个删除所有字符
for _ in range(content_length):
driver.find_element(by=By.ID, value='nloginpwd').send_keys(Keys.BACK_SPACE)
print(f"清空文本框成功")
driver.find_element(by=By.ID, value='loginname').send_keys(username)
driver.find_element(by=By.ID, value='nloginpwd').send_keys(password)
# 登录
print(f"点击登录开始")
driver.find_element(by=By.ID, value='loginsubmit').click()
......@@ -87,7 +91,7 @@ def get_cookie(username=None,password=None,driver=None):
count = count + 1
print(f"需要登录")
time.sleep(1)
query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '84' and data_batch ='202404180000000004' and id like '%1' limit 30000"
query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '84' and data_batch ='202404180000000002' and id like '%1' limit 30000"
query_spider = mysql_zdzs_proxy(query_spider_sql)
index = 0
len_ = len(query_spider)
......@@ -124,33 +128,37 @@ def get_cookie(username=None,password=None,driver=None):
Response = etree.HTML(text)
dataList = []
for x in Response.xpath("//div[@id='J_goodsList']/ul/li"):
is_self_operate_div = ''
if x.xpath(".//div[@class='p-name p-name-type-2']"):
shop_name = x.xpath(".//span[@class='J_im_icon']/a/@title")[0] if x.xpath(
".//span[@class='J_im_icon']/a/@title") else ''
dataList.append(
{
'is_self_operate_div': is_self_operate_div,
'item_url': 'https://item.jd.com/' + str(x.xpath("./@data-sku")[0]) + '.html',
'sku_id': str(x.xpath("./@data-sku")[0]),
'shop_name': shop_name,
'sku_name': '‘'.join(
''.join(x.xpath(".//div[@class='p-name p-name-type-2']//em//text()")).split(
"'")).strip(),
'sku_price': x.xpath(".//div[@class='p-price']//i//text()")[0]
}
)
insert_sql = f"""
INSERT INTO DMP_SEARCH_DATA(id,result,mallId)
VALUES ({row['id']},'{json.dumps(dataList, ensure_ascii=False).replace("'", "''")}','{row['mallId']}')
"""
mysql_zdzs_proxy(insert_sql)
try:
for x in Response.xpath("//div[@id='J_goodsList']/ul/li"):
is_self_operate_div = ''
if x.xpath(".//div[@class='p-name p-name-type-2']"):
shop_name = x.xpath(".//span[@class='J_im_icon']/a/@title")[0] if x.xpath(
".//span[@class='J_im_icon']/a/@title") else ''
dataList.append(
{
'is_self_operate_div': is_self_operate_div,
'item_url': 'https://item.jd.com/' + str(x.xpath("./@data-sku")[0]) + '.html',
'sku_id': str(x.xpath("./@data-sku")[0]),
'shop_name': shop_name,
'sku_name': '‘'.join(
''.join(x.xpath(".//div[@class='p-name p-name-type-2']//em//text()")).split(
"'")).strip(),
'sku_price': x.xpath(".//div[@class='p-price']//i//text()")[0]
}
)
insert_sql = f"""
INSERT INTO DMP_SEARCH_DATA(id,result,mallId)
VALUES ({row['id']},'{json.dumps(dataList, ensure_ascii=False).replace("'", "''")}','{row['mallId']}')
"""
mysql_zdzs_proxy(insert_sql)
except Exception as e:
print(e)
print("保存代码异常")
continue
print("保存成功")
pass
return
......@@ -164,7 +172,8 @@ if __name__ == "__main__":
option.add_experimental_option('excludeSwitches', ['enable-automation'])
driver = webdriver.Chrome(options=option)
option.add_argument("--disable-blink-features=AutomationControlled")
driver.set_window_size(700, 600)
driver.set_window_size(1200, 600)
# get_cookie(username=config.account_list[0]['username'],password=config.account_list[0]['password'],driver=driver)
while True:
for account in config.account_list:
username = account['username']
......
......@@ -2,6 +2,7 @@
【京东搜索采集自动验证版Windows10部署文档】https://www.tapd.cn/36769433/prong/stories/view/1136769433001003855
"""
import time
from selenium.webdriver.common.keys import Keys
from urllib import request
import cv2
import numpy as np
......@@ -68,16 +69,19 @@ def mysql_zdzs_proxy(sql=None):
def get_cookie(username=None,password=None,driver=None):
# driver.maximize_window()
driver.get('https://passport.jd.com/uc/login')
time.sleep(0.7)
print(f"清空文本框开始")
driver.find_element(by=By.ID, value='loginname').clear()
time.sleep(0.7)
driver.find_element(by=By.ID, value='nloginpwd').clear()
time.sleep(1.7)
print(f"清空文本框结束")
# 获取输入框中的现有内容的长度
content_length = len(driver.find_element(by=By.ID, value='loginname').get_attribute('value'))
# 逐个删除所有字符
for _ in range(content_length):
driver.find_element(by=By.ID, value='loginname').send_keys(Keys.BACK_SPACE)
content_length = len(driver.find_element(by=By.ID, value='nloginpwd').get_attribute('value'))
# 逐个删除所有字符
for _ in range(content_length):
driver.find_element(by=By.ID, value='nloginpwd').send_keys(Keys.BACK_SPACE)
print(f"清空文本框成功")
driver.find_element(by=By.ID, value='loginname').send_keys(username)
driver.find_element(by=By.ID, value='nloginpwd').send_keys(password)
# 登录
print(f"点击登录开始")
driver.find_element(by=By.ID, value='loginsubmit').click()
......@@ -87,7 +91,7 @@ def get_cookie(username=None,password=None,driver=None):
count = count + 1
print(f"需要登录")
time.sleep(1)
query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '84' and data_batch ='202404180000000004' and id like '%2' limit 30000"
query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '84' and data_batch ='202404180000000002' and id like '%2' limit 30000"
query_spider = mysql_zdzs_proxy(query_spider_sql)
index = 0
len_ = len(query_spider)
......@@ -124,33 +128,37 @@ def get_cookie(username=None,password=None,driver=None):
Response = etree.HTML(text)
dataList = []
for x in Response.xpath("//div[@id='J_goodsList']/ul/li"):
is_self_operate_div = ''
if x.xpath(".//div[@class='p-name p-name-type-2']"):
shop_name = x.xpath(".//span[@class='J_im_icon']/a/@title")[0] if x.xpath(
".//span[@class='J_im_icon']/a/@title") else ''
dataList.append(
{
'is_self_operate_div': is_self_operate_div,
'item_url': 'https://item.jd.com/' + str(x.xpath("./@data-sku")[0]) + '.html',
'sku_id': str(x.xpath("./@data-sku")[0]),
'shop_name': shop_name,
'sku_name': '‘'.join(
''.join(x.xpath(".//div[@class='p-name p-name-type-2']//em//text()")).split(
"'")).strip(),
'sku_price': x.xpath(".//div[@class='p-price']//i//text()")[0]
}
)
insert_sql = f"""
INSERT INTO DMP_SEARCH_DATA(id,result,mallId)
VALUES ({row['id']},'{json.dumps(dataList, ensure_ascii=False).replace("'", "''")}','{row['mallId']}')
"""
mysql_zdzs_proxy(insert_sql)
try:
for x in Response.xpath("//div[@id='J_goodsList']/ul/li"):
is_self_operate_div = ''
if x.xpath(".//div[@class='p-name p-name-type-2']"):
shop_name = x.xpath(".//span[@class='J_im_icon']/a/@title")[0] if x.xpath(
".//span[@class='J_im_icon']/a/@title") else ''
dataList.append(
{
'is_self_operate_div': is_self_operate_div,
'item_url': 'https://item.jd.com/' + str(x.xpath("./@data-sku")[0]) + '.html',
'sku_id': str(x.xpath("./@data-sku")[0]),
'shop_name': shop_name,
'sku_name': '‘'.join(
''.join(x.xpath(".//div[@class='p-name p-name-type-2']//em//text()")).split(
"'")).strip(),
'sku_price': x.xpath(".//div[@class='p-price']//i//text()")[0]
}
)
insert_sql = f"""
INSERT INTO DMP_SEARCH_DATA(id,result,mallId)
VALUES ({row['id']},'{json.dumps(dataList, ensure_ascii=False).replace("'", "''")}','{row['mallId']}')
"""
mysql_zdzs_proxy(insert_sql)
except Exception as e:
print(e)
print("保存代码异常")
continue
print("保存成功")
pass
return
......@@ -164,7 +172,8 @@ if __name__ == "__main__":
option.add_experimental_option('excludeSwitches', ['enable-automation'])
driver = webdriver.Chrome(options=option)
option.add_argument("--disable-blink-features=AutomationControlled")
driver.set_window_size(700, 600)
driver.set_window_size(1200, 600)
# get_cookie(username=config.account_list[0]['username'],password=config.account_list[0]['password'],driver=driver)
while True:
for account in config.account_list:
username = account['username']
......
......@@ -2,6 +2,7 @@
【京东搜索采集自动验证版Windows10部署文档】https://www.tapd.cn/36769433/prong/stories/view/1136769433001003855
"""
import time
from selenium.webdriver.common.keys import Keys
from urllib import request
import cv2
import numpy as np
......@@ -68,16 +69,19 @@ def mysql_zdzs_proxy(sql=None):
def get_cookie(username=None,password=None,driver=None):
# driver.maximize_window()
driver.get('https://passport.jd.com/uc/login')
time.sleep(0.7)
print(f"清空文本框开始")
driver.find_element(by=By.ID, value='loginname').clear()
time.sleep(0.7)
driver.find_element(by=By.ID, value='nloginpwd').clear()
time.sleep(1.7)
print(f"清空文本框结束")
# 获取输入框中的现有内容的长度
content_length = len(driver.find_element(by=By.ID, value='loginname').get_attribute('value'))
# 逐个删除所有字符
for _ in range(content_length):
driver.find_element(by=By.ID, value='loginname').send_keys(Keys.BACK_SPACE)
content_length = len(driver.find_element(by=By.ID, value='nloginpwd').get_attribute('value'))
# 逐个删除所有字符
for _ in range(content_length):
driver.find_element(by=By.ID, value='nloginpwd').send_keys(Keys.BACK_SPACE)
print(f"清空文本框成功")
driver.find_element(by=By.ID, value='loginname').send_keys(username)
driver.find_element(by=By.ID, value='nloginpwd').send_keys(password)
# 登录
print(f"点击登录开始")
driver.find_element(by=By.ID, value='loginsubmit').click()
......@@ -87,7 +91,7 @@ def get_cookie(username=None,password=None,driver=None):
count = count + 1
print(f"需要登录")
time.sleep(1)
query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '84' and data_batch ='202404180000000004' and id like '%3' limit 30000"
query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '84' and data_batch ='202404180000000002' and id like '%3' limit 30000"
query_spider = mysql_zdzs_proxy(query_spider_sql)
index = 0
len_ = len(query_spider)
......@@ -124,33 +128,37 @@ def get_cookie(username=None,password=None,driver=None):
Response = etree.HTML(text)
dataList = []
for x in Response.xpath("//div[@id='J_goodsList']/ul/li"):
is_self_operate_div = ''
if x.xpath(".//div[@class='p-name p-name-type-2']"):
shop_name = x.xpath(".//span[@class='J_im_icon']/a/@title")[0] if x.xpath(
".//span[@class='J_im_icon']/a/@title") else ''
dataList.append(
{
'is_self_operate_div': is_self_operate_div,
'item_url': 'https://item.jd.com/' + str(x.xpath("./@data-sku")[0]) + '.html',
'sku_id': str(x.xpath("./@data-sku")[0]),
'shop_name': shop_name,
'sku_name': '‘'.join(
''.join(x.xpath(".//div[@class='p-name p-name-type-2']//em//text()")).split(
"'")).strip(),
'sku_price': x.xpath(".//div[@class='p-price']//i//text()")[0]
}
)
insert_sql = f"""
INSERT INTO DMP_SEARCH_DATA(id,result,mallId)
VALUES ({row['id']},'{json.dumps(dataList, ensure_ascii=False).replace("'", "''")}','{row['mallId']}')
"""
mysql_zdzs_proxy(insert_sql)
try:
for x in Response.xpath("//div[@id='J_goodsList']/ul/li"):
is_self_operate_div = ''
if x.xpath(".//div[@class='p-name p-name-type-2']"):
shop_name = x.xpath(".//span[@class='J_im_icon']/a/@title")[0] if x.xpath(
".//span[@class='J_im_icon']/a/@title") else ''
dataList.append(
{
'is_self_operate_div': is_self_operate_div,
'item_url': 'https://item.jd.com/' + str(x.xpath("./@data-sku")[0]) + '.html',
'sku_id': str(x.xpath("./@data-sku")[0]),
'shop_name': shop_name,
'sku_name': '‘'.join(
''.join(x.xpath(".//div[@class='p-name p-name-type-2']//em//text()")).split(
"'")).strip(),
'sku_price': x.xpath(".//div[@class='p-price']//i//text()")[0]
}
)
insert_sql = f"""
INSERT INTO DMP_SEARCH_DATA(id,result,mallId)
VALUES ({row['id']},'{json.dumps(dataList, ensure_ascii=False).replace("'", "''")}','{row['mallId']}')
"""
mysql_zdzs_proxy(insert_sql)
except Exception as e:
print(e)
print("保存代码异常")
continue
print("保存成功")
pass
return
......@@ -164,7 +172,8 @@ if __name__ == "__main__":
option.add_experimental_option('excludeSwitches', ['enable-automation'])
driver = webdriver.Chrome(options=option)
option.add_argument("--disable-blink-features=AutomationControlled")
driver.set_window_size(700, 600)
driver.set_window_size(1200, 600)
# get_cookie(username=config.account_list[0]['username'],password=config.account_list[0]['password'],driver=driver)
while True:
for account in config.account_list:
username = account['username']
......
......@@ -2,6 +2,7 @@
【京东搜索采集自动验证版Windows10部署文档】https://www.tapd.cn/36769433/prong/stories/view/1136769433001003855
"""
import time
from selenium.webdriver.common.keys import Keys
from urllib import request
import cv2
import numpy as np
......@@ -68,16 +69,19 @@ def mysql_zdzs_proxy(sql=None):
def get_cookie(username=None,password=None,driver=None):
# driver.maximize_window()
driver.get('https://passport.jd.com/uc/login')
time.sleep(0.7)
print(f"清空文本框开始")
driver.find_element(by=By.ID, value='loginname').clear()
time.sleep(0.7)
driver.find_element(by=By.ID, value='nloginpwd').clear()
time.sleep(1.7)
print(f"清空文本框结束")
# 获取输入框中的现有内容的长度
content_length = len(driver.find_element(by=By.ID, value='loginname').get_attribute('value'))
# 逐个删除所有字符
for _ in range(content_length):
driver.find_element(by=By.ID, value='loginname').send_keys(Keys.BACK_SPACE)
content_length = len(driver.find_element(by=By.ID, value='nloginpwd').get_attribute('value'))
# 逐个删除所有字符
for _ in range(content_length):
driver.find_element(by=By.ID, value='nloginpwd').send_keys(Keys.BACK_SPACE)
print(f"清空文本框成功")
driver.find_element(by=By.ID, value='loginname').send_keys(username)
driver.find_element(by=By.ID, value='nloginpwd').send_keys(password)
# 登录
print(f"点击登录开始")
driver.find_element(by=By.ID, value='loginsubmit').click()
......@@ -87,7 +91,7 @@ def get_cookie(username=None,password=None,driver=None):
count = count + 1
print(f"需要登录")
time.sleep(1)
query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '84' and data_batch ='202404180000000004' and id like '%4' limit 30000"
query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '84' and data_batch ='202404180000000002' and id like '%4' limit 30000"
query_spider = mysql_zdzs_proxy(query_spider_sql)
index = 0
len_ = len(query_spider)
......@@ -124,33 +128,37 @@ def get_cookie(username=None,password=None,driver=None):
Response = etree.HTML(text)
dataList = []
for x in Response.xpath("//div[@id='J_goodsList']/ul/li"):
is_self_operate_div = ''
if x.xpath(".//div[@class='p-name p-name-type-2']"):
shop_name = x.xpath(".//span[@class='J_im_icon']/a/@title")[0] if x.xpath(
".//span[@class='J_im_icon']/a/@title") else ''
dataList.append(
{
'is_self_operate_div': is_self_operate_div,
'item_url': 'https://item.jd.com/' + str(x.xpath("./@data-sku")[0]) + '.html',
'sku_id': str(x.xpath("./@data-sku")[0]),
'shop_name': shop_name,
'sku_name': '‘'.join(
''.join(x.xpath(".//div[@class='p-name p-name-type-2']//em//text()")).split(
"'")).strip(),
'sku_price': x.xpath(".//div[@class='p-price']//i//text()")[0]
}
)
insert_sql = f"""
INSERT INTO DMP_SEARCH_DATA(id,result,mallId)
VALUES ({row['id']},'{json.dumps(dataList, ensure_ascii=False).replace("'", "''")}','{row['mallId']}')
"""
mysql_zdzs_proxy(insert_sql)
try:
for x in Response.xpath("//div[@id='J_goodsList']/ul/li"):
is_self_operate_div = ''
if x.xpath(".//div[@class='p-name p-name-type-2']"):
shop_name = x.xpath(".//span[@class='J_im_icon']/a/@title")[0] if x.xpath(
".//span[@class='J_im_icon']/a/@title") else ''
dataList.append(
{
'is_self_operate_div': is_self_operate_div,
'item_url': 'https://item.jd.com/' + str(x.xpath("./@data-sku")[0]) + '.html',
'sku_id': str(x.xpath("./@data-sku")[0]),
'shop_name': shop_name,
'sku_name': '‘'.join(
''.join(x.xpath(".//div[@class='p-name p-name-type-2']//em//text()")).split(
"'")).strip(),
'sku_price': x.xpath(".//div[@class='p-price']//i//text()")[0]
}
)
insert_sql = f"""
INSERT INTO DMP_SEARCH_DATA(id,result,mallId)
VALUES ({row['id']},'{json.dumps(dataList, ensure_ascii=False).replace("'", "''")}','{row['mallId']}')
"""
mysql_zdzs_proxy(insert_sql)
except Exception as e:
print(e)
print("保存代码异常")
continue
print("保存成功")
pass
return
......@@ -164,7 +172,8 @@ if __name__ == "__main__":
option.add_experimental_option('excludeSwitches', ['enable-automation'])
driver = webdriver.Chrome(options=option)
option.add_argument("--disable-blink-features=AutomationControlled")
driver.set_window_size(700, 600)
driver.set_window_size(1200, 600)
# get_cookie(username=config.account_list[0]['username'],password=config.account_list[0]['password'],driver=driver)
while True:
for account in config.account_list:
username = account['username']
......
......@@ -2,6 +2,7 @@
【京东搜索采集自动验证版Windows10部署文档】https://www.tapd.cn/36769433/prong/stories/view/1136769433001003855
"""
import time
from selenium.webdriver.common.keys import Keys
from urllib import request
import cv2
import numpy as np
......@@ -68,16 +69,19 @@ def mysql_zdzs_proxy(sql=None):
def get_cookie(username=None,password=None,driver=None):
# driver.maximize_window()
driver.get('https://passport.jd.com/uc/login')
time.sleep(0.7)
print(f"清空文本框开始")
driver.find_element(by=By.ID, value='loginname').clear()
time.sleep(0.7)
driver.find_element(by=By.ID, value='nloginpwd').clear()
time.sleep(1.7)
print(f"清空文本框结束")
# 获取输入框中的现有内容的长度
content_length = len(driver.find_element(by=By.ID, value='loginname').get_attribute('value'))
# 逐个删除所有字符
for _ in range(content_length):
driver.find_element(by=By.ID, value='loginname').send_keys(Keys.BACK_SPACE)
content_length = len(driver.find_element(by=By.ID, value='nloginpwd').get_attribute('value'))
# 逐个删除所有字符
for _ in range(content_length):
driver.find_element(by=By.ID, value='nloginpwd').send_keys(Keys.BACK_SPACE)
print(f"清空文本框成功")
driver.find_element(by=By.ID, value='loginname').send_keys(username)
driver.find_element(by=By.ID, value='nloginpwd').send_keys(password)
# 登录
print(f"点击登录开始")
driver.find_element(by=By.ID, value='loginsubmit').click()
......@@ -87,7 +91,7 @@ def get_cookie(username=None,password=None,driver=None):
count = count + 1
print(f"需要登录")
time.sleep(1)
query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '84' and data_batch ='202404180000000004' and id like '%5' limit 30000"
query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '84' and data_batch ='202404180000000002' and id like '%5' limit 30000"
query_spider = mysql_zdzs_proxy(query_spider_sql)
index = 0
len_ = len(query_spider)
......@@ -124,33 +128,37 @@ def get_cookie(username=None,password=None,driver=None):
Response = etree.HTML(text)
dataList = []
for x in Response.xpath("//div[@id='J_goodsList']/ul/li"):
is_self_operate_div = ''
if x.xpath(".//div[@class='p-name p-name-type-2']"):
shop_name = x.xpath(".//span[@class='J_im_icon']/a/@title")[0] if x.xpath(
".//span[@class='J_im_icon']/a/@title") else ''
dataList.append(
{
'is_self_operate_div': is_self_operate_div,
'item_url': 'https://item.jd.com/' + str(x.xpath("./@data-sku")[0]) + '.html',
'sku_id': str(x.xpath("./@data-sku")[0]),
'shop_name': shop_name,
'sku_name': '‘'.join(
''.join(x.xpath(".//div[@class='p-name p-name-type-2']//em//text()")).split(
"'")).strip(),
'sku_price': x.xpath(".//div[@class='p-price']//i//text()")[0]
}
)
insert_sql = f"""
INSERT INTO DMP_SEARCH_DATA(id,result,mallId)
VALUES ({row['id']},'{json.dumps(dataList, ensure_ascii=False).replace("'", "''")}','{row['mallId']}')
"""
mysql_zdzs_proxy(insert_sql)
try:
for x in Response.xpath("//div[@id='J_goodsList']/ul/li"):
is_self_operate_div = ''
if x.xpath(".//div[@class='p-name p-name-type-2']"):
shop_name = x.xpath(".//span[@class='J_im_icon']/a/@title")[0] if x.xpath(
".//span[@class='J_im_icon']/a/@title") else ''
dataList.append(
{
'is_self_operate_div': is_self_operate_div,
'item_url': 'https://item.jd.com/' + str(x.xpath("./@data-sku")[0]) + '.html',
'sku_id': str(x.xpath("./@data-sku")[0]),
'shop_name': shop_name,
'sku_name': '‘'.join(
''.join(x.xpath(".//div[@class='p-name p-name-type-2']//em//text()")).split(
"'")).strip(),
'sku_price': x.xpath(".//div[@class='p-price']//i//text()")[0]
}
)
insert_sql = f"""
INSERT INTO DMP_SEARCH_DATA(id,result,mallId)
VALUES ({row['id']},'{json.dumps(dataList, ensure_ascii=False).replace("'", "''")}','{row['mallId']}')
"""
mysql_zdzs_proxy(insert_sql)
except Exception as e:
print(e)
print("保存代码异常")
continue
print("保存成功")
pass
return
......@@ -164,7 +172,8 @@ if __name__ == "__main__":
option.add_experimental_option('excludeSwitches', ['enable-automation'])
driver = webdriver.Chrome(options=option)
option.add_argument("--disable-blink-features=AutomationControlled")
driver.set_window_size(700, 600)
driver.set_window_size(1200, 600)
# get_cookie(username=config.account_list[0]['username'],password=config.account_list[0]['password'],driver=driver)
while True:
for account in config.account_list:
username = account['username']
......
......@@ -2,6 +2,7 @@
【京东搜索采集自动验证版Windows10部署文档】https://www.tapd.cn/36769433/prong/stories/view/1136769433001003855
"""
import time
from selenium.webdriver.common.keys import Keys
from urllib import request
import cv2
import numpy as np
......@@ -68,16 +69,19 @@ def mysql_zdzs_proxy(sql=None):
def get_cookie(username=None,password=None,driver=None):
# driver.maximize_window()
driver.get('https://passport.jd.com/uc/login')
time.sleep(0.7)
print(f"清空文本框开始")
driver.find_element(by=By.ID, value='loginname').clear()
time.sleep(0.7)
driver.find_element(by=By.ID, value='nloginpwd').clear()
time.sleep(1.7)
print(f"清空文本框结束")
# 获取输入框中的现有内容的长度
content_length = len(driver.find_element(by=By.ID, value='loginname').get_attribute('value'))
# 逐个删除所有字符
for _ in range(content_length):
driver.find_element(by=By.ID, value='loginname').send_keys(Keys.BACK_SPACE)
content_length = len(driver.find_element(by=By.ID, value='nloginpwd').get_attribute('value'))
# 逐个删除所有字符
for _ in range(content_length):
driver.find_element(by=By.ID, value='nloginpwd').send_keys(Keys.BACK_SPACE)
print(f"清空文本框成功")
driver.find_element(by=By.ID, value='loginname').send_keys(username)
driver.find_element(by=By.ID, value='nloginpwd').send_keys(password)
# 登录
print(f"点击登录开始")
driver.find_element(by=By.ID, value='loginsubmit').click()
......@@ -87,7 +91,7 @@ def get_cookie(username=None,password=None,driver=None):
count = count + 1
print(f"需要登录")
time.sleep(1)
query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '84' and data_batch ='202404180000000004' and id like '%6' limit 30000"
query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '84' and data_batch ='202404180000000002' and id like '%6' limit 30000"
query_spider = mysql_zdzs_proxy(query_spider_sql)
index = 0
len_ = len(query_spider)
......@@ -124,33 +128,37 @@ def get_cookie(username=None,password=None,driver=None):
Response = etree.HTML(text)
dataList = []
for x in Response.xpath("//div[@id='J_goodsList']/ul/li"):
is_self_operate_div = ''
if x.xpath(".//div[@class='p-name p-name-type-2']"):
shop_name = x.xpath(".//span[@class='J_im_icon']/a/@title")[0] if x.xpath(
".//span[@class='J_im_icon']/a/@title") else ''
dataList.append(
{
'is_self_operate_div': is_self_operate_div,
'item_url': 'https://item.jd.com/' + str(x.xpath("./@data-sku")[0]) + '.html',
'sku_id': str(x.xpath("./@data-sku")[0]),
'shop_name': shop_name,
'sku_name': '‘'.join(
''.join(x.xpath(".//div[@class='p-name p-name-type-2']//em//text()")).split(
"'")).strip(),
'sku_price': x.xpath(".//div[@class='p-price']//i//text()")[0]
}
)
insert_sql = f"""
INSERT INTO DMP_SEARCH_DATA(id,result,mallId)
VALUES ({row['id']},'{json.dumps(dataList, ensure_ascii=False).replace("'", "''")}','{row['mallId']}')
"""
mysql_zdzs_proxy(insert_sql)
try:
for x in Response.xpath("//div[@id='J_goodsList']/ul/li"):
is_self_operate_div = ''
if x.xpath(".//div[@class='p-name p-name-type-2']"):
shop_name = x.xpath(".//span[@class='J_im_icon']/a/@title")[0] if x.xpath(
".//span[@class='J_im_icon']/a/@title") else ''
dataList.append(
{
'is_self_operate_div': is_self_operate_div,
'item_url': 'https://item.jd.com/' + str(x.xpath("./@data-sku")[0]) + '.html',
'sku_id': str(x.xpath("./@data-sku")[0]),
'shop_name': shop_name,
'sku_name': '‘'.join(
''.join(x.xpath(".//div[@class='p-name p-name-type-2']//em//text()")).split(
"'")).strip(),
'sku_price': x.xpath(".//div[@class='p-price']//i//text()")[0]
}
)
insert_sql = f"""
INSERT INTO DMP_SEARCH_DATA(id,result,mallId)
VALUES ({row['id']},'{json.dumps(dataList, ensure_ascii=False).replace("'", "''")}','{row['mallId']}')
"""
mysql_zdzs_proxy(insert_sql)
except Exception as e:
print(e)
print("保存代码异常")
continue
print("保存成功")
pass
return
......@@ -164,7 +172,8 @@ if __name__ == "__main__":
option.add_experimental_option('excludeSwitches', ['enable-automation'])
driver = webdriver.Chrome(options=option)
option.add_argument("--disable-blink-features=AutomationControlled")
driver.set_window_size(700, 600)
driver.set_window_size(1200, 600)
# get_cookie(username=config.account_list[0]['username'],password=config.account_list[0]['password'],driver=driver)
while True:
for account in config.account_list:
username = account['username']
......
......@@ -2,6 +2,7 @@
【京东搜索采集自动验证版Windows10部署文档】https://www.tapd.cn/36769433/prong/stories/view/1136769433001003855
"""
import time
from selenium.webdriver.common.keys import Keys
from urllib import request
import cv2
import numpy as np
......@@ -68,16 +69,19 @@ def mysql_zdzs_proxy(sql=None):
def get_cookie(username=None,password=None,driver=None):
# driver.maximize_window()
driver.get('https://passport.jd.com/uc/login')
time.sleep(0.7)
print(f"清空文本框开始")
driver.find_element(by=By.ID, value='loginname').clear()
time.sleep(0.7)
driver.find_element(by=By.ID, value='nloginpwd').clear()
time.sleep(1.7)
print(f"清空文本框结束")
# 获取输入框中的现有内容的长度
content_length = len(driver.find_element(by=By.ID, value='loginname').get_attribute('value'))
# 逐个删除所有字符
for _ in range(content_length):
driver.find_element(by=By.ID, value='loginname').send_keys(Keys.BACK_SPACE)
content_length = len(driver.find_element(by=By.ID, value='nloginpwd').get_attribute('value'))
# 逐个删除所有字符
for _ in range(content_length):
driver.find_element(by=By.ID, value='nloginpwd').send_keys(Keys.BACK_SPACE)
print(f"清空文本框成功")
driver.find_element(by=By.ID, value='loginname').send_keys(username)
driver.find_element(by=By.ID, value='nloginpwd').send_keys(password)
# 登录
print(f"点击登录开始")
driver.find_element(by=By.ID, value='loginsubmit').click()
......@@ -87,7 +91,7 @@ def get_cookie(username=None,password=None,driver=None):
count = count + 1
print(f"需要登录")
time.sleep(1)
query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '84' and data_batch ='202404180000000004' and id like '%7' limit 30000"
query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '84' and data_batch ='202404180000000002' and id like '%7' limit 30000"
query_spider = mysql_zdzs_proxy(query_spider_sql)
index = 0
len_ = len(query_spider)
......@@ -124,33 +128,37 @@ def get_cookie(username=None,password=None,driver=None):
Response = etree.HTML(text)
dataList = []
for x in Response.xpath("//div[@id='J_goodsList']/ul/li"):
is_self_operate_div = ''
if x.xpath(".//div[@class='p-name p-name-type-2']"):
shop_name = x.xpath(".//span[@class='J_im_icon']/a/@title")[0] if x.xpath(
".//span[@class='J_im_icon']/a/@title") else ''
dataList.append(
{
'is_self_operate_div': is_self_operate_div,
'item_url': 'https://item.jd.com/' + str(x.xpath("./@data-sku")[0]) + '.html',
'sku_id': str(x.xpath("./@data-sku")[0]),
'shop_name': shop_name,
'sku_name': '‘'.join(
''.join(x.xpath(".//div[@class='p-name p-name-type-2']//em//text()")).split(
"'")).strip(),
'sku_price': x.xpath(".//div[@class='p-price']//i//text()")[0]
}
)
insert_sql = f"""
INSERT INTO DMP_SEARCH_DATA(id,result,mallId)
VALUES ({row['id']},'{json.dumps(dataList, ensure_ascii=False).replace("'", "''")}','{row['mallId']}')
"""
mysql_zdzs_proxy(insert_sql)
try:
for x in Response.xpath("//div[@id='J_goodsList']/ul/li"):
is_self_operate_div = ''
if x.xpath(".//div[@class='p-name p-name-type-2']"):
shop_name = x.xpath(".//span[@class='J_im_icon']/a/@title")[0] if x.xpath(
".//span[@class='J_im_icon']/a/@title") else ''
dataList.append(
{
'is_self_operate_div': is_self_operate_div,
'item_url': 'https://item.jd.com/' + str(x.xpath("./@data-sku")[0]) + '.html',
'sku_id': str(x.xpath("./@data-sku")[0]),
'shop_name': shop_name,
'sku_name': '‘'.join(
''.join(x.xpath(".//div[@class='p-name p-name-type-2']//em//text()")).split(
"'")).strip(),
'sku_price': x.xpath(".//div[@class='p-price']//i//text()")[0]
}
)
insert_sql = f"""
INSERT INTO DMP_SEARCH_DATA(id,result,mallId)
VALUES ({row['id']},'{json.dumps(dataList, ensure_ascii=False).replace("'", "''")}','{row['mallId']}')
"""
mysql_zdzs_proxy(insert_sql)
except Exception as e:
print(e)
print("保存代码异常")
continue
print("保存成功")
pass
return
......@@ -164,7 +172,8 @@ if __name__ == "__main__":
option.add_experimental_option('excludeSwitches', ['enable-automation'])
driver = webdriver.Chrome(options=option)
option.add_argument("--disable-blink-features=AutomationControlled")
driver.set_window_size(700, 600)
driver.set_window_size(1200, 600)
# get_cookie(username=config.account_list[0]['username'],password=config.account_list[0]['password'],driver=driver)
while True:
for account in config.account_list:
username = account['username']
......
......@@ -2,6 +2,7 @@
【京东搜索采集自动验证版Windows10部署文档】https://www.tapd.cn/36769433/prong/stories/view/1136769433001003855
"""
import time
from selenium.webdriver.common.keys import Keys
from urllib import request
import cv2
import numpy as np
......@@ -68,16 +69,19 @@ def mysql_zdzs_proxy(sql=None):
def get_cookie(username=None,password=None,driver=None):
# driver.maximize_window()
driver.get('https://passport.jd.com/uc/login')
time.sleep(0.7)
print(f"清空文本框开始")
driver.find_element(by=By.ID, value='loginname').clear()
time.sleep(0.7)
driver.find_element(by=By.ID, value='nloginpwd').clear()
time.sleep(1.7)
print(f"清空文本框结束")
# 获取输入框中的现有内容的长度
content_length = len(driver.find_element(by=By.ID, value='loginname').get_attribute('value'))
# 逐个删除所有字符
for _ in range(content_length):
driver.find_element(by=By.ID, value='loginname').send_keys(Keys.BACK_SPACE)
content_length = len(driver.find_element(by=By.ID, value='nloginpwd').get_attribute('value'))
# 逐个删除所有字符
for _ in range(content_length):
driver.find_element(by=By.ID, value='nloginpwd').send_keys(Keys.BACK_SPACE)
print(f"清空文本框成功")
driver.find_element(by=By.ID, value='loginname').send_keys(username)
driver.find_element(by=By.ID, value='nloginpwd').send_keys(password)
# 登录
print(f"点击登录开始")
driver.find_element(by=By.ID, value='loginsubmit').click()
......@@ -87,7 +91,7 @@ def get_cookie(username=None,password=None,driver=None):
count = count + 1
print(f"需要登录")
time.sleep(1)
query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '84' and data_batch ='202404180000000004' and id like '%8' limit 30000"
query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '84' and data_batch ='202404180000000002' and id like '%8' limit 30000"
query_spider = mysql_zdzs_proxy(query_spider_sql)
index = 0
len_ = len(query_spider)
......@@ -124,33 +128,37 @@ def get_cookie(username=None,password=None,driver=None):
Response = etree.HTML(text)
dataList = []
for x in Response.xpath("//div[@id='J_goodsList']/ul/li"):
is_self_operate_div = ''
if x.xpath(".//div[@class='p-name p-name-type-2']"):
shop_name = x.xpath(".//span[@class='J_im_icon']/a/@title")[0] if x.xpath(
".//span[@class='J_im_icon']/a/@title") else ''
dataList.append(
{
'is_self_operate_div': is_self_operate_div,
'item_url': 'https://item.jd.com/' + str(x.xpath("./@data-sku")[0]) + '.html',
'sku_id': str(x.xpath("./@data-sku")[0]),
'shop_name': shop_name,
'sku_name': '‘'.join(
''.join(x.xpath(".//div[@class='p-name p-name-type-2']//em//text()")).split(
"'")).strip(),
'sku_price': x.xpath(".//div[@class='p-price']//i//text()")[0]
}
)
insert_sql = f"""
INSERT INTO DMP_SEARCH_DATA(id,result,mallId)
VALUES ({row['id']},'{json.dumps(dataList, ensure_ascii=False).replace("'", "''")}','{row['mallId']}')
"""
mysql_zdzs_proxy(insert_sql)
try:
for x in Response.xpath("//div[@id='J_goodsList']/ul/li"):
is_self_operate_div = ''
if x.xpath(".//div[@class='p-name p-name-type-2']"):
shop_name = x.xpath(".//span[@class='J_im_icon']/a/@title")[0] if x.xpath(
".//span[@class='J_im_icon']/a/@title") else ''
dataList.append(
{
'is_self_operate_div': is_self_operate_div,
'item_url': 'https://item.jd.com/' + str(x.xpath("./@data-sku")[0]) + '.html',
'sku_id': str(x.xpath("./@data-sku")[0]),
'shop_name': shop_name,
'sku_name': '‘'.join(
''.join(x.xpath(".//div[@class='p-name p-name-type-2']//em//text()")).split(
"'")).strip(),
'sku_price': x.xpath(".//div[@class='p-price']//i//text()")[0]
}
)
insert_sql = f"""
INSERT INTO DMP_SEARCH_DATA(id,result,mallId)
VALUES ({row['id']},'{json.dumps(dataList, ensure_ascii=False).replace("'", "''")}','{row['mallId']}')
"""
mysql_zdzs_proxy(insert_sql)
except Exception as e:
print(e)
print("保存代码异常")
continue
print("保存成功")
pass
return
......@@ -164,7 +172,8 @@ if __name__ == "__main__":
option.add_experimental_option('excludeSwitches', ['enable-automation'])
driver = webdriver.Chrome(options=option)
option.add_argument("--disable-blink-features=AutomationControlled")
driver.set_window_size(700, 600)
driver.set_window_size(1200, 600)
# get_cookie(username=config.account_list[0]['username'],password=config.account_list[0]['password'],driver=driver)
while True:
for account in config.account_list:
username = account['username']
......
......@@ -2,6 +2,7 @@
【京东搜索采集自动验证版Windows10部署文档】https://www.tapd.cn/36769433/prong/stories/view/1136769433001003855
"""
import time
from selenium.webdriver.common.keys import Keys
from urllib import request
import cv2
import numpy as np
......@@ -68,16 +69,19 @@ def mysql_zdzs_proxy(sql=None):
def get_cookie(username=None,password=None,driver=None):
# driver.maximize_window()
driver.get('https://passport.jd.com/uc/login')
time.sleep(0.7)
print(f"清空文本框开始")
driver.find_element(by=By.ID, value='loginname').clear()
time.sleep(0.7)
driver.find_element(by=By.ID, value='nloginpwd').clear()
time.sleep(1.7)
print(f"清空文本框结束")
# 获取输入框中的现有内容的长度
content_length = len(driver.find_element(by=By.ID, value='loginname').get_attribute('value'))
# 逐个删除所有字符
for _ in range(content_length):
driver.find_element(by=By.ID, value='loginname').send_keys(Keys.BACK_SPACE)
content_length = len(driver.find_element(by=By.ID, value='nloginpwd').get_attribute('value'))
# 逐个删除所有字符
for _ in range(content_length):
driver.find_element(by=By.ID, value='nloginpwd').send_keys(Keys.BACK_SPACE)
print(f"清空文本框成功")
driver.find_element(by=By.ID, value='loginname').send_keys(username)
driver.find_element(by=By.ID, value='nloginpwd').send_keys(password)
# 登录
print(f"点击登录开始")
driver.find_element(by=By.ID, value='loginsubmit').click()
......@@ -87,7 +91,7 @@ def get_cookie(username=None,password=None,driver=None):
count = count + 1
print(f"需要登录")
time.sleep(1)
query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '84' and data_batch ='202404180000000004' and id like '%9' limit 30000"
query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '84' and data_batch ='202404180000000002' and id like '%9' limit 30000"
query_spider = mysql_zdzs_proxy(query_spider_sql)
index = 0
len_ = len(query_spider)
......@@ -124,33 +128,37 @@ def get_cookie(username=None,password=None,driver=None):
Response = etree.HTML(text)
dataList = []
for x in Response.xpath("//div[@id='J_goodsList']/ul/li"):
is_self_operate_div = ''
if x.xpath(".//div[@class='p-name p-name-type-2']"):
shop_name = x.xpath(".//span[@class='J_im_icon']/a/@title")[0] if x.xpath(
".//span[@class='J_im_icon']/a/@title") else ''
dataList.append(
{
'is_self_operate_div': is_self_operate_div,
'item_url': 'https://item.jd.com/' + str(x.xpath("./@data-sku")[0]) + '.html',
'sku_id': str(x.xpath("./@data-sku")[0]),
'shop_name': shop_name,
'sku_name': '‘'.join(
''.join(x.xpath(".//div[@class='p-name p-name-type-2']//em//text()")).split(
"'")).strip(),
'sku_price': x.xpath(".//div[@class='p-price']//i//text()")[0]
}
)
insert_sql = f"""
INSERT INTO DMP_SEARCH_DATA(id,result,mallId)
VALUES ({row['id']},'{json.dumps(dataList, ensure_ascii=False).replace("'", "''")}','{row['mallId']}')
"""
mysql_zdzs_proxy(insert_sql)
try:
for x in Response.xpath("//div[@id='J_goodsList']/ul/li"):
is_self_operate_div = ''
if x.xpath(".//div[@class='p-name p-name-type-2']"):
shop_name = x.xpath(".//span[@class='J_im_icon']/a/@title")[0] if x.xpath(
".//span[@class='J_im_icon']/a/@title") else ''
dataList.append(
{
'is_self_operate_div': is_self_operate_div,
'item_url': 'https://item.jd.com/' + str(x.xpath("./@data-sku")[0]) + '.html',
'sku_id': str(x.xpath("./@data-sku")[0]),
'shop_name': shop_name,
'sku_name': '‘'.join(
''.join(x.xpath(".//div[@class='p-name p-name-type-2']//em//text()")).split(
"'")).strip(),
'sku_price': x.xpath(".//div[@class='p-price']//i//text()")[0]
}
)
insert_sql = f"""
INSERT INTO DMP_SEARCH_DATA(id,result,mallId)
VALUES ({row['id']},'{json.dumps(dataList, ensure_ascii=False).replace("'", "''")}','{row['mallId']}')
"""
mysql_zdzs_proxy(insert_sql)
except Exception as e:
print(e)
print("保存代码异常")
continue
print("保存成功")
pass
return
......@@ -164,7 +172,8 @@ if __name__ == "__main__":
option.add_experimental_option('excludeSwitches', ['enable-automation'])
driver = webdriver.Chrome(options=option)
option.add_argument("--disable-blink-features=AutomationControlled")
driver.set_window_size(700, 600)
driver.set_window_size(1200, 600)
# get_cookie(username=config.account_list[0]['username'],password=config.account_list[0]['password'],driver=driver)
while True:
for account in config.account_list:
username = account['username']
......
......@@ -361,7 +361,7 @@ def get_cookie():
# query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and data_batch ='202403051815460001' order by id" \
# f" limit 100000"
# query_spider_sql = f"-- SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '41' and (id like '%5' or id like '%6' or id like '%7' or id like '%8' or id like '%9' ) limit 10000"
query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE data_batch ='202404180000000001' and ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD'"
query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '84' and data_batch ='202404180000000002' "
query_spider = mysql_zdzs_proxy(query_spider_sql)
index = 0
len_ = len(query_spider)
......@@ -573,33 +573,37 @@ def get_cookie():
Response = etree.HTML(text)
dataList = []
for x in Response.xpath("//div[@id='J_goodsList']/ul/li"):
is_self_operate_div = ''
if x.xpath(".//div[@class='p-name p-name-type-2']"):
shop_name = x.xpath(".//span[@class='J_im_icon']/a/@title")[0] if x.xpath(
".//span[@class='J_im_icon']/a/@title") else ''
dataList.append(
{
'is_self_operate_div': is_self_operate_div,
'item_url': 'https://item.jd.com/' + str(x.xpath("./@data-sku")[0]) + '.html',
'sku_id': str(x.xpath("./@data-sku")[0]),
'shop_name': shop_name,
'sku_name': '‘'.join(
''.join(x.xpath(".//div[@class='p-name p-name-type-2']//em//text()")).split(
"'")).strip(),
'sku_price': x.xpath(".//div[@class='p-price']//i//text()")[0]
}
)
insert_sql = f"""
INSERT INTO DMP_SEARCH_DATA(id,result,mallId)
VALUES ({row['id']},'{json.dumps(dataList, ensure_ascii=False).replace("'", "''")}','{row['mallId']}')
"""
mysql_zdzs_proxy(insert_sql)
try:
for x in Response.xpath("//div[@id='J_goodsList']/ul/li"):
is_self_operate_div = ''
if x.xpath(".//div[@class='p-name p-name-type-2']"):
shop_name = x.xpath(".//span[@class='J_im_icon']/a/@title")[0] if x.xpath(
".//span[@class='J_im_icon']/a/@title") else ''
dataList.append(
{
'is_self_operate_div': is_self_operate_div,
'item_url': 'https://item.jd.com/' + str(x.xpath("./@data-sku")[0]) + '.html',
'sku_id': str(x.xpath("./@data-sku")[0]),
'shop_name': shop_name,
'sku_name': '‘'.join(
''.join(x.xpath(".//div[@class='p-name p-name-type-2']//em//text()")).split(
"'")).strip(),
'sku_price': x.xpath(".//div[@class='p-price']//i//text()")[0]
}
)
insert_sql = f"""
INSERT INTO DMP_SEARCH_DATA(id,result,mallId)
VALUES ({row['id']},'{json.dumps(dataList, ensure_ascii=False).replace("'", "''")}','{row['mallId']}')
"""
mysql_zdzs_proxy(insert_sql)
except Exception as e:
print(e)
print("保存代码异常")
continue
print("保存成功")
updateCookie(driver)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment