Commit f4cab4c0 authored by huangziyu's avatar huangziyu

1

parent 2f5289b0
"""
【京东搜索采集自动验证版Windows10部署文档】https://www.tapd.cn/36769433/prong/stories/view/1136769433001003855
"""
import time
from urllib import request
import cv2
import numpy as np
import pyautogui
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ChromeOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pymysql
from lxml import etree
# 登陆账号
from selenium.webdriver.common.by import By
import openpyxl
import time
import random
import json
from openpyxl import Workbook
from PIL import Image
import re
from bs4 import BeautifulSoup
import config
# Connection settings for the zdzs_proxy MySQL database.
# NOTE(review): credentials are hard-coded in source; consider moving them to config.
mysql_zdzs_proxy_host = '59.110.219.171'
mysql_zdzs_proxy_user = 'zgcindex'
mysql_zdzs_proxy_password = 'zgcprice2019'
mysql_zdzs_proxy_database = 'zdzs_proxy'
# Extra offset value (depends on the screen resolution of the machine).
offset_increase = 0

# Block until the database is reachable, retrying every 5 seconds.
conn = None
while conn is None:
    try:
        conn = pymysql.connect(
            host=mysql_zdzs_proxy_host,
            user=mysql_zdzs_proxy_user,
            password=mysql_zdzs_proxy_password,
            database=mysql_zdzs_proxy_database,
            charset="utf8mb4",
            cursorclass=pymysql.cursors.DictCursor,
            autocommit=True,
        )
    except Exception as e:
        print(e)
        time.sleep(5)
# Single shared cursor used by mysql_zdzs_proxy().
cur = conn.cursor()
def mysql_zdzs_proxy(sql=None, args=None):
    """Run *sql* on the shared module-level cursor and return all fetched rows.

    :param sql: SQL statement to execute (it is echoed to stdout first).
    :param args: optional parameters for a parameterized query; when None the
        statement is executed verbatim, exactly as before.
    :return: the rows from ``cur.fetchall()``, or ``[]`` if execution failed.
    """
    print(sql)
    try:
        cur.execute(sql, args)
        return cur.fetchall()
    except pymysql.err.IntegrityError as e:
        # Constraint violations (e.g. duplicate key) are tolerated, but reported.
        print(f"SQL integrity error: {e}")
        return []
    except Exception as e:
        # Best-effort helper: never raise, but do not hide the failure either.
        print(f"SQL execution failed: {e}")
        return []
def get_cookie(username=None,password=None,driver=None):
    """Log in to JD with one account and scrape this worker's pending search URLs.

    Pending rows are read from DMP_SEARCH_SPIDER (ids not yet present in
    DMP_SEARCH_DATA); only ids whose last digit is "0" are handled here. For
    each URL the product list is parsed and stored as JSON in DMP_SEARCH_DATA.

    :param username: JD login name.
    :param password: JD login password.
    :param driver: Selenium WebDriver used for all page loads.
    :return: None. Returns early (so the caller can switch account) when JD
        shows the home page or the login page instead of search results.
        Quits the driver and exits the process when nothing is pending.
    """
    home_title = '京东(JD.COM)-正品低价、品质保障、配送及时、轻松购物!'
    driver.get('https://passport.jd.com/uc/login')
    driver.find_element(by=By.ID, value='loginname').send_keys(username)
    driver.find_element(by=By.ID, value='nloginpwd').send_keys(password)
    time.sleep(0.5)
    # Submit the login form.
    driver.find_element(by=By.ID, value='loginsubmit').click()
    # Poll until the home page title appears, i.e. the login went through.
    # NOTE(review): the original comment says more than 100 polls should count
    # as a failure, but no limit is enforced on `count`.
    count = 0
    while driver.title != home_title:
        count = count + 1
        print("需要登录")
        time.sleep(1)
    query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '84' and data_batch ='202404180000000004' limit 30000"
    query_spider = mysql_zdzs_proxy(query_spider_sql)
    # Shard the work: this worker only takes ids ending in "0".
    query_spider_list = [row for row in query_spider if str(row['id'])[-1] == "0"]
    index = 0
    len_ = len(query_spider_list)
    print(f"len={len_},index={index}")
    if len_ == 0:
        print("查询不到需要搜索的链接,结束程序。")
        driver.quit()
        exit()
    for row in query_spider_list:
        index += 1
        print(f"len={len_},index={index}")
        url = row['url']
        print(url)
        driver.get(url)
        if driver.title == home_title:
            print(url)
            print("跳转首页了,结束,切换账号")
            return
        text = driver.page_source
        # Wait while a verification / busy / loading page is being shown.
        while (('快速验证' in text and '验证一下,购物无忧' in text)
               or '前方拥挤,请刷新重试' in text
               or 'class="title">加载中...</span></div></div><!----></div>' in text):
            print("需要验证,睡眠1秒")
            time.sleep(1)
            text = driver.page_source
        # Redirected to the login page: this account is no longer usable.
        if '<title>京东-欢迎登录</title>' in text:
            print("需要登录,结束,切换账号")
            return
        if driver.title == home_title:
            print("验证后还是首页,结束,切换账号")
            return
        Response = etree.HTML(text)
        dataList = []
        for x in Response.xpath("//div[@id='J_goodsList']/ul/li"):
            if not x.xpath(".//div[@class='p-name p-name-type-2']"):
                continue
            sku_ids = x.xpath("./@data-sku")
            if not sku_ids:
                # No sku id on this entry: skip it instead of raising IndexError.
                continue
            sku_id = str(sku_ids[0])
            shop_titles = x.xpath(".//span[@class='J_im_icon']/a/@title")
            prices = x.xpath(".//div[@class='p-price']//i//text()")
            raw_name = ''.join(x.xpath(".//div[@class='p-name p-name-type-2']//em//text()"))
            dataList.append(
                {
                    'is_self_operate_div': '',
                    'item_url': 'https://item.jd.com/' + sku_id + '.html',
                    'sku_id': sku_id,
                    'shop_name': shop_titles[0] if shop_titles else '',
                    'sku_name': raw_name.replace("'", '‘').strip(),
                    # A missing price no longer aborts the whole run.
                    'sku_price': prices[0] if prices else '',
                }
            )
        # Escape through the connection so backslashes emitted by json.dumps
        # (e.g. \" inside names) survive the SQL string literal intact.
        payload = conn.escape_string(json.dumps(dataList, ensure_ascii=False))
        insert_sql = f"""
        INSERT INTO DMP_SEARCH_DATA(id,result,mallId)
        VALUES ({row['id']},'{payload}','{row['mallId']}')
        """
        mysql_zdzs_proxy(insert_sql)
        print("保存成功")
    return
if __name__ == "__main__":
    option = ChromeOptions()
    # Raw string: the backslashes in the Windows path are literal characters.
    option.binary_location = r'D:\ChromeMaYi\guge\chrome.exe'
    # Headless mode must stay off: with it on, images cannot be recognized.
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Must be added before the driver is created, otherwise it has no effect.
    option.add_argument("--disable-blink-features=AutomationControlled")
    driver = webdriver.Chrome(options=option)
    try:
        driver.set_window_size(700, 600)
        # Cycle through the accounts forever; get_cookie() ends the process
        # itself once there is nothing left to scrape.
        while True:
            for account in config.account_list:
                username = account['username']
                # Login password.
                password = account['password']
                print(f"切换账号:{username}")
                get_cookie(username=username,password=password,driver=driver)
    finally:
        # Always release the browser, including on errors and SystemExit.
        driver.quit()
\ No newline at end of file
"""
【京东搜索采集自动验证版Windows10部署文档】https://www.tapd.cn/36769433/prong/stories/view/1136769433001003855
"""
import time
from urllib import request
import cv2
import numpy as np
import pyautogui
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ChromeOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pymysql
from lxml import etree
# 登陆账号
from selenium.webdriver.common.by import By
import openpyxl
import time
import random
import json
from openpyxl import Workbook
from PIL import Image
import re
from bs4 import BeautifulSoup
import config
# Connection settings for the zdzs_proxy MySQL database.
# NOTE(review): credentials are hard-coded in source; consider moving them to config.
mysql_zdzs_proxy_host = '59.110.219.171'
mysql_zdzs_proxy_user = 'zgcindex'
mysql_zdzs_proxy_password = 'zgcprice2019'
mysql_zdzs_proxy_database = 'zdzs_proxy'
# Extra offset value (depends on the screen resolution of the machine).
offset_increase = 0

# Block until the database is reachable, retrying every 5 seconds.
conn = None
while conn is None:
    try:
        conn = pymysql.connect(
            host=mysql_zdzs_proxy_host,
            user=mysql_zdzs_proxy_user,
            password=mysql_zdzs_proxy_password,
            database=mysql_zdzs_proxy_database,
            charset="utf8mb4",
            cursorclass=pymysql.cursors.DictCursor,
            autocommit=True,
        )
    except Exception as e:
        print(e)
        time.sleep(5)
# Single shared cursor used by mysql_zdzs_proxy().
cur = conn.cursor()
def mysql_zdzs_proxy(sql=None, args=None):
    """Run *sql* on the shared module-level cursor and return all fetched rows.

    :param sql: SQL statement to execute (it is echoed to stdout first).
    :param args: optional parameters for a parameterized query; when None the
        statement is executed verbatim, exactly as before.
    :return: the rows from ``cur.fetchall()``, or ``[]`` if execution failed.
    """
    print(sql)
    try:
        cur.execute(sql, args)
        return cur.fetchall()
    except pymysql.err.IntegrityError as e:
        # Constraint violations (e.g. duplicate key) are tolerated, but reported.
        print(f"SQL integrity error: {e}")
        return []
    except Exception as e:
        # Best-effort helper: never raise, but do not hide the failure either.
        print(f"SQL execution failed: {e}")
        return []
def get_cookie(username=None,password=None,driver=None):
    """Log in to JD with one account and scrape this worker's pending search URLs.

    Pending rows are read from DMP_SEARCH_SPIDER (ids not yet present in
    DMP_SEARCH_DATA); only ids whose last digit is "1" are handled here. For
    each URL the product list is parsed and stored as JSON in DMP_SEARCH_DATA.

    :param username: JD login name.
    :param password: JD login password.
    :param driver: Selenium WebDriver used for all page loads.
    :return: None. Returns early (so the caller can switch account) when JD
        shows the home page or the login page instead of search results.
        Quits the driver and exits the process when nothing is pending.
    """
    home_title = '京东(JD.COM)-正品低价、品质保障、配送及时、轻松购物!'
    driver.get('https://passport.jd.com/uc/login')
    driver.find_element(by=By.ID, value='loginname').send_keys(username)
    driver.find_element(by=By.ID, value='nloginpwd').send_keys(password)
    time.sleep(0.5)
    # Submit the login form.
    driver.find_element(by=By.ID, value='loginsubmit').click()
    # Poll until the home page title appears, i.e. the login went through.
    # NOTE(review): the original comment says more than 100 polls should count
    # as a failure, but no limit is enforced on `count`.
    count = 0
    while driver.title != home_title:
        count = count + 1
        print("需要登录")
        time.sleep(1)
    query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '84' and data_batch ='202404180000000004' limit 30000"
    query_spider = mysql_zdzs_proxy(query_spider_sql)
    # Shard the work: this worker only takes ids ending in "1".
    query_spider_list = [row for row in query_spider if str(row['id'])[-1] == "1"]
    index = 0
    len_ = len(query_spider_list)
    print(f"len={len_},index={index}")
    if len_ == 0:
        print("查询不到需要搜索的链接,结束程序。")
        driver.quit()
        exit()
    for row in query_spider_list:
        index += 1
        print(f"len={len_},index={index}")
        url = row['url']
        print(url)
        driver.get(url)
        if driver.title == home_title:
            print(url)
            print("跳转首页了,结束,切换账号")
            return
        text = driver.page_source
        # Wait while a verification / busy / loading page is being shown.
        while (('快速验证' in text and '验证一下,购物无忧' in text)
               or '前方拥挤,请刷新重试' in text
               or 'class="title">加载中...</span></div></div><!----></div>' in text):
            print("需要验证,睡眠1秒")
            time.sleep(1)
            text = driver.page_source
        # Redirected to the login page: this account is no longer usable.
        if '<title>京东-欢迎登录</title>' in text:
            print("需要登录,结束,切换账号")
            return
        if driver.title == home_title:
            print("验证后还是首页,结束,切换账号")
            return
        Response = etree.HTML(text)
        dataList = []
        for x in Response.xpath("//div[@id='J_goodsList']/ul/li"):
            if not x.xpath(".//div[@class='p-name p-name-type-2']"):
                continue
            sku_ids = x.xpath("./@data-sku")
            if not sku_ids:
                # No sku id on this entry: skip it instead of raising IndexError.
                continue
            sku_id = str(sku_ids[0])
            shop_titles = x.xpath(".//span[@class='J_im_icon']/a/@title")
            prices = x.xpath(".//div[@class='p-price']//i//text()")
            raw_name = ''.join(x.xpath(".//div[@class='p-name p-name-type-2']//em//text()"))
            dataList.append(
                {
                    'is_self_operate_div': '',
                    'item_url': 'https://item.jd.com/' + sku_id + '.html',
                    'sku_id': sku_id,
                    'shop_name': shop_titles[0] if shop_titles else '',
                    'sku_name': raw_name.replace("'", '‘').strip(),
                    # A missing price no longer aborts the whole run.
                    'sku_price': prices[0] if prices else '',
                }
            )
        # Escape through the connection so backslashes emitted by json.dumps
        # (e.g. \" inside names) survive the SQL string literal intact.
        payload = conn.escape_string(json.dumps(dataList, ensure_ascii=False))
        insert_sql = f"""
        INSERT INTO DMP_SEARCH_DATA(id,result,mallId)
        VALUES ({row['id']},'{payload}','{row['mallId']}')
        """
        mysql_zdzs_proxy(insert_sql)
        print("保存成功")
    return
if __name__ == "__main__":
    option = ChromeOptions()
    # Raw string: the backslashes in the Windows path are literal characters.
    option.binary_location = r'D:\ChromeMaYi\guge\chrome.exe'
    # Headless mode must stay off: with it on, images cannot be recognized.
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Must be added before the driver is created, otherwise it has no effect.
    option.add_argument("--disable-blink-features=AutomationControlled")
    driver = webdriver.Chrome(options=option)
    try:
        driver.set_window_size(700, 600)
        # Cycle through the accounts forever; get_cookie() ends the process
        # itself once there is nothing left to scrape.
        while True:
            for account in config.account_list:
                username = account['username']
                # Login password.
                password = account['password']
                print(f"切换账号:{username}")
                get_cookie(username=username,password=password,driver=driver)
    finally:
        # Always release the browser, including on errors and SystemExit.
        driver.quit()
\ No newline at end of file
"""
【京东搜索采集自动验证版Windows10部署文档】https://www.tapd.cn/36769433/prong/stories/view/1136769433001003855
"""
import time
from urllib import request
import cv2
import numpy as np
import pyautogui
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ChromeOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pymysql
from lxml import etree
# 登陆账号
from selenium.webdriver.common.by import By
import openpyxl
import time
import random
import json
from openpyxl import Workbook
from PIL import Image
import re
from bs4 import BeautifulSoup
import config
# Connection settings for the zdzs_proxy MySQL database.
# NOTE(review): credentials are hard-coded in source; consider moving them to config.
mysql_zdzs_proxy_host = '59.110.219.171'
mysql_zdzs_proxy_user = 'zgcindex'
mysql_zdzs_proxy_password = 'zgcprice2019'
mysql_zdzs_proxy_database = 'zdzs_proxy'
# Extra offset value (depends on the screen resolution of the machine).
offset_increase = 0

# Block until the database is reachable, retrying every 5 seconds.
conn = None
while conn is None:
    try:
        conn = pymysql.connect(
            host=mysql_zdzs_proxy_host,
            user=mysql_zdzs_proxy_user,
            password=mysql_zdzs_proxy_password,
            database=mysql_zdzs_proxy_database,
            charset="utf8mb4",
            cursorclass=pymysql.cursors.DictCursor,
            autocommit=True,
        )
    except Exception as e:
        print(e)
        time.sleep(5)
# Single shared cursor used by mysql_zdzs_proxy().
cur = conn.cursor()
def mysql_zdzs_proxy(sql=None, args=None):
    """Run *sql* on the shared module-level cursor and return all fetched rows.

    :param sql: SQL statement to execute (it is echoed to stdout first).
    :param args: optional parameters for a parameterized query; when None the
        statement is executed verbatim, exactly as before.
    :return: the rows from ``cur.fetchall()``, or ``[]`` if execution failed.
    """
    print(sql)
    try:
        cur.execute(sql, args)
        return cur.fetchall()
    except pymysql.err.IntegrityError as e:
        # Constraint violations (e.g. duplicate key) are tolerated, but reported.
        print(f"SQL integrity error: {e}")
        return []
    except Exception as e:
        # Best-effort helper: never raise, but do not hide the failure either.
        print(f"SQL execution failed: {e}")
        return []
def get_cookie(username=None,password=None,driver=None):
    """Log in to JD with one account and scrape this worker's pending search URLs.

    Pending rows are read from DMP_SEARCH_SPIDER (ids not yet present in
    DMP_SEARCH_DATA); only ids whose last digit is "2" are handled here. For
    each URL the product list is parsed and stored as JSON in DMP_SEARCH_DATA.

    :param username: JD login name.
    :param password: JD login password.
    :param driver: Selenium WebDriver used for all page loads.
    :return: None. Returns early (so the caller can switch account) when JD
        shows the home page or the login page instead of search results.
        Quits the driver and exits the process when nothing is pending.
    """
    home_title = '京东(JD.COM)-正品低价、品质保障、配送及时、轻松购物!'
    driver.get('https://passport.jd.com/uc/login')
    driver.find_element(by=By.ID, value='loginname').send_keys(username)
    driver.find_element(by=By.ID, value='nloginpwd').send_keys(password)
    time.sleep(0.5)
    # Submit the login form.
    driver.find_element(by=By.ID, value='loginsubmit').click()
    # Poll until the home page title appears, i.e. the login went through.
    # NOTE(review): the original comment says more than 100 polls should count
    # as a failure, but no limit is enforced on `count`.
    count = 0
    while driver.title != home_title:
        count = count + 1
        print("需要登录")
        time.sleep(1)
    query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '84' and data_batch ='202404180000000004' limit 30000"
    query_spider = mysql_zdzs_proxy(query_spider_sql)
    # Shard the work: this worker only takes ids ending in "2".
    query_spider_list = [row for row in query_spider if str(row['id'])[-1] == "2"]
    index = 0
    len_ = len(query_spider_list)
    print(f"len={len_},index={index}")
    if len_ == 0:
        print("查询不到需要搜索的链接,结束程序。")
        driver.quit()
        exit()
    for row in query_spider_list:
        index += 1
        print(f"len={len_},index={index}")
        url = row['url']
        print(url)
        driver.get(url)
        if driver.title == home_title:
            print(url)
            print("跳转首页了,结束,切换账号")
            return
        text = driver.page_source
        # Wait while a verification / busy / loading page is being shown.
        while (('快速验证' in text and '验证一下,购物无忧' in text)
               or '前方拥挤,请刷新重试' in text
               or 'class="title">加载中...</span></div></div><!----></div>' in text):
            print("需要验证,睡眠1秒")
            time.sleep(1)
            text = driver.page_source
        # Redirected to the login page: this account is no longer usable.
        if '<title>京东-欢迎登录</title>' in text:
            print("需要登录,结束,切换账号")
            return
        if driver.title == home_title:
            print("验证后还是首页,结束,切换账号")
            return
        Response = etree.HTML(text)
        dataList = []
        for x in Response.xpath("//div[@id='J_goodsList']/ul/li"):
            if not x.xpath(".//div[@class='p-name p-name-type-2']"):
                continue
            sku_ids = x.xpath("./@data-sku")
            if not sku_ids:
                # No sku id on this entry: skip it instead of raising IndexError.
                continue
            sku_id = str(sku_ids[0])
            shop_titles = x.xpath(".//span[@class='J_im_icon']/a/@title")
            prices = x.xpath(".//div[@class='p-price']//i//text()")
            raw_name = ''.join(x.xpath(".//div[@class='p-name p-name-type-2']//em//text()"))
            dataList.append(
                {
                    'is_self_operate_div': '',
                    'item_url': 'https://item.jd.com/' + sku_id + '.html',
                    'sku_id': sku_id,
                    'shop_name': shop_titles[0] if shop_titles else '',
                    'sku_name': raw_name.replace("'", '‘').strip(),
                    # A missing price no longer aborts the whole run.
                    'sku_price': prices[0] if prices else '',
                }
            )
        # Escape through the connection so backslashes emitted by json.dumps
        # (e.g. \" inside names) survive the SQL string literal intact.
        payload = conn.escape_string(json.dumps(dataList, ensure_ascii=False))
        insert_sql = f"""
        INSERT INTO DMP_SEARCH_DATA(id,result,mallId)
        VALUES ({row['id']},'{payload}','{row['mallId']}')
        """
        mysql_zdzs_proxy(insert_sql)
        print("保存成功")
    return
if __name__ == "__main__":
    option = ChromeOptions()
    # Raw string: the backslashes in the Windows path are literal characters.
    option.binary_location = r'D:\ChromeMaYi\guge\chrome.exe'
    # Headless mode must stay off: with it on, images cannot be recognized.
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Must be added before the driver is created, otherwise it has no effect.
    option.add_argument("--disable-blink-features=AutomationControlled")
    driver = webdriver.Chrome(options=option)
    try:
        driver.set_window_size(700, 600)
        # Cycle through the accounts forever; get_cookie() ends the process
        # itself once there is nothing left to scrape.
        while True:
            for account in config.account_list:
                username = account['username']
                # Login password.
                password = account['password']
                print(f"切换账号:{username}")
                get_cookie(username=username,password=password,driver=driver)
    finally:
        # Always release the browser, including on errors and SystemExit.
        driver.quit()
\ No newline at end of file
"""
【京东搜索采集自动验证版Windows10部署文档】https://www.tapd.cn/36769433/prong/stories/view/1136769433001003855
"""
import time
from urllib import request
import cv2
import numpy as np
import pyautogui
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ChromeOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pymysql
from lxml import etree
# 登陆账号
from selenium.webdriver.common.by import By
import openpyxl
import time
import random
import json
from openpyxl import Workbook
from PIL import Image
import re
from bs4 import BeautifulSoup
import config
# Connection settings for the zdzs_proxy MySQL database.
# NOTE(review): credentials are hard-coded in source; consider moving them to config.
mysql_zdzs_proxy_host = '59.110.219.171'
mysql_zdzs_proxy_user = 'zgcindex'
mysql_zdzs_proxy_password = 'zgcprice2019'
mysql_zdzs_proxy_database = 'zdzs_proxy'
# Extra offset value (depends on the screen resolution of the machine).
offset_increase = 0

# Block until the database is reachable, retrying every 5 seconds.
conn = None
while conn is None:
    try:
        conn = pymysql.connect(
            host=mysql_zdzs_proxy_host,
            user=mysql_zdzs_proxy_user,
            password=mysql_zdzs_proxy_password,
            database=mysql_zdzs_proxy_database,
            charset="utf8mb4",
            cursorclass=pymysql.cursors.DictCursor,
            autocommit=True,
        )
    except Exception as e:
        print(e)
        time.sleep(5)
# Single shared cursor used by mysql_zdzs_proxy().
cur = conn.cursor()
def mysql_zdzs_proxy(sql=None, args=None):
    """Run *sql* on the shared module-level cursor and return all fetched rows.

    :param sql: SQL statement to execute (it is echoed to stdout first).
    :param args: optional parameters for a parameterized query; when None the
        statement is executed verbatim, exactly as before.
    :return: the rows from ``cur.fetchall()``, or ``[]`` if execution failed.
    """
    print(sql)
    try:
        cur.execute(sql, args)
        return cur.fetchall()
    except pymysql.err.IntegrityError as e:
        # Constraint violations (e.g. duplicate key) are tolerated, but reported.
        print(f"SQL integrity error: {e}")
        return []
    except Exception as e:
        # Best-effort helper: never raise, but do not hide the failure either.
        print(f"SQL execution failed: {e}")
        return []
def get_cookie(username=None,password=None,driver=None):
    """Log in to JD with one account and scrape this worker's pending search URLs.

    Pending rows are read from DMP_SEARCH_SPIDER (ids not yet present in
    DMP_SEARCH_DATA); only ids whose last digit is "3" are handled here. For
    each URL the product list is parsed and stored as JSON in DMP_SEARCH_DATA.

    :param username: JD login name.
    :param password: JD login password.
    :param driver: Selenium WebDriver used for all page loads.
    :return: None. Returns early (so the caller can switch account) when JD
        shows the home page or the login page instead of search results.
        Quits the driver and exits the process when nothing is pending.
    """
    home_title = '京东(JD.COM)-正品低价、品质保障、配送及时、轻松购物!'
    driver.get('https://passport.jd.com/uc/login')
    driver.find_element(by=By.ID, value='loginname').send_keys(username)
    driver.find_element(by=By.ID, value='nloginpwd').send_keys(password)
    time.sleep(0.5)
    # Submit the login form.
    driver.find_element(by=By.ID, value='loginsubmit').click()
    # Poll until the home page title appears, i.e. the login went through.
    # NOTE(review): the original comment says more than 100 polls should count
    # as a failure, but no limit is enforced on `count`.
    count = 0
    while driver.title != home_title:
        count = count + 1
        print("需要登录")
        time.sleep(1)
    query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '84' and data_batch ='202404180000000004' limit 30000"
    query_spider = mysql_zdzs_proxy(query_spider_sql)
    # Shard the work: this worker only takes ids ending in "3".
    query_spider_list = [row for row in query_spider if str(row['id'])[-1] == "3"]
    index = 0
    len_ = len(query_spider_list)
    print(f"len={len_},index={index}")
    if len_ == 0:
        print("查询不到需要搜索的链接,结束程序。")
        driver.quit()
        exit()
    for row in query_spider_list:
        index += 1
        print(f"len={len_},index={index}")
        url = row['url']
        print(url)
        driver.get(url)
        if driver.title == home_title:
            print(url)
            print("跳转首页了,结束,切换账号")
            return
        text = driver.page_source
        # Wait while a verification / busy / loading page is being shown.
        while (('快速验证' in text and '验证一下,购物无忧' in text)
               or '前方拥挤,请刷新重试' in text
               or 'class="title">加载中...</span></div></div><!----></div>' in text):
            print("需要验证,睡眠1秒")
            time.sleep(1)
            text = driver.page_source
        # Redirected to the login page: this account is no longer usable.
        if '<title>京东-欢迎登录</title>' in text:
            print("需要登录,结束,切换账号")
            return
        if driver.title == home_title:
            print("验证后还是首页,结束,切换账号")
            return
        Response = etree.HTML(text)
        dataList = []
        for x in Response.xpath("//div[@id='J_goodsList']/ul/li"):
            if not x.xpath(".//div[@class='p-name p-name-type-2']"):
                continue
            sku_ids = x.xpath("./@data-sku")
            if not sku_ids:
                # No sku id on this entry: skip it instead of raising IndexError.
                continue
            sku_id = str(sku_ids[0])
            shop_titles = x.xpath(".//span[@class='J_im_icon']/a/@title")
            prices = x.xpath(".//div[@class='p-price']//i//text()")
            raw_name = ''.join(x.xpath(".//div[@class='p-name p-name-type-2']//em//text()"))
            dataList.append(
                {
                    'is_self_operate_div': '',
                    'item_url': 'https://item.jd.com/' + sku_id + '.html',
                    'sku_id': sku_id,
                    'shop_name': shop_titles[0] if shop_titles else '',
                    'sku_name': raw_name.replace("'", '‘').strip(),
                    # A missing price no longer aborts the whole run.
                    'sku_price': prices[0] if prices else '',
                }
            )
        # Escape through the connection so backslashes emitted by json.dumps
        # (e.g. \" inside names) survive the SQL string literal intact.
        payload = conn.escape_string(json.dumps(dataList, ensure_ascii=False))
        insert_sql = f"""
        INSERT INTO DMP_SEARCH_DATA(id,result,mallId)
        VALUES ({row['id']},'{payload}','{row['mallId']}')
        """
        mysql_zdzs_proxy(insert_sql)
        print("保存成功")
    return
if __name__ == "__main__":
    option = ChromeOptions()
    # Raw string: the backslashes in the Windows path are literal characters.
    option.binary_location = r'D:\ChromeMaYi\guge\chrome.exe'
    # Headless mode must stay off: with it on, images cannot be recognized.
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Must be added before the driver is created, otherwise it has no effect.
    option.add_argument("--disable-blink-features=AutomationControlled")
    driver = webdriver.Chrome(options=option)
    try:
        driver.set_window_size(700, 600)
        # Cycle through the accounts forever; get_cookie() ends the process
        # itself once there is nothing left to scrape.
        while True:
            for account in config.account_list:
                username = account['username']
                # Login password.
                password = account['password']
                print(f"切换账号:{username}")
                get_cookie(username=username,password=password,driver=driver)
    finally:
        # Always release the browser, including on errors and SystemExit.
        driver.quit()
\ No newline at end of file
"""
【京东搜索采集自动验证版Windows10部署文档】https://www.tapd.cn/36769433/prong/stories/view/1136769433001003855
"""
import time
from urllib import request
import cv2
import numpy as np
import pyautogui
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ChromeOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pymysql
from lxml import etree
# 登陆账号
from selenium.webdriver.common.by import By
import openpyxl
import time
import random
import json
from openpyxl import Workbook
from PIL import Image
import re
from bs4 import BeautifulSoup
import config
# Connection settings for the zdzs_proxy MySQL database.
# NOTE(review): credentials are hard-coded in source; consider moving them to config.
mysql_zdzs_proxy_host = '59.110.219.171'
mysql_zdzs_proxy_user = 'zgcindex'
mysql_zdzs_proxy_password = 'zgcprice2019'
mysql_zdzs_proxy_database = 'zdzs_proxy'
# Extra offset value (depends on the screen resolution of the machine).
offset_increase = 0

# Block until the database is reachable, retrying every 5 seconds.
conn = None
while conn is None:
    try:
        conn = pymysql.connect(
            host=mysql_zdzs_proxy_host,
            user=mysql_zdzs_proxy_user,
            password=mysql_zdzs_proxy_password,
            database=mysql_zdzs_proxy_database,
            charset="utf8mb4",
            cursorclass=pymysql.cursors.DictCursor,
            autocommit=True,
        )
    except Exception as e:
        print(e)
        time.sleep(5)
# Single shared cursor used by mysql_zdzs_proxy().
cur = conn.cursor()
def mysql_zdzs_proxy(sql=None, args=None):
    """Run *sql* on the shared module-level cursor and return all fetched rows.

    :param sql: SQL statement to execute (it is echoed to stdout first).
    :param args: optional parameters for a parameterized query; when None the
        statement is executed verbatim, exactly as before.
    :return: the rows from ``cur.fetchall()``, or ``[]`` if execution failed.
    """
    print(sql)
    try:
        cur.execute(sql, args)
        return cur.fetchall()
    except pymysql.err.IntegrityError as e:
        # Constraint violations (e.g. duplicate key) are tolerated, but reported.
        print(f"SQL integrity error: {e}")
        return []
    except Exception as e:
        # Best-effort helper: never raise, but do not hide the failure either.
        print(f"SQL execution failed: {e}")
        return []
def get_cookie(username=None,password=None,driver=None):
    """Log in to JD with one account and scrape this worker's pending search URLs.

    Pending rows are read from DMP_SEARCH_SPIDER (ids not yet present in
    DMP_SEARCH_DATA); only ids whose last digit is "4" are handled here. For
    each URL the product list is parsed and stored as JSON in DMP_SEARCH_DATA.

    :param username: JD login name.
    :param password: JD login password.
    :param driver: Selenium WebDriver used for all page loads.
    :return: None. Returns early (so the caller can switch account) when JD
        shows the home page or the login page instead of search results.
        Quits the driver and exits the process when nothing is pending.
    """
    home_title = '京东(JD.COM)-正品低价、品质保障、配送及时、轻松购物!'
    driver.get('https://passport.jd.com/uc/login')
    driver.find_element(by=By.ID, value='loginname').send_keys(username)
    driver.find_element(by=By.ID, value='nloginpwd').send_keys(password)
    time.sleep(0.5)
    # Submit the login form.
    driver.find_element(by=By.ID, value='loginsubmit').click()
    # Poll until the home page title appears, i.e. the login went through.
    # NOTE(review): the original comment says more than 100 polls should count
    # as a failure, but no limit is enforced on `count`.
    count = 0
    while driver.title != home_title:
        count = count + 1
        print("需要登录")
        time.sleep(1)
    query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '84' and data_batch ='202404180000000004' limit 30000"
    query_spider = mysql_zdzs_proxy(query_spider_sql)
    # Shard the work: this worker only takes ids ending in "4".
    query_spider_list = [row for row in query_spider if str(row['id'])[-1] == "4"]
    index = 0
    len_ = len(query_spider_list)
    print(f"len={len_},index={index}")
    if len_ == 0:
        print("查询不到需要搜索的链接,结束程序。")
        driver.quit()
        exit()
    for row in query_spider_list:
        index += 1
        print(f"len={len_},index={index}")
        url = row['url']
        print(url)
        driver.get(url)
        if driver.title == home_title:
            print(url)
            print("跳转首页了,结束,切换账号")
            return
        text = driver.page_source
        # Wait while a verification / busy / loading page is being shown.
        while (('快速验证' in text and '验证一下,购物无忧' in text)
               or '前方拥挤,请刷新重试' in text
               or 'class="title">加载中...</span></div></div><!----></div>' in text):
            print("需要验证,睡眠1秒")
            time.sleep(1)
            text = driver.page_source
        # Redirected to the login page: this account is no longer usable.
        if '<title>京东-欢迎登录</title>' in text:
            print("需要登录,结束,切换账号")
            return
        if driver.title == home_title:
            print("验证后还是首页,结束,切换账号")
            return
        Response = etree.HTML(text)
        dataList = []
        for x in Response.xpath("//div[@id='J_goodsList']/ul/li"):
            if not x.xpath(".//div[@class='p-name p-name-type-2']"):
                continue
            sku_ids = x.xpath("./@data-sku")
            if not sku_ids:
                # No sku id on this entry: skip it instead of raising IndexError.
                continue
            sku_id = str(sku_ids[0])
            shop_titles = x.xpath(".//span[@class='J_im_icon']/a/@title")
            prices = x.xpath(".//div[@class='p-price']//i//text()")
            raw_name = ''.join(x.xpath(".//div[@class='p-name p-name-type-2']//em//text()"))
            dataList.append(
                {
                    'is_self_operate_div': '',
                    'item_url': 'https://item.jd.com/' + sku_id + '.html',
                    'sku_id': sku_id,
                    'shop_name': shop_titles[0] if shop_titles else '',
                    'sku_name': raw_name.replace("'", '‘').strip(),
                    # A missing price no longer aborts the whole run.
                    'sku_price': prices[0] if prices else '',
                }
            )
        # Escape through the connection so backslashes emitted by json.dumps
        # (e.g. \" inside names) survive the SQL string literal intact.
        payload = conn.escape_string(json.dumps(dataList, ensure_ascii=False))
        insert_sql = f"""
        INSERT INTO DMP_SEARCH_DATA(id,result,mallId)
        VALUES ({row['id']},'{payload}','{row['mallId']}')
        """
        mysql_zdzs_proxy(insert_sql)
        print("保存成功")
    return
if __name__ == "__main__":
    option = ChromeOptions()
    # Raw string: the backslashes in the Windows path are literal characters.
    option.binary_location = r'D:\ChromeMaYi\guge\chrome.exe'
    # Headless mode must stay off: with it on, images cannot be recognized.
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Must be added before the driver is created, otherwise it has no effect.
    option.add_argument("--disable-blink-features=AutomationControlled")
    driver = webdriver.Chrome(options=option)
    try:
        driver.set_window_size(700, 600)
        # Cycle through the accounts forever; get_cookie() ends the process
        # itself once there is nothing left to scrape.
        while True:
            for account in config.account_list:
                username = account['username']
                # Login password.
                password = account['password']
                print(f"切换账号:{username}")
                get_cookie(username=username,password=password,driver=driver)
    finally:
        # Always release the browser, including on errors and SystemExit.
        driver.quit()
\ No newline at end of file
"""
【京东搜索采集自动验证版Windows10部署文档】https://www.tapd.cn/36769433/prong/stories/view/1136769433001003855
"""
import time
from urllib import request
import cv2
import numpy as np
import pyautogui
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ChromeOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pymysql
from lxml import etree
# 登陆账号
from selenium.webdriver.common.by import By
import openpyxl
import time
import random
import json
from openpyxl import Workbook
from PIL import Image
import re
from bs4 import BeautifulSoup
import config
# Connection settings for the zdzs_proxy MySQL database.
# NOTE(review): credentials are hard-coded in source; consider moving them to config.
mysql_zdzs_proxy_host = '59.110.219.171'
mysql_zdzs_proxy_user = 'zgcindex'
mysql_zdzs_proxy_password = 'zgcprice2019'
mysql_zdzs_proxy_database = 'zdzs_proxy'
# Extra offset value (depends on the screen resolution of the machine).
offset_increase = 0

# Block until the database is reachable, retrying every 5 seconds.
conn = None
while conn is None:
    try:
        conn = pymysql.connect(
            host=mysql_zdzs_proxy_host,
            user=mysql_zdzs_proxy_user,
            password=mysql_zdzs_proxy_password,
            database=mysql_zdzs_proxy_database,
            charset="utf8mb4",
            cursorclass=pymysql.cursors.DictCursor,
            autocommit=True,
        )
    except Exception as e:
        print(e)
        time.sleep(5)
# Single shared cursor used by mysql_zdzs_proxy().
cur = conn.cursor()
def mysql_zdzs_proxy(sql=None, args=None):
    """Run *sql* on the shared module-level cursor and return all fetched rows.

    :param sql: SQL statement to execute (it is echoed to stdout first).
    :param args: optional parameters for a parameterized query; when None the
        statement is executed verbatim, exactly as before.
    :return: the rows from ``cur.fetchall()``, or ``[]`` if execution failed.
    """
    print(sql)
    try:
        cur.execute(sql, args)
        return cur.fetchall()
    except pymysql.err.IntegrityError as e:
        # Constraint violations (e.g. duplicate key) are tolerated, but reported.
        print(f"SQL integrity error: {e}")
        return []
    except Exception as e:
        # Best-effort helper: never raise, but do not hide the failure either.
        print(f"SQL execution failed: {e}")
        return []
def get_cookie(username=None,password=None,driver=None):
    """Log in to JD with one account and scrape this worker's pending search URLs.

    Pending rows are read from DMP_SEARCH_SPIDER (ids not yet present in
    DMP_SEARCH_DATA); only ids whose last digit is "5" are handled here. For
    each URL the product list is parsed and stored as JSON in DMP_SEARCH_DATA.

    :param username: JD login name.
    :param password: JD login password.
    :param driver: Selenium WebDriver used for all page loads.
    :return: None. Returns early (so the caller can switch account) when JD
        shows the home page or the login page instead of search results.
        Quits the driver and exits the process when nothing is pending.
    """
    home_title = '京东(JD.COM)-正品低价、品质保障、配送及时、轻松购物!'
    driver.get('https://passport.jd.com/uc/login')
    driver.find_element(by=By.ID, value='loginname').send_keys(username)
    driver.find_element(by=By.ID, value='nloginpwd').send_keys(password)
    time.sleep(0.5)
    # Submit the login form.
    driver.find_element(by=By.ID, value='loginsubmit').click()
    # Poll until the home page title appears, i.e. the login went through.
    # NOTE(review): the original comment says more than 100 polls should count
    # as a failure, but no limit is enforced on `count`.
    count = 0
    while driver.title != home_title:
        count = count + 1
        print("需要登录")
        time.sleep(1)
    query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '84' and data_batch ='202404180000000004' limit 30000"
    query_spider = mysql_zdzs_proxy(query_spider_sql)
    # Shard the work: this worker only takes ids ending in "5".
    query_spider_list = [row for row in query_spider if str(row['id'])[-1] == "5"]
    index = 0
    len_ = len(query_spider_list)
    print(f"len={len_},index={index}")
    if len_ == 0:
        print("查询不到需要搜索的链接,结束程序。")
        driver.quit()
        exit()
    for row in query_spider_list:
        index += 1
        print(f"len={len_},index={index}")
        url = row['url']
        print(url)
        driver.get(url)
        if driver.title == home_title:
            print(url)
            print("跳转首页了,结束,切换账号")
            return
        text = driver.page_source
        # Wait while a verification / busy / loading page is being shown.
        while (('快速验证' in text and '验证一下,购物无忧' in text)
               or '前方拥挤,请刷新重试' in text
               or 'class="title">加载中...</span></div></div><!----></div>' in text):
            print("需要验证,睡眠1秒")
            time.sleep(1)
            text = driver.page_source
        # Redirected to the login page: this account is no longer usable.
        if '<title>京东-欢迎登录</title>' in text:
            print("需要登录,结束,切换账号")
            return
        if driver.title == home_title:
            print("验证后还是首页,结束,切换账号")
            return
        Response = etree.HTML(text)
        dataList = []
        for x in Response.xpath("//div[@id='J_goodsList']/ul/li"):
            if not x.xpath(".//div[@class='p-name p-name-type-2']"):
                continue
            sku_ids = x.xpath("./@data-sku")
            if not sku_ids:
                # No sku id on this entry: skip it instead of raising IndexError.
                continue
            sku_id = str(sku_ids[0])
            shop_titles = x.xpath(".//span[@class='J_im_icon']/a/@title")
            prices = x.xpath(".//div[@class='p-price']//i//text()")
            raw_name = ''.join(x.xpath(".//div[@class='p-name p-name-type-2']//em//text()"))
            dataList.append(
                {
                    'is_self_operate_div': '',
                    'item_url': 'https://item.jd.com/' + sku_id + '.html',
                    'sku_id': sku_id,
                    'shop_name': shop_titles[0] if shop_titles else '',
                    'sku_name': raw_name.replace("'", '‘').strip(),
                    # A missing price no longer aborts the whole run.
                    'sku_price': prices[0] if prices else '',
                }
            )
        # Escape through the connection so backslashes emitted by json.dumps
        # (e.g. \" inside names) survive the SQL string literal intact.
        payload = conn.escape_string(json.dumps(dataList, ensure_ascii=False))
        insert_sql = f"""
        INSERT INTO DMP_SEARCH_DATA(id,result,mallId)
        VALUES ({row['id']},'{payload}','{row['mallId']}')
        """
        mysql_zdzs_proxy(insert_sql)
        print("保存成功")
    return
if __name__ == "__main__":
    option = ChromeOptions()
    # Raw string: the backslashes in the Windows path are literal characters.
    option.binary_location = r'D:\ChromeMaYi\guge\chrome.exe'
    # Headless mode must stay off: with it on, images cannot be recognized.
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Must be added before the driver is created, otherwise it has no effect.
    option.add_argument("--disable-blink-features=AutomationControlled")
    driver = webdriver.Chrome(options=option)
    try:
        driver.set_window_size(700, 600)
        # Cycle through the accounts forever; get_cookie() ends the process
        # itself once there is nothing left to scrape.
        while True:
            for account in config.account_list:
                username = account['username']
                # Login password.
                password = account['password']
                print(f"切换账号:{username}")
                get_cookie(username=username,password=password,driver=driver)
    finally:
        # Always release the browser, including on errors and SystemExit.
        driver.quit()
\ No newline at end of file
"""
【京东搜索采集自动验证版Windows10部署文档】https://www.tapd.cn/36769433/prong/stories/view/1136769433001003855
"""
import time
from urllib import request
import cv2
import numpy as np
import pyautogui
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ChromeOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pymysql
from lxml import etree
# 登陆账号
from selenium.webdriver.common.by import By
import openpyxl
import time
import random
import json
from openpyxl import Workbook
from PIL import Image
import re
from bs4 import BeautifulSoup
import config
# Connection settings for the zdzs_proxy MySQL database.
# NOTE(review): credentials are hard-coded in source; consider moving them to config.
mysql_zdzs_proxy_host = '59.110.219.171'
mysql_zdzs_proxy_user = 'zgcindex'
mysql_zdzs_proxy_password = 'zgcprice2019'
mysql_zdzs_proxy_database = 'zdzs_proxy'
# Extra offset value (depends on the screen resolution of the machine).
offset_increase = 0

# Block until the database is reachable, retrying every 5 seconds.
conn = None
while conn is None:
    try:
        conn = pymysql.connect(
            host=mysql_zdzs_proxy_host,
            user=mysql_zdzs_proxy_user,
            password=mysql_zdzs_proxy_password,
            database=mysql_zdzs_proxy_database,
            charset="utf8mb4",
            cursorclass=pymysql.cursors.DictCursor,
            autocommit=True,
        )
    except Exception as e:
        print(e)
        time.sleep(5)
# Single shared cursor used by mysql_zdzs_proxy().
cur = conn.cursor()
def mysql_zdzs_proxy(sql=None, args=None):
    """Run *sql* on the shared module-level cursor and return all fetched rows.

    :param sql: SQL statement to execute (it is echoed to stdout first).
    :param args: optional parameters for a parameterized query; when None the
        statement is executed verbatim, exactly as before.
    :return: the rows from ``cur.fetchall()``, or ``[]`` if execution failed.
    """
    print(sql)
    try:
        cur.execute(sql, args)
        return cur.fetchall()
    except pymysql.err.IntegrityError as e:
        # Constraint violations (e.g. duplicate key) are tolerated, but reported.
        print(f"SQL integrity error: {e}")
        return []
    except Exception as e:
        # Best-effort helper: never raise, but do not hide the failure either.
        print(f"SQL execution failed: {e}")
        return []
def get_cookie(username=None,password=None,driver=None):
    """Log in to JD with one account and scrape this worker's pending search URLs.

    Pending rows are read from DMP_SEARCH_SPIDER (ids not yet present in
    DMP_SEARCH_DATA); only ids whose last digit is "6" are handled here. For
    each URL the product list is parsed and stored as JSON in DMP_SEARCH_DATA.

    :param username: JD login name.
    :param password: JD login password.
    :param driver: Selenium WebDriver used for all page loads.
    :return: None. Returns early (so the caller can switch account) when JD
        shows the home page or the login page instead of search results.
        Quits the driver and exits the process when nothing is pending.
    """
    home_title = '京东(JD.COM)-正品低价、品质保障、配送及时、轻松购物!'
    driver.get('https://passport.jd.com/uc/login')
    driver.find_element(by=By.ID, value='loginname').send_keys(username)
    driver.find_element(by=By.ID, value='nloginpwd').send_keys(password)
    time.sleep(0.5)
    # Submit the login form.
    driver.find_element(by=By.ID, value='loginsubmit').click()
    # Poll until the home page title appears, i.e. the login went through.
    # NOTE(review): the original comment says more than 100 polls should count
    # as a failure, but no limit is enforced on `count`.
    count = 0
    while driver.title != home_title:
        count = count + 1
        print("需要登录")
        time.sleep(1)
    query_spider_sql = f"SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '84' and data_batch ='202404180000000004' limit 30000"
    query_spider = mysql_zdzs_proxy(query_spider_sql)
    # Shard the work: this worker only takes ids ending in "6".
    query_spider_list = [row for row in query_spider if str(row['id'])[-1] == "6"]
    index = 0
    len_ = len(query_spider_list)
    print(f"len={len_},index={index}")
    if len_ == 0:
        print("查询不到需要搜索的链接,结束程序。")
        driver.quit()
        exit()
    for row in query_spider_list:
        index += 1
        print(f"len={len_},index={index}")
        url = row['url']
        print(url)
        driver.get(url)
        if driver.title == home_title:
            print(url)
            print("跳转首页了,结束,切换账号")
            return
        text = driver.page_source
        # Wait while a verification / busy / loading page is being shown.
        while (('快速验证' in text and '验证一下,购物无忧' in text)
               or '前方拥挤,请刷新重试' in text
               or 'class="title">加载中...</span></div></div><!----></div>' in text):
            print("需要验证,睡眠1秒")
            time.sleep(1)
            text = driver.page_source
        # Redirected to the login page: this account is no longer usable.
        if '<title>京东-欢迎登录</title>' in text:
            print("需要登录,结束,切换账号")
            return
        if driver.title == home_title:
            print("验证后还是首页,结束,切换账号")
            return
        Response = etree.HTML(text)
        dataList = []
        for x in Response.xpath("//div[@id='J_goodsList']/ul/li"):
            if not x.xpath(".//div[@class='p-name p-name-type-2']"):
                continue
            sku_ids = x.xpath("./@data-sku")
            if not sku_ids:
                # No sku id on this entry: skip it instead of raising IndexError.
                continue
            sku_id = str(sku_ids[0])
            shop_titles = x.xpath(".//span[@class='J_im_icon']/a/@title")
            prices = x.xpath(".//div[@class='p-price']//i//text()")
            raw_name = ''.join(x.xpath(".//div[@class='p-name p-name-type-2']//em//text()"))
            dataList.append(
                {
                    'is_self_operate_div': '',
                    'item_url': 'https://item.jd.com/' + sku_id + '.html',
                    'sku_id': sku_id,
                    'shop_name': shop_titles[0] if shop_titles else '',
                    'sku_name': raw_name.replace("'", '‘').strip(),
                    # A missing price no longer aborts the whole run.
                    'sku_price': prices[0] if prices else '',
                }
            )
        # Escape through the connection so backslashes emitted by json.dumps
        # (e.g. \" inside names) survive the SQL string literal intact.
        payload = conn.escape_string(json.dumps(dataList, ensure_ascii=False))
        insert_sql = f"""
        INSERT INTO DMP_SEARCH_DATA(id,result,mallId)
        VALUES ({row['id']},'{payload}','{row['mallId']}')
        """
        mysql_zdzs_proxy(insert_sql)
        print("保存成功")
    return
if __name__ == "__main__":
    option = ChromeOptions()
    # Raw string: the backslashes in the Windows path are literal characters.
    option.binary_location = r'D:\ChromeMaYi\guge\chrome.exe'
    # Headless mode must stay off: with it on, images cannot be recognized.
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Must be added before the driver is created, otherwise it has no effect.
    option.add_argument("--disable-blink-features=AutomationControlled")
    driver = webdriver.Chrome(options=option)
    try:
        driver.set_window_size(700, 600)
        # Cycle through the accounts forever; get_cookie() ends the process
        # itself once there is nothing left to scrape.
        while True:
            for account in config.account_list:
                username = account['username']
                # Login password.
                password = account['password']
                print(f"切换账号:{username}")
                get_cookie(username=username,password=password,driver=driver)
    finally:
        # Always release the browser, including on errors and SystemExit.
        driver.quit()
\ No newline at end of file
"""
【京东搜索采集自动验证版Windows10部署文档】https://www.tapd.cn/36769433/prong/stories/view/1136769433001003855
"""
import time
from urllib import request
import cv2
import numpy as np
import pyautogui
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ChromeOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pymysql
from lxml import etree
# 登陆账号
from selenium.webdriver.common.by import By
import openpyxl
import time
import random
import json
from openpyxl import Workbook
from PIL import Image
import re
from bs4 import BeautifulSoup
import config
# Connection settings for the zdzs_proxy MySQL database.
# NOTE(review): credentials are hard-coded in source control; consider moving
# them into config or environment variables.
mysql_zdzs_proxy_host = '59.110.219.171'
mysql_zdzs_proxy_user = 'zgcindex'
mysql_zdzs_proxy_password = 'zgcprice2019'
mysql_zdzs_proxy_database = 'zdzs_proxy'
# Extra offset value (adjust to the machine's screen resolution)
offset_increase = 0
# Keep retrying every 5 seconds until the database connection is established.
# Only database errors are retried; any other exception (e.g. a wrong
# argument) is a programming error and must surface instead of looping forever.
while True:
    try:
        conn = pymysql.connect(
            host=mysql_zdzs_proxy_host,
            user=mysql_zdzs_proxy_user,
            password=mysql_zdzs_proxy_password,
            database=mysql_zdzs_proxy_database,
            charset="utf8mb4",
            cursorclass=pymysql.cursors.DictCursor,
            autocommit=True
        )
        break
    except pymysql.err.MySQLError as e:
        print(e)
        time.sleep(5)
# Shared cursor used by mysql_zdzs_proxy().
cur = conn.cursor()
def mysql_zdzs_proxy(sql=None, args=None):
    """Run one SQL statement on the shared cursor and return the fetched rows.

    Args:
        sql: The SQL statement to execute.
        args: Optional parameters bound to ``%s`` placeholders in ``sql`` by
            the driver. When omitted the statement is sent as-is.

    Returns:
        Whatever ``cur.fetchall()`` returns for the statement, or an empty
        list when the statement failed for any reason.
    """
    print(sql)
    try:
        # Re-establish the connection if it dropped while the scraper was
        # busy elsewhere.
        conn.ping(reconnect=True)
        cur.execute(sql, args)
        return cur.fetchall()
    except Exception as e:
        # Best effort: callers rely on getting [] back instead of an
        # exception, but the failure is reported rather than hidden.
        print(f"SQL执行失败: {e!r}")
        return []
def get_cookie(username=None, password=None, driver=None, id_suffixes=("7",)):
    """Log in to JD with one account and scrape the pending search pages.

    After logging in, the pending rows of DMP_SEARCH_SPIDER are fetched, each
    row's URL is opened, the goods list is parsed and the result is stored as
    JSON in DMP_SEARCH_DATA.

    Args:
        username: JD account name typed into the login form.
        password: JD account password typed into the login form.
        driver: Selenium WebDriver used for all page loads.
        id_suffixes: Only rows whose id ends with one of these suffixes are
            processed by this script instance.

    Returns:
        None. Returns early (so the caller can switch to the next account)
        when the login does not complete, or when a search page redirects to
        the home page or to the login page.

    Raises:
        SystemExit: When there is no pending row left; the driver is quit
            first.
    """
    home_title = '京东(JD.COM)-正品低价、品质保障、配送及时、轻松购物!'
    login_marker = '<title>京东-欢迎登录</title>'
    max_login_waits = 100

    def needs_verification(page):
        # True while the page still shows a verification / busy / loading state.
        return (
            ('快速验证' in page and '验证一下,购物无忧' in page)
            or '前方拥挤,请刷新重试' in page
            or 'class="title">加载中...</span></div></div><!----></div>' in page
        )

    def first(nodes, default=''):
        # First XPath hit, or a default when nothing matched.
        return nodes[0] if nodes else default

    driver.get('https://passport.jd.com/uc/login')
    driver.find_element(by=By.ID, value='loginname').send_keys(username)
    driver.find_element(by=By.ID, value='nloginpwd').send_keys(password)
    time.sleep(0.5)
    # Submit the login form
    driver.find_element(by=By.ID, value='loginsubmit').click()

    # Poll once per second until the home page shows up; after more than
    # max_login_waits polls the login counts as failed.
    count = 0
    while driver.title != home_title:
        count += 1
        if count > max_login_waits:
            print("登录超时,结束,切换账号")
            return
        print("需要登录")
        time.sleep(1)

    query_spider_sql = (
        "SELECT id,url,mallId FROM DMP_SEARCH_SPIDER "
        "WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) "
        "and mallId = 'DS-JD' and project_id = '84' "
        "and data_batch ='202404180000000004' limit 30000"
    )
    # Keep only the rows belonging to this instance's id shard.
    suffixes = tuple(id_suffixes)
    query_spider_list = [
        row for row in mysql_zdzs_proxy(query_spider_sql)
        if str(row['id']).endswith(suffixes)
    ]
    index = 0
    len_ = len(query_spider_list)
    print(f"len={len_},index={index}")
    if len_ == 0:
        print("查询不到需要搜索的链接,结束程序。")
        driver.quit()
        # SystemExit instead of the site-provided exit(), which does not
        # exist in frozen/embedded interpreters.
        raise SystemExit(0)

    for index, row in enumerate(query_spider_list, start=1):
        print(f"len={len_},index={index}")
        url = row['url']
        print(url)
        driver.get(url)
        if driver.title == home_title:
            print(url)
            print("跳转首页了,结束,切换账号")
            return

        # Wait while the page asks for verification or is still loading.
        text = driver.page_source
        while needs_verification(text) and login_marker not in text:
            print("需要验证,睡眠1秒")
            time.sleep(1)
            text = driver.page_source
        # Check whether we were redirected to the login page
        if login_marker in text:
            print("需要登录,结束,切换账号")
            return
        if driver.title == home_title:
            print("验证后还是首页,结束,切换账号")
            return

        dataList = []
        for item in etree.HTML(text).xpath("//div[@id='J_goodsList']/ul/li"):
            if not item.xpath(".//div[@class='p-name p-name-type-2']"):
                continue
            sku_id = first(item.xpath("./@data-sku"), None)
            if sku_id is None:
                # An entry without a sku cannot be linked to a product page;
                # skip it instead of aborting the whole run.
                continue
            sku_id = str(sku_id)
            name = ''.join(
                item.xpath(".//div[@class='p-name p-name-type-2']//em//text()"))
            dataList.append(
                {
                    'is_self_operate_div': '',
                    'item_url': 'https://item.jd.com/' + sku_id + '.html',
                    'sku_id': sku_id,
                    'shop_name': first(
                        item.xpath(".//span[@class='J_im_icon']/a/@title")),
                    # Straight apostrophes are replaced by a typographic quote.
                    'sku_name': name.replace("'", '‘').strip(),
                    'sku_price': first(
                        item.xpath(".//div[@class='p-price']//i//text()")),
                }
            )

        # Let the driver render the literals: doubling single quotes alone
        # leaves the backslash escapes produced by json.dumps to be consumed
        # by MySQL, which corrupts the stored JSON.
        insert_sql = cur.mogrify(
            "INSERT INTO DMP_SEARCH_DATA(id,result,mallId) VALUES (%s,%s,%s)",
            (row['id'], json.dumps(dataList, ensure_ascii=False), row['mallId']),
        )
        mysql_zdzs_proxy(insert_sql)
        print("保存成功")
    return
if __name__ == "__main__":
    # Script entry point: start one Chrome instance and keep cycling through
    # the accounts in config.account_list, running get_cookie() for each.
    option = ChromeOptions()
    # Raw string so the backslashes of the Windows path are never read as
    # (invalid) escape sequences; the resulting value is unchanged.
    option.binary_location = r'D:\ChromeMaYi\guge\chrome.exe'
    # Headless mode must stay disabled: with it enabled the images cannot be
    # recognised.
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Options are only read while the driver is being created, so this switch
    # has to be added BEFORE webdriver.Chrome(...) to have any effect.
    option.add_argument("--disable-blink-features=AutomationControlled")
    driver = webdriver.Chrome(options=option)
    try:
        driver.set_window_size(700, 600)
        while True:
            for account in config.account_list:
                username = account['username']
                # Login password
                password = account['password']
                print(f"切换账号:{username}")
                get_cookie(username=username, password=password, driver=driver)
    finally:
        # The loop above never ends normally; make sure the browser is closed
        # when the script stops through an exception, Ctrl+C or SystemExit.
        driver.quit()
\ No newline at end of file
"""
【京东搜索采集自动验证版Windows10部署文档】https://www.tapd.cn/36769433/prong/stories/view/1136769433001003855
"""
import time
from urllib import request
import cv2
import numpy as np
import pyautogui
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ChromeOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pymysql
from lxml import etree
# 登陆账号
from selenium.webdriver.common.by import By
import openpyxl
import time
import random
import json
from openpyxl import Workbook
from PIL import Image
import re
from bs4 import BeautifulSoup
import config
# Connection settings for the zdzs_proxy MySQL database.
# NOTE(review): credentials are hard-coded in source control; consider moving
# them into config or environment variables.
mysql_zdzs_proxy_host = '59.110.219.171'
mysql_zdzs_proxy_user = 'zgcindex'
mysql_zdzs_proxy_password = 'zgcprice2019'
mysql_zdzs_proxy_database = 'zdzs_proxy'
# Extra offset value (adjust to the machine's screen resolution)
offset_increase = 0
# Keep retrying every 5 seconds until the database connection is established.
# Only database errors are retried; any other exception (e.g. a wrong
# argument) is a programming error and must surface instead of looping forever.
while True:
    try:
        conn = pymysql.connect(
            host=mysql_zdzs_proxy_host,
            user=mysql_zdzs_proxy_user,
            password=mysql_zdzs_proxy_password,
            database=mysql_zdzs_proxy_database,
            charset="utf8mb4",
            cursorclass=pymysql.cursors.DictCursor,
            autocommit=True
        )
        break
    except pymysql.err.MySQLError as e:
        print(e)
        time.sleep(5)
# Shared cursor used by mysql_zdzs_proxy().
cur = conn.cursor()
def mysql_zdzs_proxy(sql=None, args=None):
    """Run one SQL statement on the shared cursor and return the fetched rows.

    Args:
        sql: The SQL statement to execute.
        args: Optional parameters bound to ``%s`` placeholders in ``sql`` by
            the driver. When omitted the statement is sent as-is.

    Returns:
        Whatever ``cur.fetchall()`` returns for the statement, or an empty
        list when the statement failed for any reason.
    """
    print(sql)
    try:
        # Re-establish the connection if it dropped while the scraper was
        # busy elsewhere.
        conn.ping(reconnect=True)
        cur.execute(sql, args)
        return cur.fetchall()
    except Exception as e:
        # Best effort: callers rely on getting [] back instead of an
        # exception, but the failure is reported rather than hidden.
        print(f"SQL执行失败: {e!r}")
        return []
def get_cookie(username=None, password=None, driver=None, id_suffixes=("8",)):
    """Log in to JD with one account and scrape the pending search pages.

    After logging in, the pending rows of DMP_SEARCH_SPIDER are fetched, each
    row's URL is opened, the goods list is parsed and the result is stored as
    JSON in DMP_SEARCH_DATA.

    Args:
        username: JD account name typed into the login form.
        password: JD account password typed into the login form.
        driver: Selenium WebDriver used for all page loads.
        id_suffixes: Only rows whose id ends with one of these suffixes are
            processed by this script instance.

    Returns:
        None. Returns early (so the caller can switch to the next account)
        when the login does not complete, or when a search page redirects to
        the home page or to the login page.

    Raises:
        SystemExit: When there is no pending row left; the driver is quit
            first.
    """
    home_title = '京东(JD.COM)-正品低价、品质保障、配送及时、轻松购物!'
    login_marker = '<title>京东-欢迎登录</title>'
    max_login_waits = 100

    def needs_verification(page):
        # True while the page still shows a verification / busy / loading state.
        return (
            ('快速验证' in page and '验证一下,购物无忧' in page)
            or '前方拥挤,请刷新重试' in page
            or 'class="title">加载中...</span></div></div><!----></div>' in page
        )

    def first(nodes, default=''):
        # First XPath hit, or a default when nothing matched.
        return nodes[0] if nodes else default

    driver.get('https://passport.jd.com/uc/login')
    driver.find_element(by=By.ID, value='loginname').send_keys(username)
    driver.find_element(by=By.ID, value='nloginpwd').send_keys(password)
    time.sleep(0.5)
    # Submit the login form
    driver.find_element(by=By.ID, value='loginsubmit').click()

    # Poll once per second until the home page shows up; after more than
    # max_login_waits polls the login counts as failed.
    count = 0
    while driver.title != home_title:
        count += 1
        if count > max_login_waits:
            print("登录超时,结束,切换账号")
            return
        print("需要登录")
        time.sleep(1)

    query_spider_sql = (
        "SELECT id,url,mallId FROM DMP_SEARCH_SPIDER "
        "WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) "
        "and mallId = 'DS-JD' and project_id = '84' "
        "and data_batch ='202404180000000004' limit 30000"
    )
    # Keep only the rows belonging to this instance's id shard.
    suffixes = tuple(id_suffixes)
    query_spider_list = [
        row for row in mysql_zdzs_proxy(query_spider_sql)
        if str(row['id']).endswith(suffixes)
    ]
    index = 0
    len_ = len(query_spider_list)
    print(f"len={len_},index={index}")
    if len_ == 0:
        print("查询不到需要搜索的链接,结束程序。")
        driver.quit()
        # SystemExit instead of the site-provided exit(), which does not
        # exist in frozen/embedded interpreters.
        raise SystemExit(0)

    for index, row in enumerate(query_spider_list, start=1):
        print(f"len={len_},index={index}")
        url = row['url']
        print(url)
        driver.get(url)
        if driver.title == home_title:
            print(url)
            print("跳转首页了,结束,切换账号")
            return

        # Wait while the page asks for verification or is still loading.
        text = driver.page_source
        while needs_verification(text) and login_marker not in text:
            print("需要验证,睡眠1秒")
            time.sleep(1)
            text = driver.page_source
        # Check whether we were redirected to the login page
        if login_marker in text:
            print("需要登录,结束,切换账号")
            return
        if driver.title == home_title:
            print("验证后还是首页,结束,切换账号")
            return

        dataList = []
        for item in etree.HTML(text).xpath("//div[@id='J_goodsList']/ul/li"):
            if not item.xpath(".//div[@class='p-name p-name-type-2']"):
                continue
            sku_id = first(item.xpath("./@data-sku"), None)
            if sku_id is None:
                # An entry without a sku cannot be linked to a product page;
                # skip it instead of aborting the whole run.
                continue
            sku_id = str(sku_id)
            name = ''.join(
                item.xpath(".//div[@class='p-name p-name-type-2']//em//text()"))
            dataList.append(
                {
                    'is_self_operate_div': '',
                    'item_url': 'https://item.jd.com/' + sku_id + '.html',
                    'sku_id': sku_id,
                    'shop_name': first(
                        item.xpath(".//span[@class='J_im_icon']/a/@title")),
                    # Straight apostrophes are replaced by a typographic quote.
                    'sku_name': name.replace("'", '‘').strip(),
                    'sku_price': first(
                        item.xpath(".//div[@class='p-price']//i//text()")),
                }
            )

        # Let the driver render the literals: doubling single quotes alone
        # leaves the backslash escapes produced by json.dumps to be consumed
        # by MySQL, which corrupts the stored JSON.
        insert_sql = cur.mogrify(
            "INSERT INTO DMP_SEARCH_DATA(id,result,mallId) VALUES (%s,%s,%s)",
            (row['id'], json.dumps(dataList, ensure_ascii=False), row['mallId']),
        )
        mysql_zdzs_proxy(insert_sql)
        print("保存成功")
    return
if __name__ == "__main__":
    # Script entry point: start one Chrome instance and keep cycling through
    # the accounts in config.account_list, running get_cookie() for each.
    option = ChromeOptions()
    # Raw string so the backslashes of the Windows path are never read as
    # (invalid) escape sequences; the resulting value is unchanged.
    option.binary_location = r'D:\ChromeMaYi\guge\chrome.exe'
    # Headless mode must stay disabled: with it enabled the images cannot be
    # recognised.
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Options are only read while the driver is being created, so this switch
    # has to be added BEFORE webdriver.Chrome(...) to have any effect.
    option.add_argument("--disable-blink-features=AutomationControlled")
    driver = webdriver.Chrome(options=option)
    try:
        driver.set_window_size(700, 600)
        while True:
            for account in config.account_list:
                username = account['username']
                # Login password
                password = account['password']
                print(f"切换账号:{username}")
                get_cookie(username=username, password=password, driver=driver)
    finally:
        # The loop above never ends normally; make sure the browser is closed
        # when the script stops through an exception, Ctrl+C or SystemExit.
        driver.quit()
\ No newline at end of file
"""
【京东搜索采集自动验证版Windows10部署文档】https://www.tapd.cn/36769433/prong/stories/view/1136769433001003855
"""
import time
from urllib import request
import cv2
import numpy as np
import pyautogui
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ChromeOptions
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pymysql
from lxml import etree
# 登陆账号
from selenium.webdriver.common.by import By
import openpyxl
import time
import random
import json
from openpyxl import Workbook
from PIL import Image
import re
from bs4 import BeautifulSoup
import config
# Connection settings for the zdzs_proxy MySQL database.
# NOTE(review): credentials are hard-coded in source control; consider moving
# them into config or environment variables.
mysql_zdzs_proxy_host = '59.110.219.171'
mysql_zdzs_proxy_user = 'zgcindex'
mysql_zdzs_proxy_password = 'zgcprice2019'
mysql_zdzs_proxy_database = 'zdzs_proxy'
# Extra offset value (adjust to the machine's screen resolution)
offset_increase = 0
# Keep retrying every 5 seconds until the database connection is established.
# Only database errors are retried; any other exception (e.g. a wrong
# argument) is a programming error and must surface instead of looping forever.
while True:
    try:
        conn = pymysql.connect(
            host=mysql_zdzs_proxy_host,
            user=mysql_zdzs_proxy_user,
            password=mysql_zdzs_proxy_password,
            database=mysql_zdzs_proxy_database,
            charset="utf8mb4",
            cursorclass=pymysql.cursors.DictCursor,
            autocommit=True
        )
        break
    except pymysql.err.MySQLError as e:
        print(e)
        time.sleep(5)
# Shared cursor used by mysql_zdzs_proxy().
cur = conn.cursor()
def mysql_zdzs_proxy(sql=None, args=None):
    """Run one SQL statement on the shared cursor and return the fetched rows.

    Args:
        sql: The SQL statement to execute.
        args: Optional parameters bound to ``%s`` placeholders in ``sql`` by
            the driver. When omitted the statement is sent as-is.

    Returns:
        Whatever ``cur.fetchall()`` returns for the statement, or an empty
        list when the statement failed for any reason.
    """
    print(sql)
    try:
        # Re-establish the connection if it dropped while the scraper was
        # busy elsewhere.
        conn.ping(reconnect=True)
        cur.execute(sql, args)
        return cur.fetchall()
    except Exception as e:
        # Best effort: callers rely on getting [] back instead of an
        # exception, but the failure is reported rather than hidden.
        print(f"SQL执行失败: {e!r}")
        return []
def get_cookie(username=None, password=None, driver=None, id_suffixes=("9",)):
    """Log in to JD with one account and scrape the pending search pages.

    After logging in, the pending rows of DMP_SEARCH_SPIDER are fetched, each
    row's URL is opened, the goods list is parsed and the result is stored as
    JSON in DMP_SEARCH_DATA.

    Args:
        username: JD account name typed into the login form.
        password: JD account password typed into the login form.
        driver: Selenium WebDriver used for all page loads.
        id_suffixes: Only rows whose id ends with one of these suffixes are
            processed by this script instance.

    Returns:
        None. Returns early (so the caller can switch to the next account)
        when the login does not complete, or when a search page redirects to
        the home page or to the login page.

    Raises:
        SystemExit: When there is no pending row left; the driver is quit
            first.
    """
    home_title = '京东(JD.COM)-正品低价、品质保障、配送及时、轻松购物!'
    login_marker = '<title>京东-欢迎登录</title>'
    max_login_waits = 100

    def needs_verification(page):
        # True while the page still shows a verification / busy / loading state.
        return (
            ('快速验证' in page and '验证一下,购物无忧' in page)
            or '前方拥挤,请刷新重试' in page
            or 'class="title">加载中...</span></div></div><!----></div>' in page
        )

    def first(nodes, default=''):
        # First XPath hit, or a default when nothing matched.
        return nodes[0] if nodes else default

    driver.get('https://passport.jd.com/uc/login')
    driver.find_element(by=By.ID, value='loginname').send_keys(username)
    driver.find_element(by=By.ID, value='nloginpwd').send_keys(password)
    time.sleep(0.5)
    # Submit the login form
    driver.find_element(by=By.ID, value='loginsubmit').click()

    # Poll once per second until the home page shows up; after more than
    # max_login_waits polls the login counts as failed.
    count = 0
    while driver.title != home_title:
        count += 1
        if count > max_login_waits:
            print("登录超时,结束,切换账号")
            return
        print("需要登录")
        time.sleep(1)

    query_spider_sql = (
        "SELECT id,url,mallId FROM DMP_SEARCH_SPIDER "
        "WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) "
        "and mallId = 'DS-JD' and project_id = '84' "
        "and data_batch ='202404180000000004' limit 30000"
    )
    # Keep only the rows belonging to this instance's id shard.
    suffixes = tuple(id_suffixes)
    query_spider_list = [
        row for row in mysql_zdzs_proxy(query_spider_sql)
        if str(row['id']).endswith(suffixes)
    ]
    index = 0
    len_ = len(query_spider_list)
    print(f"len={len_},index={index}")
    if len_ == 0:
        print("查询不到需要搜索的链接,结束程序。")
        driver.quit()
        # SystemExit instead of the site-provided exit(), which does not
        # exist in frozen/embedded interpreters.
        raise SystemExit(0)

    for index, row in enumerate(query_spider_list, start=1):
        print(f"len={len_},index={index}")
        url = row['url']
        print(url)
        driver.get(url)
        if driver.title == home_title:
            print(url)
            print("跳转首页了,结束,切换账号")
            return

        # Wait while the page asks for verification or is still loading.
        text = driver.page_source
        while needs_verification(text) and login_marker not in text:
            print("需要验证,睡眠1秒")
            time.sleep(1)
            text = driver.page_source
        # Check whether we were redirected to the login page
        if login_marker in text:
            print("需要登录,结束,切换账号")
            return
        if driver.title == home_title:
            print("验证后还是首页,结束,切换账号")
            return

        dataList = []
        for item in etree.HTML(text).xpath("//div[@id='J_goodsList']/ul/li"):
            if not item.xpath(".//div[@class='p-name p-name-type-2']"):
                continue
            sku_id = first(item.xpath("./@data-sku"), None)
            if sku_id is None:
                # An entry without a sku cannot be linked to a product page;
                # skip it instead of aborting the whole run.
                continue
            sku_id = str(sku_id)
            name = ''.join(
                item.xpath(".//div[@class='p-name p-name-type-2']//em//text()"))
            dataList.append(
                {
                    'is_self_operate_div': '',
                    'item_url': 'https://item.jd.com/' + sku_id + '.html',
                    'sku_id': sku_id,
                    'shop_name': first(
                        item.xpath(".//span[@class='J_im_icon']/a/@title")),
                    # Straight apostrophes are replaced by a typographic quote.
                    'sku_name': name.replace("'", '‘').strip(),
                    'sku_price': first(
                        item.xpath(".//div[@class='p-price']//i//text()")),
                }
            )

        # Let the driver render the literals: doubling single quotes alone
        # leaves the backslash escapes produced by json.dumps to be consumed
        # by MySQL, which corrupts the stored JSON.
        insert_sql = cur.mogrify(
            "INSERT INTO DMP_SEARCH_DATA(id,result,mallId) VALUES (%s,%s,%s)",
            (row['id'], json.dumps(dataList, ensure_ascii=False), row['mallId']),
        )
        mysql_zdzs_proxy(insert_sql)
        print("保存成功")
    return
if __name__ == "__main__":
    # Script entry point: start one Chrome instance and keep cycling through
    # the accounts in config.account_list, running get_cookie() for each.
    option = ChromeOptions()
    # Raw string so the backslashes of the Windows path are never read as
    # (invalid) escape sequences; the resulting value is unchanged.
    option.binary_location = r'D:\ChromeMaYi\guge\chrome.exe'
    # Headless mode must stay disabled: with it enabled the images cannot be
    # recognised.
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Options are only read while the driver is being created, so this switch
    # has to be added BEFORE webdriver.Chrome(...) to have any effect.
    option.add_argument("--disable-blink-features=AutomationControlled")
    driver = webdriver.Chrome(options=option)
    try:
        driver.set_window_size(700, 600)
        while True:
            for account in config.account_list:
                username = account['username']
                # Login password
                password = account['password']
                print(f"切换账号:{username}")
                get_cookie(username=username, password=password, driver=driver)
    finally:
        # The loop above never ends normally; make sure the browser is closed
        # when the script stops through an exception, Ctrl+C or SystemExit.
        driver.quit()
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment