Commit 4b70bcb7 authored by huangiyu

初始化

parents
Pipeline #474 canceled with stages
# 这是一个示例 Python 脚本。
# 按 Shift+F10 执行或将其替换为您的代码。
# 按 双击 Shift 在所有地方搜索类、文件、工具窗口、操作和设置。
def print_hi(name):
    """Print the greeting ``Hi, <name>`` to standard output."""
    # Convenient spot for a breakpoint while debugging (Ctrl+F8 toggles one in PyCharm).
    greeting = f'Hi, {name}'
    print(greeting)
# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    print_hi('PyCharm')

# PyCharm help is available at https://www.jetbrains.com/help/pycharm/
This source diff could not be displayed because it is too large. You can view the blob instead.
import json
import os
import random
import re
import time
from urllib import request

import cv2
import numpy as np
import openpyxl
import pyautogui
import pymysql
from bs4 import BeautifulSoup
from lxml import etree
from openpyxl import Workbook
from PIL import Image
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ChromeOptions
from selenium.webdriver.common.by import By

# from utils.spider import jd
# Login account (the `username` constant defined below)
# Connection settings for the MySQL database used by this crawler.
# SECURITY: these values were previously available only as literals committed
# to the repository. Each one can now be overridden with an environment
# variable; the literals remain solely as backward-compatible defaults and
# should be rotated and removed from source control.
mysql_zdzs_proxy_host = os.environ.get('MYSQL_ZDZS_PROXY_HOST', '59.110.219.171')
mysql_zdzs_proxy_user = os.environ.get('MYSQL_ZDZS_PROXY_USER', 'zgcindex')
mysql_zdzs_proxy_password = os.environ.get('MYSQL_ZDZS_PROXY_PASSWORD', 'zgcprice2019')
mysql_zdzs_proxy_database = os.environ.get('MYSQL_ZDZS_PROXY_DATABASE', 'zdzs_proxy')

# Login account name (override with JD_USERNAME).
username = os.environ.get('JD_USERNAME', 'xxx')
# Login password (override with JD_PASSWORD).
password = os.environ.get('JD_PASSWORD', 'xxx')

# Screen position (x, y pixels from the top-left corner of the screen) of the
# slider handle. These depend on the screen size and must be adjusted per
# machine; the original author suggests reading them off a screenshot tool.
# Position used on the login page:
slide_x_position, slid_y_position = 850, 459
# Position used on the "quick verification" page:
slide_x_position_quick_verification, slid_y_position_quick_verification = 490, 512

# Extra pixels added to the computed slide distance (depends on the screen resolution).
offset_increase = 0
# Open the shared database connection at import time. Every failed attempt is
# printed and followed by a five second pause; the loop only ends once a
# connection object has been obtained.
conn = None
while conn is None:
    try:
        conn = pymysql.connect(
            host=mysql_zdzs_proxy_host,
            user=mysql_zdzs_proxy_user,
            password=mysql_zdzs_proxy_password,
            database=mysql_zdzs_proxy_database,
            charset="utf8mb4",
            cursorclass=pymysql.cursors.DictCursor,
            autocommit=True,
        )
    except Exception as connect_error:
        print(connect_error)
        time.sleep(5)

# Single cursor shared by every query in this module.
cur = conn.cursor()
def mysql_zdzs_proxy(sql=None, args=None):
    """Run one SQL statement on the shared module-level cursor ``cur``.

    The statement text is printed before it is executed. This helper is
    best-effort: it never raises, and any failure is reported on standard
    output and turned into an empty result.

    Args:
        sql: The statement to execute. When it is ``None`` or empty nothing is
            executed.
        args: Optional parameters passed to ``cur.execute`` alongside ``sql``
            so that values can be bound by the driver instead of being
            formatted into the statement text.

    Returns:
        Whatever ``cur.fetchall()`` returns on success, or ``[]`` when ``sql``
        is empty or the statement fails.
    """
    print(sql)
    if not sql:
        return []
    try:
        cur.execute(sql, args)
        return cur.fetchall()
    except Exception as e:
        # Callers rely on getting [] back instead of an exception, but the
        # failure must at least be visible in the output.
        print(f"mysql_zdzs_proxy failed: {e!r}")
        return []
def return_js():
    """Return the text of ``stealth.min.js``.

    The path is relative, so the file is looked up in the current working
    directory. It is read as UTF-8 so the result does not depend on the
    platform's locale encoding, and the handle is closed before returning.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open('stealth.min.js', 'r', encoding='utf-8') as script_file:
        return script_file.read()
class SlideUtils:
    """Helpers for slider captchas: locate the target position and drag the slider."""

    @staticmethod
    def find_pic(background, slide):
        """Return the x coordinate used as the slider's target position.

        The slider image is template-matched (``cv2.TM_CCOEFF_NORMED``)
        against the grayscale background.

        Args:
            background: Path of the background image file.
            slide: Path of the slider-piece image file.

        Returns:
            The x component of element 2 of ``cv2.minMaxLoc`` (the location of
            the minimum score), in pixels of the background image.

        Raises:
            ValueError: If either image file cannot be read.
        """
        background_rgb = cv2.imread(background)
        if background_rgb is None:
            # cv2.imread reports failure by returning None, not by raising.
            raise ValueError(f"cannot read background image: {background}")
        background_gray = cv2.cvtColor(background_rgb, cv2.COLOR_BGR2GRAY)

        # Flag 0 loads the slider directly as a grayscale image.
        slide_gray = cv2.imread(slide, 0)
        if slide_gray is None:
            raise ValueError(f"cannot read slider image: {slide}")

        res = cv2.matchTemplate(background_gray, slide_gray, cv2.TM_CCOEFF_NORMED)
        _min_val, _max_val, min_loc, _max_loc = cv2.minMaxLoc(res)
        # NOTE(review): for TM_CCOEFF_NORMED the highest score is normally the
        # best match; the original code reads the minimum location, which is
        # kept unchanged here - confirm this is intended for these captchas.
        return min_loc[0]

    @staticmethod
    def _drag(x, y, offset, offset_increase):
        """Press the mouse at (x, y), drag right along a jittered track, release.

        The two intermediate waypoints are random fractions of ``offset``
        (15/20..23/20, then 17/20..21/20) with small vertical jitter; only the
        final position ``x + offset + offset_increase`` includes
        ``offset_increase``.
        """
        target_x = x + offset + offset_increase
        pyautogui.moveTo(x, y, duration=0.1)
        pyautogui.mouseDown()
        try:
            y += random.randint(9, 19)
            pyautogui.moveTo(x + int(offset * random.randint(15, 23) / 20), y, duration=0.28)
            y += random.randint(-9, 0)
            pyautogui.moveTo(
                x + int(offset * random.randint(17, 21) / 20),
                y,
                duration=random.randint(20, 31) / 100,
            )
            y += random.randint(0, 8)
            pyautogui.moveTo(target_x, y, duration=0.3)
        finally:
            # Always release the button, even if a move raised part-way
            # through, so the mouse is never left pressed.
            pyautogui.mouseUp()

    @staticmethod
    def slide_by_pyautogui(x, y, offset, offset_increase):
        """Wait 0.5 s, then drag the slider with pyautogui along a custom track.

        Args:
            x: Screen x coordinate where the drag starts.
            y: Screen y coordinate where the drag starts.
            offset: Horizontal distance to drag, in screen pixels.
            offset_increase: Extra pixels added to the final position.
        """
        print(f"睡眠0.5秒后点击移动")
        time.sleep(0.5)
        SlideUtils._drag(x, y, offset, offset_increase)

    @staticmethod
    def slide_by_pyautogui2(x, y, offset, offset_increase):
        """Drag the slider with pyautogui along a custom track, without the initial pause.

        Takes the same arguments as ``slide_by_pyautogui``.
        """
        SlideUtils._drag(x, y, offset, offset_increase)
def detect_circular_distortion(image_path, circle_radius=65):
    """Score how well the central disc of an image matches the whole image.

    ORB features are computed for the grayscale image and for a copy in which
    everything outside a centred disc is blacked out; the descriptors are
    matched by brute force (Hamming distance, cross-check enabled).

    Args:
        image_path: Path of the image file to analyse.
        circle_radius: Radius in pixels of the disc centred on the image.

    Returns:
        The mean descriptor distance over all matches, or ``float('inf')``
        when there is nothing to match (no descriptors on either side, or no
        matches). The caller treats smaller values as better.

    Raises:
        ValueError: If the image file cannot be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports failure by returning None, not by raising.
        raise ValueError(f"cannot read image: {image_path}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # The disc is centred on the image: (x, y) = (width // 2, height // 2).
    circle_center = (gray.shape[1] // 2, gray.shape[0] // 2)

    # Mask that is 255 inside the disc and 0 elsewhere.
    mask = np.zeros_like(gray)
    cv2.circle(mask, circle_center, circle_radius, 255, thickness=-1)
    masked_img = cv2.bitwise_and(gray, gray, mask=mask)

    orb = cv2.ORB_create(500)
    _, des1 = orb.detectAndCompute(gray, None)
    _, des2 = orb.detectAndCompute(masked_img, None)
    if des1 is None or des2 is None:
        # No keypoints were found on one side; matching None descriptors
        # would raise, so report "no match" like the empty-matches case.
        return float('inf')

    bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
    matches = bf.match(des1, des2)
    if not matches:
        return float('inf')
    return sum(match.distance for match in matches) / len(matches)
def find_best_match_angle(background_img_path, rotating_img_path):
    """Find the rotation of the captcha image that scores best on the background.

    The captcha is rotated in 1-degree steps over ``range(-360, 0)``, pasted
    onto the centre of a copy of the background, saved as
    ``captcha_<abs(angle)>.png`` in the current working directory and scored
    with ``detect_circular_distortion``; the lowest score wins, the earliest
    angle winning ties. The 360 composite files are left on disk.

    Args:
        background_img_path: Path of the 320x200 background image.
        rotating_img_path: Path of the 130x130 rotating captcha image.

    Returns:
        A tuple ``(angle, position)`` where ``angle`` is the absolute value of
        the best rotation angle in degrees (1..360) and ``position`` is always
        ``None`` (kept for interface compatibility).

    Raises:
        ValueError: If either image does not have the required size.
    """
    # Copy/convert inside ``with`` so the underlying files are closed at once;
    # the caller overwrites these files on every new captcha.
    with Image.open(background_img_path) as opened:
        background = opened.copy()
    with Image.open(rotating_img_path) as opened:
        # The rotated image is used as its own paste mask below, which
        # requires an alpha channel.
        captcha = opened.convert('RGBA')

    # Explicit checks instead of ``assert``, which is stripped under ``-O``.
    if background.size != (320, 200):
        raise ValueError("背景图尺寸必须是 320x200")
    if captcha.size != (130, 130):
        raise ValueError("验证码图尺寸必须是 130x130")

    # Top-left corner that centres the captcha on the background.
    insert_position = (
        (background.width - captcha.width) // 2,
        (background.height - captcha.height) // 2,
    )

    best_score = None
    best_angle = None
    best_position = None
    best_file_name = None

    # One step per degree, 360 candidates in total.
    for angle in range(-360, 0, 1):
        rotated_captcha = captcha.rotate(angle, expand=False)
        new_background = background.copy()
        new_background.paste(rotated_captcha, insert_position, rotated_captcha)

        file_name = f"captcha_{abs(angle)}.png"
        new_background.save(file_name)
        distortion_level = detect_circular_distortion(file_name)
        print(f'中心区域扭曲程度估计(数值越小表示越接近未扭曲): {distortion_level}')

        # The first candidate always becomes the initial best, so best_angle
        # can never stay None when no later angle improves on it.
        if best_score is None or distortion_level < best_score:
            best_score = distortion_level
            best_angle = angle
            best_file_name = file_name

    print(best_score)
    print(best_angle)
    print(best_position)
    print(best_file_name)
    return abs(best_angle), best_position
# Marker found in the page source when JD has redirected to its login page.
_LOGIN_PAGE_TITLE = '<title>京东-欢迎登录</title>'


def _create_driver():
    """Start Chrome with automation fingerprints reduced and return the driver."""
    # Read the script first so a missing file cannot leave a browser running.
    stealth_js = return_js()

    option = ChromeOptions()
    # Raw string: the backslashes are path separators, not escape sequences.
    option.binary_location = r'D:\ChromeMaYi\guge\chrome.exe'
    # Headless mode stays off: per the original author the captcha images
    # cannot be recognised when it is enabled.
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Must be set before the browser starts; arguments added after
    # webdriver.Chrome() has been called are never applied.
    option.add_argument("--disable-blink-features=AutomationControlled")

    driver = webdriver.Chrome(options=option)
    try:
        driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": stealth_js
        })
        driver.set_window_size(1200, 700)
    except BaseException:
        # Do not leave a half-configured browser behind.
        driver.quit()
        raise
    return driver


def _download_captcha_images(background_url, slide_url):
    """Download both captcha images to fixed local files and return their paths."""
    background_img = 'background_img.png'
    slide_img = 'slide_img.png'
    request.urlretrieve(background_url, background_img)
    request.urlretrieve(slide_url, slide_img)
    return background_img, slide_img


def _scaled_slide_offset(background, background_img, slide_img):
    """Return the slide distance scaled from downloaded-image to on-page pixels."""
    x = SlideUtils.find_pic(background_img, slide_img)
    # Width of the downloaded file vs. width of the element as rendered.
    w1 = cv2.imread(background_img).shape[1]
    w2 = background.size['width']
    return x * w2 / w1


def _login(driver):
    """Submit the login form and solve slider captchas until they disappear.

    Returns True once the captcha image is no longer found (treated as a
    successful login) and False after 100 failed attempts.
    """
    driver.get('https://passport.jd.com/uc/login')
    driver.find_element(by=By.ID, value='loginname').send_keys(username)
    driver.find_element(by=By.ID, value='nloginpwd').send_keys(password)
    time.sleep(0.5)
    driver.find_element(by=By.ID, value='loginsubmit').click()
    print("睡眠0.5秒后开始滑动登录")
    time.sleep(0.5)

    count = 0
    while True:
        count = count + 1
        print(f"开始第{count}次尝试")
        if count > 100:
            print("登录京东失败!")
            return False
        try:
            background = driver.find_element(by=By.XPATH, value=r'//div/div[@class="JDJRV-bigimg"]/img')
        except NoSuchElementException:
            # No captcha image on the page is taken to mean the login succeeded.
            print("京东登录成功!")
            return True
        slide = driver.find_element(by=By.XPATH, value=r'//div/div[@class="JDJRV-smallimg"]/img')
        background_img, slide_img = _download_captcha_images(
            background.get_attribute("src"), slide.get_attribute("src"))
        x = _scaled_slide_offset(background, background_img, slide_img)
        # The start coordinates are screen pixels of the slider handle and
        # must be configured per machine.
        SlideUtils.slide_by_pyautogui(slide_x_position, slid_y_position, x, offset_increase)
        time.sleep(3)


def _needs_verification(text):
    """Return True when the page source shows a verification or overload page."""
    return ('快速验证' in text and '验证一下,购物无忧' in text) or '前方拥挤,请刷新重试' in text


def _solve_slider_verification(driver):
    """Click "quick verification" and drag the slider captcha once."""
    print("需要验证。")
    print("睡眠1秒后点击“快速验证”按钮")
    driver.find_element(by=By.CLASS_NAME, value='verifyBtn').click()
    time.sleep(2)
    background = driver.find_element(by=By.XPATH, value='//*[@id="cpc_img"]')
    background_url = background.get_attribute("src")
    slide = driver.find_element(by=By.XPATH, value='//*[@id="small_img"]')
    slide_url = slide.get_attribute("src")
    background_img, slide_img = _download_captcha_images(background_url, slide_url)
    x = _scaled_slide_offset(background, background_img, slide_img)
    SlideUtils.slide_by_pyautogui(slide_x_position_quick_verification, slid_y_position_quick_verification,
                                  x, offset_increase + 5)


def _solve_rotation_verification(driver):
    """Solve the rotating-image captcha once by dragging its slider."""
    background = driver.find_element(by=By.XPATH, value='//*[@id="img-back-div"]')
    style = background.get_attribute("style")
    # Pull the URL out of ``background-image: url("...")`` without depending
    # on the other declarations in the style attribute.
    url_match = re.search(r'url\(["\']?(.*?)["\']?\)', style)
    if url_match:
        background_url = url_match.group(1)
    else:
        background_url = style.replace('"); height: 181px;', "").replace(
            'background-image: url("', "")
    slide = driver.find_element(by=By.XPATH, value='//*[@id="img-rotate-div"]/img')
    slide_url = slide.get_attribute("src")
    background_img, slide_img = _download_captcha_images(background_url, slide_url)
    best_angle, _ = find_best_match_angle(background_img, slide_img)
    print(f"最佳匹配角度为: {best_angle} 度")
    # presumably 228 px of slider travel corresponds to a full 360-degree
    # turn - TODO confirm against the page layout.
    x = int(228 / 360 * best_angle)
    print(f"移动: {x} ")
    SlideUtils.slide_by_pyautogui2(slide_x_position_quick_verification,
                                   slid_y_position_quick_verification,
                                   x,
                                   offset_increase)


def _load_search_page(driver, url):
    """Open ``url`` and keep solving verification pages; return the final page source."""
    driver.get(url)
    time.sleep(1)
    text = driver.page_source
    while _needs_verification(text):
        driver.get(url)
        time.sleep(1)
        try:
            _solve_slider_verification(driver)
            time.sleep(3)
        except NoSuchElementException:
            print("未查找到验证图片背景图,可能是旋转图片,换一种方式获取div")
            try:
                _solve_rotation_verification(driver)
                time.sleep(3)
            except NoSuchElementException:
                print("未查找到验证图片背景图,睡眠1秒后重试。")
                driver.get(url)
            except Exception as e:
                print(e)
        except Exception as e:
            print(e)
        # Every path re-reads the page before the loop condition is re-checked.
        text = driver.page_source
    return text


def _parse_search_results(text):
    """Extract one dict per product listed in a search result page source."""
    tree = etree.HTML(text)
    data_list = []
    for item in tree.xpath("//div[@id='J_goodsList']/ul/li"):
        if not item.xpath(".//div[@class='p-name p-name-type-2']"):
            continue
        sku_attr = item.xpath("./@data-sku")
        if not sku_attr:
            # Without a SKU there is nothing to build the item URL from.
            continue
        sku_id = str(sku_attr[0])
        shop_titles = item.xpath(".//span[@class='J_im_icon']/a/@title")
        name_parts = item.xpath(".//div[@class='p-name p-name-type-2']//em//text()")
        data_list.append(
            {
                'is_self_operate_div': '',
                'item_url': 'https://item.jd.com/' + sku_id + '.html',
                'sku_id': sku_id,
                'shop_name': shop_titles[0] if shop_titles else '',
                # Straight single quotes are stored as a left curly quote,
                # as in the original output format.
                'sku_name': ''.join(name_parts).replace("'", "‘").strip(),
                'sku_price': '',
            }
        )
    return data_list


def _save_search_result(row, data_list):
    """Insert the parsed result for ``row``; return True when the insert succeeded."""
    payload = json.dumps(data_list, ensure_ascii=False)
    try:
        # Bound parameters: the JSON text comes from scraped pages and contains
        # quotes and backslashes that must not be interpreted by SQL.
        cur.execute(
            "INSERT INTO DMP_SEARCH_DATA(id,result,mallId) VALUES (%s,%s,%s)",
            (row['id'], payload, row['mallId']),
        )
    except Exception as e:
        # Best-effort, like the other queries in this file: report and go on.
        print(e)
        return False
    return True


def _cookie_header(driver):
    """Return the browser's cookies joined as ``name=value;`` pairs."""
    return ''.join(f"{item['name']}={item['value']};" for item in driver.get_cookies())


def get_cookie():
    """Log in to JD, crawl the pending search URLs and return the session cookies.

    Steps: start Chrome, log in (solving slider captchas), read up to 10000
    pending rows from ``DMP_SEARCH_SPIDER``, load each URL (solving any
    verification page), parse the product list and store it as JSON in
    ``DMP_SEARCH_DATA``.

    Returns:
        The cookies as one ``name=value;`` string, or ``None`` when the login
        failed after 100 attempts. If a page redirects to the login page
        mid-crawl, the whole flow is restarted with a fresh browser and that
        run's result is returned.
    """
    driver = _create_driver()
    try:
        if not _login(driver):
            return None

        query_spider_sql = "SELECT id,url,mallId FROM DMP_SEARCH_SPIDER WHERE ID NOT IN (SELECT ID FROM DMP_SEARCH_DATA) and mallId = 'DS-JD' and project_id = '110' order by docID desc limit 10000"
        for row in mysql_zdzs_proxy(query_spider_sql):
            url = row['url']
            print(url)
            text = _load_search_page(driver, url)
            if _LOGIN_PAGE_TITLE in text:
                # Session lost: leave the loop and log in again below.
                break
            if _save_search_result(row, _parse_search_results(text)):
                print("保存成功")
        else:
            # Loop finished without a redirect to the login page. The cookies
            # are read here, before the ``finally`` clause closes the browser.
            return _cookie_header(driver)
    finally:
        # Close the browser on every path, including failures and re-login.
        driver.quit()
    return get_cookie()
if __name__ == "__main__":
    # Run the login and crawl flow, then show the resulting cookie string.
    cookie_header = get_cookie()
    print(cookie_header)
    # Manual check of the rotation solver against already-downloaded images:
    # print(find_best_match_angle('background_img.png', 'slide_img.png'))
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment