一次滑动验证码的爬虫

一次滑动验证码的爬虫

徐静

最近有人问我关于爬虫中滑块验证码的识别问题,然后尝试了一下,主要分为以下几步:

  • 步骤一:点击按钮,弹出没有缺口的图片
  • 步骤二:获取步骤一的图片
  • 步骤三:点击滑动按钮,弹出带缺口的图片
  • 步骤四:获取带缺口的图片
  • 步骤五:对比两张图片的所有RGB像素点,得到不一样像素点的x值,即要移动的距离
  • 步骤六:模拟人的行为习惯(先匀加速拖动后匀减速拖动),把需要拖动的总距离分成一段一段小的轨迹
  • 步骤七:按照轨迹拖动,完成验证
  • 步骤八:完成登录

上代码:


# URL of the GeeTest demo page; my_scrapy() opens this same address.
base_url = "http://www.geetest.com/type"


from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from PIL import Image
import time
import numpy as np

def get_snap(driver,names):
    """Screenshot the current page to the file ``names`` and return it as an image.

    ``driver`` must provide ``save_screenshot``; the file it writes is then
    reopened with ``Image.open`` and that image object is returned.
    """
    driver.save_screenshot(names)
    return Image.open(names)

def get_image(driver,names):
    """Return the captcha picture cropped out of a full-page screenshot.

    The element with class ``geetest_canvas_img`` is located, a screenshot of
    the page is saved to ``names`` (via ``get_snap``) and the screenshot is
    cropped to the element's reported bounding box.

    Args:
        driver: Selenium WebDriver showing the captcha.
        names: file name the page screenshot is written to.

    Returns:
        The cropped image object.
    """
    # find_element(By.CLASS_NAME, ...) replaces find_element_by_class_name,
    # which no longer exists in current Selenium releases.
    img = driver.find_element(By.CLASS_NAME, 'geetest_canvas_img')
    # Fixed pause before measuring and capturing; presumably gives the
    # captcha picture time to finish rendering -- confirm.
    time.sleep(2)
    location = img.location
    size = img.size

    left = location['x']
    top = location['y']
    # NOTE(review): the crop box assumes screenshot pixels map 1:1 onto the
    # element coordinates reported by WebDriver -- confirm on scaled displays.
    box = (left, top, left + size['width'], top + size['height'])

    page_snap_obj = get_snap(driver, names)
    return page_snap_obj.crop(box)

def get_distance(image1,image2):
    """Return the x position at which the two images first differ, minus 4.

    Columns are scanned left to right, starting a quarter of the way across
    ``image2``'s width, and each column is scanned top to bottom. The first
    pixel where any of the red, green or blue channels differs by 120 or
    more between the two images ends the scan.

    Args:
        image1: image exposing ``size`` (width, height) and ``load()`` pixel
            access; its size bounds the scan.
        image2: second image with the same interface.

    Returns:
        Column index of the first differing pixel minus 4. When no pixel
        differs (or there are no columns to scan) the index of ``image1``'s
        last column minus 4 is returned.
    """
    threshold = 120
    # Constant correction subtracted from the detected column.
    # NOTE(review): presumably compensates for the slider piece's initial
    # inset -- confirm against the page layout.
    offset = 4

    width = image1.size[0]
    height = image1.size[1]
    start = int(np.ceil(image2.size[0] / 4))

    # Pixel accessors are loop-invariant; fetch them once instead of calling
    # load() twice for every pixel.
    pixels1 = image1.load()
    pixels2 = image2.load()

    for x in range(start, width):
        for y in range(height):
            rgb1 = pixels1[x, y]
            rgb2 = pixels2[x, y]
            if any(abs(rgb1[c] - rgb2[c]) >= threshold for c in range(3)):
                return x - offset

    # Same value the scan would end on, but computed from the width so an
    # empty column range no longer hits an unbound loop variable.
    return width - 1 - offset

def get_tracks(distance):
    """Split a drag distance into small per-step offsets.

    The forward movement follows constant-acceleration kinematics,
    ``s = v*t + 1/2*a*t^2`` and ``v = v_0 + a*t``, with a time step of 0.2:
    acceleration is 2 for the first 3/5 of the total distance and -3 after
    that. The total distance is ``distance + 20``, i.e. the slider
    deliberately overshoots and is then pulled back by ``back_tracks``.

    Args:
        distance: distance to the gap.

    Returns:
        A dict with ``'forward_tracks'`` (rounded per-step forward offsets)
        and ``'back_tracks'`` (fixed negative offsets that sum to -19).
    """
    distance += 20  # overshoot first; back_tracks moves the slider back
    v = 0
    t = 0.2
    forward_tracks = []

    current = 0
    mid = distance * 3 / 5
    while current < distance:
        a = 2 if current < mid else -3

        s = v * t + 0.5 * a * (t ** 2)
        if s <= 0:
            # Braking has used up the speed just short of the target. Stop
            # here; continuing would emit backward steps and let the
            # position oscillate instead of finishing.
            break
        v = v + a * t
        current += s
        forward_tracks.append(round(s))

    # Slide back towards the exact position; these offsets sum to -19.
    back_tracks = [-3, -3, -3, -2, -2, -1, -2, -1, -1, -1]

    return {'forward_tracks': forward_tracks, 'back_tracks': back_tracks}


#判断元素是否存在
#'geetest_success_radar_tip'
def isElementExist(driver,element):
    """Report whether an element with class name ``element`` can be found.

    Args:
        driver: Selenium WebDriver to search with.
        element: class name to look up.

    Returns:
        True when the lookup succeeds, False when Selenium reports that no
        such element exists. Any other error propagates to the caller.
    """
    from selenium.common.exceptions import NoSuchElementException

    try:
        # find_element(By.CLASS_NAME, ...) replaces the removed
        # find_element_by_class_name helper.
        driver.find_element(By.CLASS_NAME, element)
    except NoSuchElementException:
        # Only "not found" means False; a bare except would also hide
        # programming errors and KeyboardInterrupt behind a False result.
        return False
    return True


def my_scrapy():
    """Run one slider-captcha attempt in a fresh Chrome session.

    Opens the demo page, captures the captcha picture before and after the
    gap is shown, derives the drag distance with ``get_distance``, turns it
    into a movement track with ``get_tracks`` and drags the slider button
    along that track.

    Returns:
        True if the element with class ``geetest_success_radar_tip`` has
        non-empty text three seconds after the slider is released, otherwise
        False.

    Raises:
        Whatever Selenium raises when the browser cannot start or a page
        element cannot be located; the browser session is shut down either
        way.
    """
    # Started outside the try block: if the browser fails to launch there is
    # nothing to clean up, and the real error is not masked by a failure on
    # an unbound ``driver`` in the finally clause.
    driver = webdriver.Chrome()
    try:
        driver.implicitly_wait(3)
        driver.get("http://www.geetest.com/type")

        # Presumably selects the slider-captcha entry on the demo page --
        # confirm against the page markup.
        time.sleep(1)
        driver.find_element(By.XPATH, '//*[@id="app"]/section/div/ul/li[2]/h2').click()

        # 1. Capture the complete picture.
        time.sleep(0.5)
        driver.find_element(By.CLASS_NAME, 'geetest_wait').click()
        time.sleep(1)
        image1 = get_image(driver, 'before.png')

        # 2. Capture the picture with the gap.
        driver.find_element(By.XPATH, '/html/body/div[3]/div[2]/div[2]/div[1]/div[2]/div[2]').click()
        image2 = get_image(driver, 'after.png')

        # 3. Compare the pixels of both pictures to get the displacement.
        distance = get_distance(image1, image2)

        # 4. Turn the displacement into a human-like movement track.
        tracks = get_tracks(distance)

        # 5. Press and hold the slider button.
        button = driver.find_element(By.CLASS_NAME, 'geetest_slider_button')
        ActionChains(driver).click_and_hold(button).perform()

        # 6. Forward movement.
        for track in tracks['forward_tracks']:
            ActionChains(driver).move_by_offset(xoffset=track, yoffset=0).perform()

        # 7. Move back.
        time.sleep(0.5)
        for back_track in tracks['back_tracks']:
            ActionChains(driver).move_by_offset(xoffset=back_track, yoffset=0).perform()

        # 8. Small back-and-forth wiggle to imitate a human hand.
        for wiggle in (-3, 3, 2, -2):
            ActionChains(driver).move_by_offset(xoffset=wiggle, yoffset=0).perform()

        # 9. Release the slider.
        time.sleep(0.5)
        ActionChains(driver).release().perform()

        # 10. Record whether this attempt was verified.
        time.sleep(3)
        success_tip = driver.find_element(By.CLASS_NAME, 'geetest_success_radar_tip')
        return success_tip.text != ''
    finally:
        # quit() ends the whole WebDriver session; close() only closes the
        # current window, which matters when this runs hundreds of times.
        driver.quit()



if __name__=='__main__':
    # Run 500 attempts, print the outcome of each one and finally the number
    # of attempts that succeeded.
    my_success = 0
    for attempt in range(1, 501):
        my_test = my_scrapy()
        print('[+]第' + str(attempt) + r'/500次模拟状态:' + str(my_test))
        my_success += 1 if my_test else 0

    print(my_success)


Author face

徐静

数据科学从业者,数据分析师. 善于用数据科学的工具透析业务,模型的线上化部署,网络爬虫及前端可视化. 喜欢研究机器学习,深度学习及相关软件实现.目前自己还是小白一个,希望多多学习.

最近发表的文章