验证码识别程序设计流程

以南邮正方教务系统为例

找到验证码图片的链接

“http://jwxt.njupt.edu.cn/CheckCode.aspx”

def getCaptcha():
    """Download one captcha from the Zhengfang system and decode it.

    Returns:
        The image opened with ``PIL.Image.open`` from the response body.

    Raises:
        requests.RequestException: on connection failure, timeout, or when
            the server answers with an HTTP error status.
    """
    CAPTCHA_URL = "http://jwxt.njupt.edu.cn/CheckCode.aspx"

    # Without a timeout an unresponsive server would block the caller forever.
    r = requests.get(CAPTCHA_URL, timeout=10)
    # Fail here on an error page instead of handing HTML to the image decoder.
    r.raise_for_status()
    return Image.open(BytesIO(r.content))

对图片去噪

def image_pre_process(self):
    """Denoise ``self.image`` and replace it with a 1-bit version.

    The work is done in three steps:

    1. Every pixel whose value is 43 (the blue character colour of these
       captchas) becomes ``WHITE``; every other pixel becomes ``BLACK``.
    2. Isolated white pixels are removed: an interior white pixel whose
       eight neighbours are all black is turned black as well.
    3. The cleaned image is converted to mode ``'1'`` and stored back on
       ``self.image``.

    Relies on the module-level ``WHITE`` and ``BLACK`` pixel constants.
    """
    width, height = self.image.width, self.image.height

    # Step 1: keep only the character colour, drop all coloured noise.
    for x in range(width):
        for y in range(height):
            is_character = self.image.getpixel((x, y)) == 43
            self.image.putpixel((x, y), WHITE if is_character else BLACK)

    # Step 2: remove single-pixel noise. Border pixels are skipped because
    # they do not have a full 3x3 neighbourhood.
    for x in range(1, width - 1):
        for y in range(1, height - 1):
            if self.image.getpixel((x, y)) != WHITE:
                continue
            black_count = sum(
                1
                for dx in (-1, 0, 1)
                for dy in (-1, 0, 1)
                if self.image.getpixel((x + dx, y + dy)) == BLACK
            )
            # The centre is white, so 8 black pixels in the 3x3 window
            # means every neighbour is black: treat the pixel as noise.
            if black_count == 8:
                self.image.putpixel((x, y), BLACK)

    # Step 3: binarise once, after all pixels have been cleaned.
    self.image = self.image.convert('1')

△二值化:将需要识别的蓝色字符(43)转为白色(WHITE),其他的全部变为黑色(BLACK)

二值化的过程需要灵性一点,不能直接用convert函数,那样的效果并不理想。

直接convert的效果:

直接二值化

先对蓝色43进行二值化，然后去单点噪声、最后二值化的效果：

去噪再二值化

设计打码程序

"""
方便收集验证码的简单打码工具
"""
import os
import tkinter
from io import BytesIO

import requests
from PIL import Image, ImageTk
from bs4 import BeautifulSoup

class CaptchaGUI:
    """Minimal Tk tool for hand-labelling captchas.

    The window shows one captcha. Typing its text and pressing Return posts
    the login form with that text; when the response contains the expected
    marker text the captcha image is saved as ``captchas/<text>.gif``.
    A fresh captcha is loaded after every submission.

    Constructing an instance enters the Tk main loop, so the constructor
    only returns once the window has been closed.
    """

    captcha_url = "http://jwxt.njupt.edu.cn/CheckCode.aspx"
    login_url = "http://jwxt.njupt.edu.cn/default2.aspx"
    # Seconds to wait for the server; keeps the GUI from freezing forever.
    request_timeout = 10

    def __init__(self):
        # One session for all requests so the captcha and the login POST
        # share the same cookies.
        self.s = requests.session()
        self.get_viewstate()
        self._fetch_captcha()

        self.root = tkinter.Tk()
        # Keep a reference to the PhotoImage on self so it stays alive
        # while the label displays it.
        self.tkimg = ImageTk.PhotoImage(self.im)
        self.imgLabel = tkinter.Label(self.root, image=self.tkimg)
        self.imgLabel.pack()
        self.message = tkinter.Entry(self.root)
        self.message.pack()
        self.root.bind('<Return>', self.judge_and_save)
        self.root.mainloop()

    def _fetch_captcha(self):
        """Download a new captcha into ``self.r`` and decode it into ``self.im``."""
        self.r = self.s.get(self.captcha_url, timeout=self.request_timeout)
        self.im = Image.open(BytesIO(self.r.content))

    def get_viewstate(self):
        """Load the login page and store its ``__VIEWSTATE`` value.

        Raises:
            ValueError: if the page has no ``__VIEWSTATE`` input field.
        """
        r = self.s.get(self.login_url, timeout=self.request_timeout)
        soup = BeautifulSoup(r.content, "lxml")
        field = soup.find('input', attrs={"name": "__VIEWSTATE"})
        if field is None:
            raise ValueError("login page has no __VIEWSTATE field")
        self.viewstate = field.get("value")

    def judge_and_save(self, event):
        """Submit the typed text, save the captcha if accepted, then refresh.

        :param event: Tk event passed by the ``<Return>`` binding (unused).
        """
        captcha_value = self.message.get()
        print(captcha_value)
        data = {
            "__VIEWSTATE": self.viewstate,
            'txtUserName': "",  # account
            'TextBox2': "",  # password
            # NOTE(review): this value is already percent-encoded and the
            # form encoder will presumably escape the '%' signs again;
            # confirm what the server expects before changing it.
            'RadioButtonList1': "%D1%A7%C9%FA",
            "Button1": "",
            "txtSecretCode": captcha_value,
            "hidPdrs": "",
            "hidsc": ""
        }

        r = self.s.post(self.login_url, data=data, timeout=self.request_timeout)
        if "请到信息维护中完善个人联系方式" in r.text:
            print("成功！")
            # The output directory may not exist on the first run.
            os.makedirs("captchas", exist_ok=True)
            with open("captchas/{}.gif".format(captcha_value), 'wb') as f:
                f.write(self.r.content)
        else:
            print("验证码输错了")

        # Load a new form state and a new captcha for the next round.
        self.get_viewstate()
        self._fetch_captcha()
        self.tkimg = ImageTk.PhotoImage(self.im)
        self.imgLabel.config(image=self.tkimg)
        self.message.delete(0, 'end')


if __name__ == "__main__":
    # The constructor builds the window and runs the Tk main loop itself,
    # so this line blocks until the window is closed.
    captcha_gui = CaptchaGUI()

将结果分割到每个字符

def handle_split_image(self):
    """Cut the captcha into its four character images.

    The image is cropped to rows 0-22 between each pair of consecutive
    column boundaries, and every crop is passed through ``rotate_img``.

    :return: list with one image per character, left to right
    """
    top, bottom = 0, 22
    boundaries = [5, 17, 29, 41, 53]

    characters = []
    for left, right in zip(boundaries[:-1], boundaries[1:]):
        cell = self.image.crop([left, top, right, bottom])
        characters.append(rotate_img(cell))
    return characters

def spilt2chars():
    """Split the collected captchas into single characters and save them.

    Recreates ``captcha_chars/`` with one sub-directory per character, then
    processes every file in ``captchas/``: the first four characters of the
    file name are taken as the labels, the image is denoised/binarised by
    ``ZhengfangCaptcha`` and each character image is written as a PNG into
    the directory of its label.
    """
    # Start from an empty output tree. A missing directory is the only
    # expected failure (first run); anything else should surface.
    try:
        shutil.rmtree('captcha_chars')
    except FileNotFoundError:
        pass
    os.mkdir("captcha_chars")
    for label in "abcdefghijklmnopqrstuvwxyz1234567890":
        os.mkdir('captcha_chars/{}'.format(label))

    for file_name in os.listdir('captchas'):
        # Test the path inside 'captchas', not relative to the working dir.
        source_path = os.path.join('captchas', file_name)
        if os.path.isdir(source_path) or file_name == '.DS_Store':
            continue

        labels = file_name[:4]
        with Image.open(source_path) as im:
            # The split works on the denoised, binarised image.
            captcha = ZhengfangCaptcha(im)
            parts = captcha.handle_split_image()
            for index, (im_part, value) in enumerate(zip(parts, labels)):
                # File name and position make the hashed text unique even
                # when a captcha repeats a character within one clock tick.
                m = hashlib.md5()
                m.update("{}{}{}{}".format(file_name, index, time.time(), value).encode('utf8'))
                im_part.save("captcha_chars/{}/{}.png".format(value, m.hexdigest()))

保存模型数据

if __name__ == "__main__":
    # Call spilt2chars() first when captcha_chars/ has to be regenerated.
    # spilt2chars()

    # Turn every stored character image into a vector, keyed by its label.
    ignored_files = ("Thumbs.db", ".DS_Store")
    imageset = []
    for letter in 'qwertyuiopasdfghjklzcxvbnm1234567890':
        char_dir = 'captcha_chars/{}/'.format(letter)
        try:
            for img in os.listdir(char_dir):
                if img in ignored_files:
                    continue
                sample = Image.open("captcha_chars/{}/{}".format(letter, img))
                imageset.append({letter: ZhengfangCaptcha.buildvector(sample)})
        except FileNotFoundError:
            # No samples were collected for this character; skip it.
            pass

    with open('image_data.json', 'w') as f:
        json.dump(imageset, f)

识别

辅助函数:

import math
import operator
from functools import reduce

# Pixel values used for black and white when reading and writing pixels.
BLACK = 0
WHITE = 255


def magnitude(concordance):
    """Return the Euclidean length of a vector.

    :param concordance: iterable of numbers
    :return: square root of the sum of squares; 0.0 for an empty vector
    """
    # sum() starts from 0, so an empty vector yields 0.0 instead of the
    # TypeError that reduce() raises without an initial value.
    return math.sqrt(sum(x ** 2 for x in concordance))


def distance_cos(vector1, vector2):
    """Return the cosine distance (1 - cosine similarity) of two vectors.

    The dot product pairs elements with ``zip``, so trailing elements of
    the longer vector do not contribute to it.

    :return: ``1 - dot / (|vector1| * |vector2|)``; 1.0 when either vector
        has zero length, where the cosine is undefined and the vectors are
        treated as having no similarity
    """
    norm_product = magnitude(vector1) * magnitude(vector2)
    if norm_product == 0:
        # An all-zero vector would otherwise cause a ZeroDivisionError.
        return 1.0
    dot = sum(value1 * value2 for value1, value2 in zip(vector1, vector2))
    return 1 - dot / norm_product


def distance_hanmming(vector1, vector2):
    """Return the Hamming distance of two vectors.

    Counts the positions at which the paired elements differ; meant for
    vectors that contain only 0 and 1. Elements are paired with ``zip``,
    so trailing elements of the longer vector are ignored.
    """
    return sum(1 for left, right in zip(vector1, vector2) if left != right)


def build_vector(image, binary=True):
    """Flatten an image into a one-dimensional feature vector.

    :param image: pillow Image object with mode 1 or mode L
    :param binary: when true, emit 1 for pixels equal to 255 and 0 for all
        others; when false, emit the raw pixel values
    :return: list of int
    """
    pixels = image.getdata()
    if binary:
        return [1 if pixel == 255 else 0 for pixel in pixels]
    return list(pixels)


def rotate_img(image):
    """Straighten a character by minimising its projection on the x axis.

    Every whole angle from -45 to 44 degrees is tried. For each one the
    number of columns containing at least one white pixel is counted, and
    the first angle with the smallest count wins.

    :param image: PIL.Image object
    :return: the input rotated by the winning angle, without expanding
    """
    best_angle = 0
    narrowest = 1000
    for candidate in range(-45, 45):
        # expand=True keeps the whole rotated character inside the canvas
        # while its width is measured.
        rotated = image.rotate(candidate, expand=True)
        occupied_columns = sum(
            1
            for col in range(rotated.width)
            if any(rotated.getpixel((col, row)) == WHITE for row in range(rotated.height))
        )
        if occupied_columns < narrowest:
            narrowest, best_angle = occupied_columns, candidate
    return image.rotate(best_angle, expand=False)

识别部分:

class xxx(object):
    """Recognise Zhengfang captchas with a k-nearest-neighbour vote.

    NOTE(review): ``image_pre_process`` and ``handle_split_image`` are
    called here but not defined in this snippet; they are presumably the
    methods shown in the earlier sections.
    """

    def __init__(self, image):
        self.image = image
        # Denoise and binarise right away so every later step works on the
        # cleaned image.
        self.image_pre_process()

    def crack(self):
        """Return the recognised text of the captcha.

        Each character image is compared by Hamming distance with every
        stored sample from ``image_data.json``. The 15 nearest samples vote
        and the label with the most votes wins; on a tie the label whose
        first vote came from the nearer sample is chosen.

        :return: str with one recognised character per split image
        :raises ValueError: if ``image_data.json`` holds no samples
        """
        # Load the training data once, before looping over the characters.
        with open(os.path.join(current_dir, 'image_data.json'), 'rb') as f:
            image_data = json.load(f)

        # Each entry is a one-item {label: vector} dict; flatten to pairs.
        samples = [
            (label, vector)
            for entry in image_data
            for label, vector in entry.items()
        ]
        if not samples:
            raise ValueError("image_data.json contains no training samples")

        result = []
        for letter in self.handle_split_image():
            letter_vector = build_vector(letter)
            guess = sorted(
                (distance_hanmming(vector, letter_vector), label)
                for label, vector in samples
            )
            neighbors = guess[:15]  # the fifteen nearest samples

            class_votes = {}
            for _, label in neighbors:
                class_votes[label] = class_votes.get(label, 0) + 1
            # max() keeps the first label among equals, i.e. the nearest.
            result.append(max(class_votes, key=class_votes.get))
        return ''.join(result)

△ 在识别时，一定要保证模型数据与待识别图片在处理方式上的一致性。由于经过预处理的图片识别率更高，模型需要保存去噪、二值化后的数据，待识别的图片在识别前也要做相同的处理。