Mrli
别装作很努力,
因为结局不会陪你演戏。
Contacts:
QQ博客园

验证码识别程序设计流程

2019/10/07 爬虫 Python
Word count: 1,540 | Reading time: 8min

验证码识别程序设计流程

南邮正方教务系统为例

找到验证码图片的链接

http://jwxt.njupt.edu.cn/CheckCode.aspx

1
2
3
4
5
6
7
def getCaptcha(timeout=10):
    """Download one captcha image from the Zhengfang login site.

    :param timeout: seconds to wait for the server before giving up
    :return: the downloaded captcha opened as a PIL image
    :raises requests.RequestException: on a network failure, a timeout, or
        an HTTP error status
    """
    CAPTCHA_URL = "http://jwxt.njupt.edu.cn/CheckCode.aspx"

    # Without a timeout, requests waits indefinitely on an unresponsive server.
    r = requests.get(CAPTCHA_URL, timeout=timeout)
    # Fail here on 4xx/5xx instead of handing an error page to the image decoder.
    r.raise_for_status()
    return Image.open(BytesIO(r.content))

对图片去噪

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
def image_pre_process(self):
    """Denoise ``self.image`` in place and convert it to a 1-bit image.

    Pass 1 maps every pixel whose value is 43 to WHITE and every other
    pixel to BLACK (per the accompanying article, 43 is the blue glyph
    colour; this presumably is a palette index of the GIF -- confirm).
    Pass 2 turns an interior WHITE pixel BLACK when all eight of its
    neighbours are BLACK, which removes single-pixel specks. Pixels on the
    image border are left untouched by pass 2. Finally ``self.image`` is
    replaced by its ``convert('1')`` result.
    """
    image = self.image
    width, height = image.width, image.height

    # Pass 1: keep only the glyph colour.
    for x in range(width):
        for y in range(height):
            is_glyph = image.getpixel((x, y)) == 43
            image.putpixel((x, y), WHITE if is_glyph else BLACK)

    # Pass 2: drop isolated white pixels. Only interior pixels have a full
    # 3x3 neighbourhood, so the ranges skip the border rows and columns.
    for x in range(1, width - 1):
        for y in range(1, height - 1):
            if image.getpixel((x, y)) != WHITE:
                # Only white pixels can be flipped; skip the neighbour scan.
                continue
            black_count = sum(
                1
                for dx in (-1, 0, 1)
                for dy in (-1, 0, 1)
                if image.getpixel((x + dx, y + dy)) == BLACK
            )
            # The centre is white, so 8 black cells means every neighbour
            # in the 8-connected neighbourhood is black.
            if black_count == 8:
                image.putpixel((x, y), BLACK)

    self.image = image.convert('1')

△二值化:将需要识别的蓝色字符(43)转为白色(WHITE),其他的全部变为黑色(BLACK)

  • 二值化的过程需要灵性一点,不能直接用convert函数,那样的效果并不理想。

直接convert的效果:

直接二值化

先对蓝色43进行二值化,然后去单点噪声、最后二值化的效果:

去噪再二值化

设计打码程序设计

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
"""
方便收集验证码的简单打码工具
"""
import tkinter
from io import BytesIO

import requests
from PIL import Image, ImageTk
from bs4 import BeautifulSoup

class CaptchaGUI:
    """Small Tk tool for collecting hand-labelled captchas.

    A captcha is shown next to a text entry. Pressing Return submits the
    typed text through the login form; if the response contains the
    post-login marker text, the captcha bytes are saved as
    ``captchas/<typed text>.gif``. A fresh captcha is then displayed.

    Constructing an instance performs network requests and runs the Tk main
    loop, so the constructor only returns once the window is closed.
    """

    captcha_url = "http://jwxt.njupt.edu.cn/CheckCode.aspx"
    login_url = "http://jwxt.njupt.edu.cn/default2.aspx"
    # Seconds to wait on every HTTP request before giving up.
    timeout = 10

    def __init__(self):
        # A single session is used for all requests (presumably so the
        # server can associate the captcha with the login attempt).
        self.s = requests.session()
        self.get_viewstate()
        self._fetch_captcha()

        self.root = tkinter.Tk()
        # Keep a reference on self: Tk does not keep the image alive itself.
        self.tkimg = ImageTk.PhotoImage(self.im)
        self.imgLabel = tkinter.Label(self.root, image=self.tkimg)
        self.imgLabel.pack()
        self.message = tkinter.Entry(self.root)
        self.message.pack()
        self.root.bind('<Return>', self.judge_and_save)
        self.root.mainloop()

    def _fetch_captcha(self):
        """Download a new captcha into ``self.r`` and decode it into ``self.im``."""
        self.r = self.s.get(self.captcha_url, timeout=self.timeout)
        self.r.raise_for_status()
        self.im = Image.open(BytesIO(self.r.content))

    def get_viewstate(self):
        """Load the login page and store its ``__VIEWSTATE`` value.

        :raises RuntimeError: if the page has no ``__VIEWSTATE`` input
        """
        r = self.s.get(self.login_url, timeout=self.timeout)
        r.raise_for_status()
        soup = BeautifulSoup(r.content, "lxml")
        field = soup.find('input', attrs={"name": "__VIEWSTATE"})
        if field is None:
            raise RuntimeError("login page has no __VIEWSTATE input")
        self.viewstate = field.get("value")

    def judge_and_save(self, event):
        """Submit the typed captcha, save it if accepted, then show a new one.

        :param event: the Tk key event that triggered the callback (unused)
        """
        # Local import: the snippet's import block does not bring in os.
        import os

        captcha_value = self.message.get()
        print(captcha_value)
        data = {
            "__VIEWSTATE": self.viewstate,
            'txtUserName': "",  # account
            'TextBox2': "",  # password
            # "学生" as GBK bytes. requests percent-encodes bytes once, giving
            # %D1%A7%C9%FA on the wire; passing that already-encoded text as
            # a str would get its "%" signs encoded a second time.
            'RadioButtonList1': "学生".encode("gbk"),
            "Button1": "",
            "txtSecretCode": captcha_value,
            "hidPdrs": "",
            "hidsc": ""
        }

        r = self.s.post(self.login_url, data=data, timeout=self.timeout)
        if "请到信息维护中完善个人联系方式" in r.text:
            print("成功!")
            # The output directory may not exist on the first run.
            os.makedirs("captchas", exist_ok=True)
            with open("captchas/{}.gif".format(captcha_value), 'wb') as f:
                f.write(self.r.content)
        else:
            print("验证码输错了")

        # Whatever the outcome, load a fresh form state and captcha.
        self.get_viewstate()
        self._fetch_captcha()
        self.tkimg = ImageTk.PhotoImage(self.im)
        self.imgLabel.config(image=self.tkimg)
        self.message.delete(0, 'end')


if __name__ == "__main__":
    # The constructor itself runs the Tk main loop, so this line returns
    # only after the window has been closed.
    captcha_gui = CaptchaGUI()

将结果分割到每个字符

1
2
3
4
5
6
def handle_split_image(self, split_lines=(5, 17, 29, 41, 53), y_min=0, y_max=22):
    """Cut the captcha into one image per character.

    Each pair of consecutive x positions in ``split_lines`` bounds one
    character; every crop is passed through ``rotate_img`` before being
    returned. With the defaults this yields four crops, each 12 pixels
    wide and 22 pixels high.

    :param split_lines: ascending x coordinates of the cut lines
    :param y_min: top edge shared by all crops
    :param y_max: bottom edge shared by all crops
    :return: list of character images, left to right
    """
    return [
        rotate_img(self.image.crop((left, y_min, right, y_max)))
        for left, right in zip(split_lines[:-1], split_lines[1:])
    ]
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
def spilt2chars():
    """Split every labelled captcha into characters and save them by label.

    Recreates ``captcha_chars/`` with one sub-directory per character, then
    reads each file in ``captchas/`` whose first four name characters are
    its label, and stores each character crop under
    ``captcha_chars/<char>/<md5>.png``.
    """
    # Start from an empty output tree. A missing directory is the only
    # failure that is safe to ignore here.
    try:
        shutil.rmtree('captcha_chars')
    except FileNotFoundError:
        pass
    os.mkdir("captcha_chars")
    charset = "abcdefghijklmnopqrstuvwxyz1234567890"
    for char in charset:
        os.mkdir('captcha_chars/{}'.format(char))

    for file_name in os.listdir('captchas'):
        # listdir returns bare names, so the directory test needs the full path.
        path = os.path.join('captchas', file_name)
        if os.path.isdir(path) or file_name == '.DS_Store':
            continue
        label = file_name[:4]
        with Image.open(path) as im:
            # The captcha object works on the denoised, binarised image.
            captcha = ZhengfangCaptcha(im)
            parts = captcha.handle_split_image()
            for index, (im_part, value) in enumerate(zip(parts, label)):
                # Name each crop after its source file and position. A
                # timestamp can repeat for a character that occurs twice in
                # one captcha, and the second crop would overwrite the first.
                m = hashlib.md5()
                m.update("{}{}{}".format(file_name, index, value).encode('utf8'))
                im_part.save("captcha_chars/{}/{}.png".format(value, m.hexdigest()))

保存模型数据

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
if __name__ == "__main__":
    # Uncomment to regenerate captcha_chars/ from captchas/ first.
    # spilt2chars()
    letters = list('qwertyuiopasdfghjklzcxvbnm1234567890')
    # Turn every character image into a vector and store the labelled set.
    imageset = []
    for letter in letters:
        try:
            file_names = os.listdir('captcha_chars/{}/'.format(letter))
        except FileNotFoundError:
            # No samples were collected for this character.
            continue
        for img in file_names:
            if img in ("Thumbs.db", ".DS_Store"):
                continue
            with Image.open("captcha_chars/{}/{}".format(letter, img)) as char_image:
                # NOTE(review): the helper visible elsewhere is the
                # module-level build_vector(); confirm that
                # ZhengfangCaptcha.buildvector exists.
                vector = ZhengfangCaptcha.buildvector(char_image)
            imageset.append({letter: vector})

    with open('image_data.json', 'w') as f:
        json.dump(imageset, f)

识别

辅助函数:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import math
import operator
from functools import reduce

BLACK = 0
WHITE = 255


def magnitude(concordance):
    """Return the Euclidean length of a vector.

    :param concordance: iterable of numbers
    :return: square root of the sum of squares as a float; 0.0 for an
        empty vector
    """
    # sum() starts from 0, so an empty vector yields 0.0 instead of the
    # TypeError that reduce() raises without an initial value.
    return math.sqrt(sum(x ** 2 for x in concordance))


def distance_cos(vector1, vector2):
    """Return the cosine distance (1 - cosine similarity) of two vectors.

    The dot product runs over the paired elements of both vectors; each
    norm is taken over the whole of its own vector.

    :param vector1: sequence of numbers
    :param vector2: sequence of numbers
    :return: ``1 - dot / (|vector1| * |vector2|)``; 1.0 when either vector
        has zero length, where the cosine is undefined
    """
    dot = sum(a * b for a, b in zip(vector1, vector2))
    norm1 = math.sqrt(sum(a * a for a in vector1))
    norm2 = math.sqrt(sum(b * b for b in vector2))
    denominator = norm1 * norm2
    if denominator == 0:
        # An all-zero vector (e.g. a blank character crop) has no
        # direction; report "no similarity" instead of dividing by zero.
        return 1.0
    return 1 - dot / denominator


def distance_hanmming(vector1, vector2):
    """Return the Hamming distance between two vectors.

    Intended for vectors that contain only 0 and 1. Elements are compared
    pairwise; if the lengths differ, the surplus tail of the longer vector
    is ignored.

    :param vector1: sequence of comparable values
    :param vector2: sequence of comparable values
    :return: number of positions at which the paired elements differ
    """
    return sum(1 for value1, value2 in zip(vector1, vector2) if value1 != value2)


def build_vector(image, binary=True):
    """Flatten an image into a one-dimensional feature vector.

    :param image: pillow Image object with mode 1 or mode L
    :param binary: when true, emit 1 for every pixel equal to 255 and 0
        for every other pixel; when false, emit the raw pixel values
    :return: list of int, in the order given by ``image.getdata()``
    """
    pixels = image.getdata()
    if binary:
        return [1 if pixel == 255 else 0 for pixel in pixels]
    return list(pixels)


def rotate_img(image):
    """Rotate a character image to the angle with the narrowest x projection.

    Every whole-degree angle from -45 to 44 is tried. For each one, the
    columns containing at least one WHITE pixel are counted on a rotated
    copy made with ``expand=True``. The first angle with the smallest
    count is then applied to the original image with ``expand=False``, so
    the result keeps the input size.

    :param image: PIL.Image object
    :return: rotated Image object
    """
    best_angle = 0
    # None instead of a fixed sentinel: a numeric starting value would
    # silently disable the search for images with that many occupied columns.
    fewest_columns = None
    for angle in range(-45, 45):
        candidate = image.rotate(angle, expand=True)
        occupied_columns = sum(
            1
            for x in range(candidate.width)
            if any(
                candidate.getpixel((x, y)) == WHITE
                for y in range(candidate.height)
            )
        )
        if fewest_columns is None or occupied_columns < fewest_columns:
            fewest_columns = occupied_columns
            best_angle = angle
    return image.rotate(best_angle, expand=False)

识别部分:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
class xxx(object):
    """Recognise Zhengfang captchas with a k-nearest-neighbours vote.

    NOTE(review): ``image_pre_process`` and ``handle_split_image`` are
    called below but are not defined in this class body; presumably they
    are methods of the complete class -- confirm.
    """

    def __init__(self, image):
        self.image = image
        # Denoise and binarise as soon as the object is created.
        self.image_pre_process()

    def crack(self, k=15):
        """Return the recognised captcha text.

        Each character crop is turned into a vector and compared by
        Hamming distance with every sample in ``image_data.json``. The
        label that occurs most often among the ``k`` closest samples wins.

        :param k: number of nearest samples that take part in the vote
        :return: the recognised characters joined into one string
        :raises ValueError: if ``k`` is below 1 or the data file holds no
            samples
        """
        if k < 1:
            raise ValueError("k must be at least 1")

        # Load the labelled training vectors.
        with open(os.path.join(current_dir, 'image_data.json'), 'rb') as f:
            image_data = json.load(f)
        # Each entry is a {label: vector} dict; flatten once, not per character.
        samples = [
            (label, vector)
            for entry in image_data
            for label, vector in entry.items()
        ]
        if not samples:
            raise ValueError("image_data.json contains no training samples")

        result = []
        for letter in self.handle_split_image():
            letter_vector = build_vector(letter)
            guess = sorted(
                (distance_hanmming(vector, letter_vector), label)
                for label, vector in samples
            )
            class_votes = {}
            for _, label in guess[:k]:
                class_votes[label] = class_votes.get(label, 0) + 1
            # max() keeps the first maximum and dicts keep insertion order,
            # so a tie goes to the label whose closest sample ranked first.
            winner, _ = max(class_votes.items(), key=lambda item: item[1])
            result.append(winner)
        return ''.join(result)

△ 在识别时,一定要保证训练数据与待识别图片的处理方式一致。由于经过预处理的图片识别率更高,模型需要保存去噪、二值化后的数据,待识别的图片在识别前也要做相同的处理。

Author: Mrli

Link: https://nymrli.top/2019/09/09/验证码识别程序设计流程/

Copyright: All articles in this blog are licensed under CC BY-NC-SA 3.0 unless stating additionally.

< PreviousPost
打jar包和使用jar包
NextPost >
新加坡游玩经验
CATALOG
  1. 1. 验证码识别程序设计流程
    1. 1.1. 找到验证码图片的链接
    2. 1.2. 对图片去噪
    3. 1.3. 设计打码程序设计
    4. 1.4. 将结果分割到每个字符
    5. 1.5. 保存模型数据
    6. 1.6. 识别