编辑推荐:
本文来自于biaodianfu，文章主要介绍了二值化、字符切割、字符识别等，使用KNN进行验证码的识别的相关内容。
识别验证码的方式很多，如tesseract、SVM等。今天主要学习的是如何使用KNN进行验证码的识别。
数据准备
本次实验采用的是CSDN的验证码做演练，相关的接口： https://download.csdn.net/index.php/rest/tools/validcode/source_ip_validate/10.5711163911089325
目前接口返回的验证码共2种：
纯数字、干扰小的验证码，简单进行图片去除背景、二值化和阈值处理后，使用kNN算法即可识别。
字母加数字、背景有干扰、图形字符位置有轻微变形，进行图片去除背景、二值化和阈值处理后，使用kNN算法识别。
这里选择第二种进行破解。由于两种验证码的图片大小不一样，所以可以使用图片大小来判断哪个是第一种验证码，哪个是第二种验证码。
下载验证码
import requests
import uuid
from PIL import Image
import os
# Download 1000 captcha images and keep only those whose size is 70x25
# (presumably the letters-and-digits variant the article targets -- confirm
# against the endpoint; only the size test itself is visible here).
url = "http://download.csdn.net/index.php/rest/tools/validcode/source_ip_validate/10.5711163911089325"
# open() below fails if the target folder does not exist yet.
os.makedirs("./captchas", exist_ok=True)
for _ in range(1000):
    # A timeout keeps one stalled connection from hanging the whole run.
    resp = requests.get(url, timeout=10)
    # Fail early on an HTTP error instead of saving an error page as .png.
    resp.raise_for_status()
    filename = "./captchas/" + str(uuid.uuid4()) + ".png"
    with open(filename, 'wb') as f:
        for chunk in resp.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
    # The image handle must be closed before the file can be removed
    # (required on Windows); the with-block also closes it on the keep path.
    with Image.open(filename) as im:
        wanted = im.size == (70, 25)
    if wanted:
        print(filename)
    else:
        os.remove(filename)
分割字符
下载过后，就需要对字母进行分割，分割字符还是一件比较麻烦的工作。
灰度化
将彩色的图片转化为灰度图片，便于后面的二值化处理，示例代码：
from PIL import Image

# Convert one downloaded captcha to grayscale and display it.
file = ".\\captchas\\0a4a22cd-f16b-4ae4-bc52-cdf4c081301d.png"
im = Image.open(file)
im_gray = im.convert('L')  # 'L' = single-channel 8-bit grayscale
im_gray.show()
处理前：
处理后：
二值化
灰度化以后，有颜色的像素点为0-255之间的值。二值化就是将大于某个值的像素点都修改为255，小于该值的修改为0，示例代码：
from PIL import Image
import numpy as np

# Binarize one captcha: pixels brighter than the threshold become 255,
# everything else becomes 0.
file = ".\\captchas\\0a4a22cd-f16b-4ae4-bc52-cdf4c081301d.png"
im = Image.open(file)
im_gray = im.convert('L')
pix = np.array(im_gray)
print(pix.shape)
print(pix)
threshold = 100  # binarization threshold
# bool * 255 produces a platform-sized integer array, which Image.fromarray
# may refuse; cast to uint8 so the result is a valid 8-bit grayscale image.
pix = ((pix > threshold) * 255).astype(np.uint8)
print(pix)
out = Image.fromarray(pix)
out.show()
二值化输出的结果：
去除边框
从二值化输出的结果可以看到除了字符，还存在边框，在切割字符前还需要先将边框去除。
# Crop the frame off the binarized array `pix` by dropping `border_width`
# pixels from every side.
border_width = 1
new_pix = pix[border_width:-border_width, border_width:-border_width]
字符切割
由于字符与字符间没有存在连接，可以使用比较简单的“投影法”进行字符的切割，原理就是将二值化后的图片先在垂直方向进行投影，根据投影后的极值来判断分割边界。分割后的小图片再在水平方向进行投影。

代码实现：
def vertical_image(image):
    """Show the vertical projection of a binarized image in a window.

    For every column, the number of pixels equal to 255 is counted and
    drawn as a vertical bar of that length starting at the top edge.
    Blocks until a key is pressed, then closes the window.

    Args:
        image: 2-D array of pixel values.
    """
    _, width = image.shape
    # Per-column count of pixels whose value is exactly 255.
    counts = np.count_nonzero(image == 255, axis=0)
    new_image = np.zeros(image.shape, np.uint8)
    for x in range(width):
        # cv2.line expects plain Python ints for point coordinates.
        cv2.line(new_image, (x, 0), (x, int(counts[x])), 255, 1)
    cv2.imshow('vert_image', new_image)
    cv2.waitKey()
    cv2.destroyAllWindows()
整体代码
from PIL import Image
import cv2
import numpy as np
import os
import uuid
def clean_bg(filename, threshold=100, border_width=1):
    """Load an image, binarize it and crop away its border.

    NOTE: this file defines ``clean_bg`` a second time further down (with a
    2-pixel border); when the module is loaded that later definition
    replaces this one.

    Args:
        filename: path of the image to load.
        threshold: grayscale values strictly above this become 255, all
            others become 0.
        border_width: number of pixels removed from each side.

    Returns:
        2-D uint8 array containing only 0 and 255.
    """
    # Close the file handle as soon as the pixels have been copied out.
    with Image.open(filename) as im:
        image = np.array(im.convert('L'))
    # Cast to uint8 so the result can be saved or shown as an 8-bit image.
    pix = ((image > threshold) * 255).astype(np.uint8)
    height, width = pix.shape
    # Explicit end indices also behave correctly for border_width == 0.
    return pix[border_width:height - border_width,
               border_width:width - border_width]
def get_col_rect(image, blank_distance=1):
    """Find the horizontal extents of the glyphs in a binarized image.

    Columns are projected onto the x axis by counting pixels equal to 0.
    Each maximal run of columns holding at least ``blank_distance`` such
    pixels is reported as one glyph.

    Args:
        image: 2-D array of pixel values; 0 is treated as ink.
        blank_distance: minimum number of ink pixels for a column to count
            as part of a glyph.

    Returns:
        List of ``(start, end)`` column ranges, ``end`` exclusive, ordered
        left to right. Empty when no column contains ink.
    """
    ink_per_col = np.count_nonzero(np.asarray(image) == 0, axis=0)
    col_rect = []
    start = None  # first column of the run currently open, if any
    for x, count in enumerate(ink_per_col):
        if start is None:
            if count >= blank_distance:
                start = x
        elif count < blank_distance:
            col_rect.append((start, x))
            start = None
    # A glyph touching the right edge has no blank column to close it;
    # flush it here so it is not silently dropped.
    if start is not None:
        col_rect.append((start, len(ink_per_col)))
    return col_rect
def get_row_rect(image, blank_distance=1):
    """Find the vertical extent of the first run of inked rows.

    Rows are projected onto the y axis by counting pixels equal to 0. Only
    the first run of rows holding at least ``blank_distance`` such pixels
    is returned; anything below the first blank row after it is ignored.

    Args:
        image: 2-D array of pixel values; 0 is treated as ink.
        blank_distance: minimum number of ink pixels for a row to count as
            part of the glyph.

    Returns:
        ``(start, end)`` row range with ``end`` exclusive, or ``(0, 0)``
        when no row contains ink.
    """
    ink_per_row = np.count_nonzero(np.asarray(image) == 0, axis=1)
    start = None  # first row of the run, once found
    for y, count in enumerate(ink_per_row):
        if start is None:
            if count >= blank_distance:
                start = y
        elif count < blank_distance:
            return (start, y)
    if start is None:
        return (0, 0)
    # The run reaches the bottom edge: end one past the last row so that
    # the final inked row stays inside the exclusive range.
    return (start, len(ink_per_row))
def get_block_image(image, col_rect):
    """Cut a single glyph out of the image.

    The column range selects a vertical strip; ``get_row_rect`` then
    supplies the row range of the ink inside that strip.

    Args:
        image: 2-D array of pixel values.
        col_rect: ``(start, end)`` column range, ``end`` exclusive.

    Returns:
        The cropped 2-D array, or ``None`` when ``get_row_rect`` reports an
        end index of 0 (its "nothing found" result).
    """
    col_image = image[:, col_rect[0]:col_rect[1]]
    row_rect = get_row_rect(col_image)
    if row_rect[1] == 0:
        return None
    return image[row_rect[0]:row_rect[1], col_rect[0]:col_rect[1]]
def clean_bg(filename, threshold=100, border_width=2):
    """Load an image, binarize it and crop away its border.

    This is the second definition of ``clean_bg`` in the file and therefore
    the one in effect once the module is loaded.

    Args:
        filename: path of the image to load.
        threshold: grayscale values strictly above this become 255, all
            others become 0.
        border_width: number of pixels removed from each side.

    Returns:
        2-D uint8 array containing only 0 and 255.
    """
    # Close the file handle as soon as the pixels have been copied out.
    with Image.open(filename) as im:
        image = np.array(im.convert('L'))
    # Cast to uint8 so the result can be written by cv2.imwrite.
    pix = ((image > threshold) * 255).astype(np.uint8)
    height, width = pix.shape
    # Explicit end indices also behave correctly for border_width == 0.
    return pix[border_width:height - border_width,
               border_width:width - border_width]
def split(filename):
    """Split one captcha into glyph images saved under ``letters/``.

    The captcha is binarized and cropped with ``clean_bg`` and cut into
    column ranges with ``get_col_rect``. Every non-empty glyph is written
    to ``letters/<uuid4>.png``.

    Args:
        filename: path of the captcha image.
    """
    image = clean_bg(filename)
    # Make sure the output folder exists before writing into it.
    os.makedirs('letters', exist_ok=True)
    for cols in get_col_rect(image):
        block_image = get_block_image(image, cols)
        if block_image is None:
            continue
        new_image_filename = 'letters/' + str(uuid.uuid4()) + '.png'
        # Write an 8-bit image regardless of the integer type of the crop.
        cv2.imwrite(new_image_filename, block_image.astype(np.uint8))
if __name__ == '__main__':
    # Split every downloaded captcha into single-glyph images.
    for filename in os.listdir('captchas'):
        current_file = 'captchas/' + filename
        split(current_file)
        print('split file:%s' % current_file)
数据集准备
在完成图像切割后，需要将切分的字母建立有标签的样本。即将切分后的字符梳理到正确的分类中。比较常见的方式是人工梳理。
由于图像比较多，这里使用Tesseract-OCR进行识别。
官方项目地址：https://github.com/tesseract-ocr/tesseract
Windows安装包地址：https://github.com/UB-Mannheim/tesseract/wiki
Tesseract-OCR的安装
下载完安装包后，直接运行安装即可，比较重要的是环境变量的设置。
将安装目录（D:\Program Files (x86)\Tesseract-OCR）添加进PATH
新建TESSDATA_PREFIX系统变量，值为tessdata 文件夹的路径（D:\Program Files (x86)\Tesseract-OCR\tessdata）
安装Python包pytesseract（pip install pytesseract）
Tesseract-OCR的使用
使用起来非常的简单，代码如下：
from PIL import Image
import pytesseract
import os
def copy_to_dir(filename):
    """Label a glyph image with Tesseract and file it under ``dataset/``.

    The image is recognized as a single character and saved as
    ``dataset/<label>/<original file name>``. Images for which Tesseract
    returns no text are skipped, so that no image file ends up directly
    inside ``dataset/``, where only label folders are expected.

    Args:
        filename: path of the glyph image, e.g. ``letters/<uuid>.png``.
    """
    config = (
        "-c tessedit"
        "_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
        " --psm 10"
        " -l osd"
        " "
    )
    with Image.open(filename) as image:
        # The OCR output may carry surrounding whitespace; remove it before
        # using the text as a folder name.
        code = pytesseract.image_to_string(image, config=config).strip()
        if not code:
            return
        target_dir = os.path.join("dataset", code)
        # makedirs also creates "dataset" itself when it is missing.
        os.makedirs(target_dir, exist_ok=True)
        image.save(os.path.join(target_dir, os.path.basename(filename)))
if __name__ == "__main__":
    # Run OCR-based labelling over every glyph image in letters/.
    for filename in os.listdir('letters'):
        current_file = 'letters/' + filename
        copy_to_dir(current_file)
        print(current_file)
由于Tesseract-OCR识别的准确率非常的低，完全不能使用，放弃~，还是需要手工整理。
图片尺寸统一
在完成人工处理后，发现切割后的图片大小不一。在字符识别前需要对图片的尺寸进行统一。
具体实现方法：
import cv2
def image_resize(filename, width=6, height=10):
    """Resize a glyph image in place to ``width`` x ``height`` pixels.

    The file is read as single-channel grayscale. It is rewritten only
    when its size differs from the target. The pixel array is printed
    before and after resizing.

    Args:
        filename: path of the image; overwritten when a resize happens.
        width: target width in pixels.
        height: target height in pixels.

    Raises:
        IOError: if the image cannot be read.
    """
    img = cv2.imread(filename, cv2.IMREAD_GRAYSCALE)  # single channel
    if img is None:
        # cv2.imread returns None instead of raising on failure.
        raise IOError("cannot read image: %s" % filename)
    print(img)
    if img.shape[0] != height or img.shape[1] != width:
        # cv2.resize takes the target size as (width, height), while
        # img.shape is (rows, cols).
        img = cv2.resize(img, (width, height), interpolation=cv2.INTER_CUBIC)
        print(img)
        cv2.imwrite(filename, img)
使用cv2.resize时，参数输入是 宽×高×通道，这里使用的是单通道的，interpolation的选项有：
INTER_NEAREST 最近邻插值
INTER_LINEAR 双线性插值（默认设置）
INTER_AREA 使用像素区域关系进行重采样。 它可能是图像抽取（缩小）的首选方法，因为它会产生无摩尔纹的结果。
但是当图像放大时，它类似于INTER_NEAREST方法。
INTER_CUBIC 4×4像素邻域的双三次插值
INTER_LANCZOS4 8×8像素邻域的Lanczos插值
另外为了让数据更加便于利用，可以将图片再进行二值化的归一。具体代码如下：
import cv2
import numpy as np
def image_normalize(filename):
    """Resize a glyph image to 6x10 and min-max normalize it in place.

    The file is read as single-channel grayscale, resized to 6 pixels wide
    by 10 pixels high when necessary, min-max normalized to the range
    [0, 1] and written back to the same path.

    Args:
        filename: path of the image; always overwritten.

    Raises:
        IOError: if the image cannot be read.
    """
    img = cv2.imread(filename, cv2.IMREAD_GRAYSCALE)  # single channel
    if img is None:
        # cv2.imread returns None instead of raising on failure.
        raise IOError("cannot read image: %s" % filename)
    if img.shape[0] != 10 or img.shape[1] != 6:
        img = cv2.resize(img, (6, 10), interpolation=cv2.INTER_CUBIC)
    # Let OpenCV allocate the output: a hand-made (6, 10) buffer has the
    # wrong shape for a 10-row by 6-column image and would be discarded.
    normalized_img = cv2.normalize(img, None, 0, 1, cv2.NORM_MINMAX)
    cv2.imwrite(filename, normalized_img)
归一化的类型，可以有以下的取值：
NORM_MINMAX:数组的数值被平移或缩放到一个指定的范围，线性归一化，一般较常用。
NORM_INF:此类型的定义没有查到，根据OpenCV 1的对应项，可能是归一化数组的C-范数(绝对值的最大值)
NORM_L1 : 归一化数组的L1-范数(绝对值的和)
NORM_L2: 归一化数组的(欧几里德)L2-范数
字符识别
字符图片 宽6个像素，高10个像素，理论上可以最简单粗暴地定义出60个特征：60个像素点上面的像素值。但是显然这样高维度必然会造成过大的计算量，可以适当的降维。比如：
每行上黑色像素的个数，可以得到10个特征
每列上黑色像素的个数，可以得到6个特征
from sklearn.neighbors import KNeighborsClassifier
import os
from sklearn import preprocessing
import cv2
import numpy as np
import warnings

# Silence DeprecationWarning messages raised from sklearn modules.
warnings.filterwarnings(module='sklearn*', action='ignore',
                        category=DeprecationWarning)
def get_feature(file_name):
    """Build the projection feature vector of a glyph image.

    The image is read as single-channel grayscale. The feature vector is
    the number of pixels equal to 0 in each row (top to bottom) followed
    by the number of such pixels in each column (left to right).

    Args:
        file_name: path of the glyph image.

    Returns:
        List of ints of length ``height + width``.

    Raises:
        IOError: if the image cannot be read.
    """
    img = cv2.imread(file_name, cv2.IMREAD_GRAYSCALE)  # single channel
    if img is None:
        # cv2.imread returns None instead of raising on failure.
        raise IOError("cannot read image: %s" % file_name)
    black = img == 0  # mask of black pixels
    per_row = np.count_nonzero(black, axis=1)
    per_col = np.count_nonzero(black, axis=0)
    return per_row.tolist() + per_col.tolist()
if __name__ == "__main__":
    # NOTE: the query sample lives inside dataset/ and is therefore also
    # part of the training data loaded below.
    test = get_feature("dataset/K/04a0844c-12f2-4344-9b78-ac1d28d746c0.png")
    category = []
    features = []
    # Every sub-folder of dataset/ is one class; its name is the label.
    for dir_name in os.listdir('dataset'):
        class_dir = 'dataset/' + dir_name
        if not os.path.isdir(class_dir):
            # Skip stray files; os.listdir would fail on them.
            continue
        for filename in os.listdir(class_dir):
            category.append(dir_name)
            features.append(get_feature(class_dir + '/' + filename))
    le = preprocessing.LabelEncoder()
    label = le.fit_transform(category)
    model = KNeighborsClassifier(n_neighbors=1)
    model.fit(features, label)
    # predict expects a 2-D array: one row per sample.
    predicted = model.predict(np.array(test).reshape(1, -1))
    print(predicted)
    print(le.inverse_transform(predicted))
这里直接使用了sklearn中的KNN方法