通過Keras搭建簡單的神經網絡,這裡以MNIST資料集為例,測試手寫數字的訓練效果,並進行一些簡單的應用。
環境
在Windows下進行的測試,主要的安裝包如下:
- tensorflow_gpu==2.2.0
- imutils==0.5.4
- opencv_python==4.5.3.56
- scikit_image==0.18.3
- scikit_learn==0.24.2
- numpy==1.21.2
- py_sudoku==1.0.1
目錄結構如下:
搭建網絡
通過Keras來搭建幾層簡單網絡,可以用TensorFlow裡內建的Keras,或者單獨安裝Keras包。使用 MNIST 資料集來訓練模型識别數字。
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
class MnistNet:
    """Small convolutional image classifier assembled with Keras ``Sequential``."""

    @staticmethod
    def build(width, height, depth, classes):
        """Assemble the (uncompiled) network and return it.

        Args:
            width: Input image width.
            height: Input image height.
            depth: Number of input channels.
            classes: Number of units in the final softmax layer.

        Returns:
            The ``Sequential`` model with every layer added.
        """
        # The input shape is ordered (height, width, depth).
        input_shape = (height, width, depth)

        layer_stack = [
            # First CONV => RELU => POOL stage.
            Conv2D(32, (5, 5), padding="same", input_shape=input_shape),
            Activation("relu"),
            MaxPooling2D(pool_size=(2, 2)),
            # Second CONV => RELU => POOL stage.
            Conv2D(32, (3, 3), padding="same"),
            Activation("relu"),
            MaxPooling2D(pool_size=(2, 2)),
            # First fully connected => RELU stage, followed by dropout.
            Flatten(),
            Dense(64),
            Activation("relu"),
            Dropout(0.5),
            # Second fully connected => RELU stage, followed by dropout.
            Dense(64),
            Activation("relu"),
            Dropout(0.5),
            # Softmax classifier.
            Dense(classes),
            Activation("softmax"),
        ]

        network = Sequential()
        for layer in layer_stack:
            network.add(layer)
        return network
訓練網絡
from mnistnet import MnistNet
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.datasets import mnist
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import classification_report
import argparse
# Training script: fits MnistNet on the Keras MNIST dataset, prints a
# per-class classification report and saves the model in HDF5 format.
import os

# Parse command-line arguments.
ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model", required=True,
                help="model output path")
args = vars(ap.parse_args())

# Make sure the output directory exists before spending time on training,
# so the final model.save() cannot fail because of a missing folder.
model_dir = os.path.dirname(args["model"])
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

# Learning rate, number of epochs and mini-batch size.
INIT_LR = 1e-3
EPOCHS = 16
Batch_Size = 160

# Fetch the MNIST dataset.
print("[LOGS] Please wait...")
((trainData, trainLabels), (testData, testLabels)) = mnist.load_data()

# Add an explicit single-channel axis: (samples, 28, 28, 1).
print(trainData.shape[0])
trainData = trainData.reshape((trainData.shape[0], 28, 28, 1))
testData = testData.reshape((testData.shape[0], 28, 28, 1))

# Scale pixel values into the range [0, 1].
trainData = trainData.astype("float32") / 255.0
testData = testData.astype("float32") / 255.0

# Convert integer labels to one-hot vectors.
le = LabelBinarizer()
trainLabels = le.fit_transform(trainLabels)
testLabels = le.transform(testLabels)

# Build and compile the model. For a two-class problem the loss would be
# "binary_crossentropy" instead.
print("[LOGS] compiling model...")
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
opt = Adam(learning_rate=INIT_LR)
model = MnistNet.build(width=28, height=28, depth=1, classes=10)
model.compile(loss="categorical_crossentropy", optimizer=opt,
              metrics=["accuracy"])

# Train, validating on the test split after every epoch.
print("[LOGS] training network...")
H = model.fit(
    trainData, trainLabels,
    validation_data=(testData, testLabels),
    batch_size=Batch_Size,
    epochs=EPOCHS,
    verbose=1)

# Evaluate the trained network on the test split.
print("[LOGS] evaluating network...")
predictions = model.predict(testData)
print(classification_report(
    testLabels.argmax(axis=1),
    predictions.argmax(axis=1),
    target_names=[str(x) for x in le.classes_]))

# Save the model to the requested path.
model.save(args["model"], save_format="h5")
使用指令行輸入來啟動訓練:
python train_classifier.py --model model/model_mnist.h5
等待訓練完成,如下圖示意:
測試效果
通過手寫一些數字0-9來進行簡單的測試。
from tensorflow.keras.models import load_model
import cv2
import imutils
from imutils.contours import sort_contours
import numpy as np
# Test script: finds handwritten digits in a photo, classifies each one with
# the trained model and draws the predicted label next to its bounding box.

# Input image and trained model locations.
imgPath = "image/test3.jpg"
model_path = "model/model_mnist.h5"
is_show = True

# Read the image. cv2.imread returns None instead of raising on failure,
# so stop here with a clear message rather than crashing in resize below.
vs_img = cv2.imread(imgPath)
if vs_img is None:
    raise SystemExit("could not load image: " + imgPath)

# Load the trained model.
model = load_model(model_path)
model.summary()

# Resize to a fixed working width.
frame = imutils.resize(vs_img, width=200)

# Grayscale -> blur -> Canny edges -> dilation.
gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
bl = cv2.GaussianBlur(gray, (5, 5), 0)
edge_canny = cv2.Canny(bl, 85, 200)
kernel = np.ones((3, 3), np.uint8)
edge_canny = cv2.dilate(edge_canny, kernel)
if is_show:
    cv2.imshow("edge_canny", edge_canny)
    cv2.waitKey(10)

# Outer contours only. The length of findContours' return tuple differs
# between OpenCV versions, hence the selection below.
items = cv2.findContours(edge_canny.copy(), cv2.RETR_EXTERNAL,
                         cv2.CHAIN_APPROX_SIMPLE)
conts = items[0] if len(items) == 2 else items[1]

# Sort left to right; sort_contours cannot unpack an empty contour list.
if len(conts) > 0:
    conts, _ = sort_contours(conts, method="left-to-right")

# Collected (28x28x1 image, bounding box) pairs for every candidate digit.
find_chars = []
for i in conts:
    (x, y, w, h) = cv2.boundingRect(i)

    # Keep only boxes whose size is plausible for a digit.
    if not ((w > 2 and w < 100) and (h > 5 and h < 100)):
        print("next ... ")
        continue

    roi = gray[y:y + h, x:x + w]

    # Otsu threshold, inverted so the result matches the
    # white-on-black convention of the MNIST training images.
    _, th = cv2.threshold(roi, 0, 255,
                          cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)

    # Scale the longer side to 28 pixels, keeping the aspect ratio.
    th_H, th_W = th.shape
    if th_H < th_W:
        th = imutils.resize(th, width=28)
    else:
        th = imutils.resize(th, height=28)

    # Pad the shorter side with black up to (about) 28 pixels.
    th_H, th_W = th.shape
    dx = int(max(0, 28 - th_W) / 2)
    dy = int(max(0, 28 - th_H) / 2)
    padding = cv2.copyMakeBorder(th, top=dy, bottom=dy, left=dx, right=dx,
                                 borderType=cv2.BORDER_CONSTANT,
                                 value=(0, 0, 0))
    # An odd gap leaves the padded side at 27 pixels; force exactly 28x28.
    padding = cv2.resize(padding, (28, 28))

    # Scale to [0, 1] and append the channel axis.
    padding = padding.astype("float32") / 255.0
    padding = np.expand_dims(padding, axis=-1)

    print((x, y, w, h))
    find_chars.append((padding, (x, y, w, h)))

# Without any candidate there is nothing to feed to the model.
if not find_chars:
    print("can not find chars ...")
    raise SystemExit(1)

boxes = [b[1] for b in find_chars]
find_chars = np.array([f[0] for f in find_chars], dtype="float32")

# Classify all candidates in one batch.
predicts = model.predict(find_chars)

# Class index -> label character.
labels = "0123456789"

for (pred, (x, y, w, h)) in zip(predicts, boxes):
    # Most probable class for this candidate.
    label = labels[np.argmax(pred)]

    # Draw the bounding box and the predicted digit.
    cv2.rectangle(frame, (x, y), (x + w, y + h), (255, 0, 0), 2)
    cv2.putText(frame, label, (x - 10, y - 10),
                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

# Keep the result on screen until a key is pressed; with a 10 ms wait the
# window vanished as soon as the script ended.
cv2.imshow("result", frame)
cv2.waitKey(0)
測試效果如下所示:
簡單益智遊戲應用
拿上面訓練好的數字模型來識別數獨板中的數字,並解決數獨填空。
流程如下:
- 輸入一張待解謎的數獨圖像;
- 在圖像中找到每個數字的位置;
- 給數獨劃分網格,一般是9x9,計算得到每個格子的位置;
- 判斷格子中是否有數字,有的話就進行OCR識别;
- 用數獨算法來解謎題;
- 結果輸出顯示
識别主要代碼如下:
# Sudoku script: OCRs the digits of a sudoku photo with the trained model,
# solves the puzzle and draws the solution onto the image.
ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model", required=True,
                help="path to trained digit classifier")
ap.add_argument("-i", "--image", required=True,
                help="path to input sudoku puzzle image")
ap.add_argument("-d", "--is_show", type=int, default=-1,
                help="is show each step ")
args = vars(ap.parse_args())

# A positive --is_show value turns on extract_number's is_show option
# (the flag was parsed but never used before).
show_steps = args["is_show"] > 0

# Load the trained digit classifier.
print("loading digit classifier...")
model = load_model(args["model"])

# Read the puzzle image. cv2.imread returns None on failure, so stop here
# instead of crashing inside resize.
print("processing image...")
image = cv2.imread(args["image"])
if image is None:
    print("could not load image ...")
    raise SystemExit(1)

# Resize, keep a colour copy for drawing, and work on the grayscale image.
image = imutils.resize(image, width=400)
src = image.copy()
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# 9x9 board; 0 marks an empty cell.
board_9 = np.zeros((9, 9), dtype="int")

# Size of a single cell along x and y. The image is split into an even 9x9
# grid, i.e. it is assumed to contain exactly the board.
stepX = gray.shape[1] // 9
stepY = gray.shape[0] // 9

# (startX, startY, endX, endY) of every cell, row by row.
each_loc = []
for y in range(0, 9):
    c_row = []
    for x in range(0, 9):
        # Pixel rectangle of the current cell.
        startX = x * stepX
        startY = y * stepY
        endX = (x + 1) * stepX
        endY = (y + 1) * stepY
        c_row.append((startX, startY, endX, endY))

        # Crop the cell and try to isolate a digit in it.
        grid_img = gray[startY:endY, startX:endX]
        number = extract_number(grid_img, is_show=show_steps)

        # None means no digit was found; the cell stays 0.
        if number is None:
            continue

        # Prepare the digit image as a (1, 28, 28, 1) batch in [0, 1].
        roi = cv2.resize(number, (28, 28))
        roi = roi.astype("float") / 255.0
        roi = img_to_array(roi)
        roi = np.expand_dims(roi, axis=0)

        # Store the most probable class as the cell value.
        pred = model.predict(roi).argmax(axis=1)[0]
        board_9[y, x] = pred
    each_loc.append(c_row)

# Build the puzzle from the OCR result and print it.
print("OCR sudoku board:")
makeup = Sudoku(3, 3, board=board_9.tolist())
makeup.show()

# Solve it and print the full solution.
print("solving sudoku makeup...")
solution = makeup.solve()
solution.show_full()

# Draw every solved value into its cell.
for (grid, b) in zip(each_loc, solution.board):
    for (box, n) in zip(grid, b):
        # NOTE(review): py-sudoku appears to return a board of None values
        # when no solution exists - confirm for the installed version.
        # Skip such cells instead of drawing the text "None".
        if n is None:
            continue
        startX, startY, endX, endY = box
        # Text origin: 30% in from the left edge, 25% up from the bottom.
        textX = int((endX - startX) * 0.3)
        textY = int((endY - startY) * -0.25)
        textX += startX
        textY += endY
        cv2.putText(src, str(n), (textX, textY),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 0), 2)

cv2.imshow("Results", src)
cv2.waitKey(0)

# cv2.imwrite reports failure (e.g. a missing output folder) through its
# return value only, so check it.
if not cv2.imwrite("output/res.jpg", src):
    print("could not write output/res.jpg ...")
測試結果:
綠色為識別後解出來的數字。
代碼
完整代碼:
https://github.com/ssggle/keras_mnistnet
Reference
https://keras.io/examples/
http://yann.lecun.com/exdb/mnist/