from keras.callbacks import TensorBoard, ModelCheckpoint
from keras.utils import multi_gpu_model  # Keras helper that replicates a model across GPUs
class ParallelModelCheckpoints(ModelCheckpoint):
    """ModelCheckpoint variant for multi-GPU (data-parallel) training.

    With `multi_gpu_model` two model objects exist: the single-GPU
    template model and the parallel wrapper actually passed to `fit`.
    Checkpoints should save the template model, so this subclass
    overrides `set_model` to always pin the model supplied at
    construction time instead of the one Keras hands the callback.
    """

    def __init__(self,
                 model,  # the single-GPU template model whose weights are saved
                 filepath='./log/epoch-{epoch:02d}_loss-{loss:.4f}_acc-{val_acc:.4f}_lr-{lr:.5f}.h5',
                 monitor='val_acc',
                 verbose=1,
                 save_best_only=True,
                 save_weights_only=False,
                 mode='auto',
                 period=1):
        self.single_model = model
        # Forward the remaining options by keyword: positional forwarding
        # silently breaks if the ModelCheckpoint signature order ever
        # differs between Keras versions.
        super(ParallelModelCheckpoints, self).__init__(
            filepath=filepath,
            monitor=monitor,
            verbose=verbose,
            save_best_only=save_best_only,
            save_weights_only=save_weights_only,
            mode=mode,
            period=period)

    def set_model(self, model):
        # Ignore the model Keras passes in (the multi-GPU wrapper) and
        # always checkpoint the single-GPU template model instead.
        super(ParallelModelCheckpoints, self).set_model(self.single_model)
# First build the original (template) model on the CPU so its weights live
# in host memory and can be shared by every GPU replica.
# NOTE(review): `tf`, `opt`, `h5_path`, the generators, `epoch_list` and
# `stepDecayLR` are assumed to be defined elsewhere in the full script.
with tf.device('/cpu:0'):
    model = MobileNet(...)  # placeholder — construct the actual network here
# Wrap the template model into a data-parallel multi-GPU model.
parallel_model = multi_gpu_model(model, gpus=4)  # 4 = number of GPUs
# When resuming training, weights are loaded via `parallel_model`, not `model`.
parallel_model.load_weights(h5_path, by_name=True)
parallel_model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
model_checkpoint = ParallelModelCheckpoints(model)  # checkpoint the template model to .h5
print("Start training the model")
# Training runs on the parallel wrapper; checkpoints still save `model`.
training_history = parallel_model.fit_generator(
    train_generator,
    steps_per_epoch=step_size_train,
    validation_data=validation_generator,
    validation_steps=step_size_valid,
    epochs=epoch_list[-1],
    verbose=1,
    callbacks=[TensorBoard(log_dir='./tb'), model_checkpoint, stepDecayLR])
print("Model training finished")
Reference:
- https://blog.csdn.net/xingkongyidian/article/details/88343115