Source code: https://github.com/pierluigiferrari/ssd_keras
1. Data input and storage
object_detection_2d_data_generator.py
Change the label storage format from integer to floating point (note that this doubles the storage space):
hdf5_labels = hdf5_dataset.create_dataset(name='labels',
                                          shape=(dataset_size,),
                                          maxshape=(None,),  # needs the trailing comma: (None) alone is not a tuple
                                          dtype=h5py.special_dtype(vlen=np.float64))  # np.float is deprecated; np.float64 is the same type
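As a quick sanity check (a minimal sketch; 'dataset.h5' is a hypothetical file path), the stored labels now come back as float arrays:

import h5py
import numpy as np

with h5py.File('dataset.h5', 'r') as f:
    first = np.asarray(f['labels'][0])  # one variable-length label row
print(first.dtype)  # float64, now that the vlen base dtype is float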
Adding new data fields (e.g. an angle):
In parse_xml(), modify item_dict and the related output format to include the new fields:
self.labels_output_format = labels_output_format
self.labels_format={'class_id': labels_output_format.index('class_id'),
'xmin': labels_output_format.index('xmin'),
'ymin': labels_output_format.index('ymin'),
'xmax': labels_output_format.index('xmax'),
'ymax': labels_output_format.index('ymax'),
'x1': labels_output_format.index('x1'),
'y1': labels_output_format.index('y1'),
'x2': labels_output_format.index('x2'),
'y2': labels_output_format.index('y2'),
'h': labels_output_format.index('h')
}
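For reference, a hypothetical sketch of the matching change inside the per-object loop of parse_xml(), assuming the extra values live in the same <bndbox> element of the annotation XML under tags named x1, y1, x2, y2, h (adjust to your own annotation format):

item_dict = {'class_id': class_id,
             'xmin': int(bndbox.xmin.text),
             'ymin': int(bndbox.ymin.text),
             'xmax': int(bndbox.xmax.text),
             'ymax': int(bndbox.ymax.text),
             'x1': float(bndbox.x1.text),
             'y1': float(bndbox.y1.text),
             'x2': float(bndbox.x2.text),
             'y2': float(bndbox.y2.text),
             'h': float(bndbox.h.text)}
box = []
for item in self.labels_output_format:
    box.append(item_dict[item])  # ordered according to labels_output_format
boxes.append(box)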
2. Data encoding
ssd_input_encoder.py
Add indices for the new fields:
class_id = 0
xmin = 1
ymin = 2
xmax = 3
ymax = 4
x1 = 5
y1 = 6
x2 = 7
y2 = 8
h = 9
All the data for each batch is stored in:
y_encoded = self.generate_encoding_template(batch_size=batch_size, diagnostics=False)
The statement above initializes the y_encoded template first, fixing the shape of the data. Because new fields have been added, self.generate_encoding_template(batch_size=batch_size, diagnostics=False) has to be modified as well. The modified and added code is as follows:
# Template for the five new fields (x1, y1, x2, y2, h): the two points are
# initialized to the top-left and top-right corners of the axis-aligned
# anchor (image coordinates, y pointing down), and h to the anchor height.
rotatetensor = np.zeros((batch_size, boxes_tensor.shape[1], 5))
cx = boxes_tensor[..., 0]
cy = boxes_tensor[..., 1]
w = boxes_tensor[..., 2]
h = boxes_tensor[..., 3]
rotatetensor[..., 0] = cx - w/2  # x1
rotatetensor[..., 1] = cy - h/2  # y1
rotatetensor[..., 2] = cx + w/2  # x2
rotatetensor[..., 3] = cy - h/2  # y2 (the top edge, same as y1)
rotatetensor[..., 4] = h         # height
y_encoding_template = np.concatenate((classes_tensor, boxes_tensor, rotatetensor, boxes_tensor, variances_tensor), axis=2)
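With this template, each anchor's vector along the last axis is laid out as follows; this is where the -13/-8 index arithmetic used below comes from:

# [one-hot classes | gt box: cx, cy, w, h | new fields: x1, y1, x2, y2, h |
#  anchor box: cx, cy, w, h | variances (4 values)]
# so, counting from the end:
#   [-17:-13] ground-truth box
#   [-13:-8]  new rotated-box fields
#   [-8:-4]   anchor box
#   [-4:]     variances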
Coordinate normalization code:
if self.normalize_coords:
    labels[:,[ymin,ymax]] /= self.img_height # Normalize ymin and ymax relative to the image height
    labels[:,[xmin,xmax]] /= self.img_width  # Normalize xmin and xmax relative to the image width
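The new columns presumably need the same treatment. It is not shown in the snippet above, but it is consistent with the decoding in section 3, where x1/x2 are scaled back by the image width and y1/y2/h by the image height; a sketch of the lines to add under the same if:

    labels[:,[x1,x2]] /= self.img_width     # assumption: the new x-coordinates scale with the width
    labels[:,[y1,y2,h]] /= self.img_height  # assumption: the new y-coordinates and the height scale with the height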
When matching the default (anchor) boxes to the ground-truth boxes, the ground-truth data is written into the corresponding positions; wherever indices appear, remember to adjust them to the new layout:
y_encoded[i, bipartite_matches, :-13] = labels_one_hot
y_encoded[i, bipartite_matches, -13:-8] = labels[:, [x1,y1,x2,y2,h]]
Multi-match strategy:
y_encoded[i, matches[1], :-13] = labels_one_hot[matches[0]]
for k in range(len(matches[1])):
    y_encoded[i, matches[1][k], -13:-8] = labels[matches[0][k], -5:]
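Since the five new fields occupy the last columns of labels, the Python loop can equivalently be written with numpy fancy indexing (when one anchor matches several ground-truth boxes, one write wins, just as the loop's last iteration does):

y_encoded[i, matches[1], -13:-8] = labels[matches[0], -5:]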
Compute the offsets (these are the values the network has to predict). Note that at this point cx and cy no longer correspond to the original image annotations (the data has passed through augmentation, e.g. random cropping):
if self.coords == 'centroids':
y_encoded[:,:,[-17,-16]] -= y_encoded[:,:,[-8,-7]] # cx(gt) - cx(anchor), cy(gt) - cy(anchor)
y_encoded[:,:,[-17,-16]] /= y_encoded[:,:,[-6,-5]] * y_encoded[:,:,[-4,-3]] # (cx(gt) - cx(anchor)) / w(anchor) / cx_variance, (cy(gt) - cy(anchor)) / h(anchor) / cy_variance
y_encoded[:,:,[-15,-14]] /= y_encoded[:,:,[-6,-5]] # w(gt) / w(anchor), h(gt) / h(anchor)
y_encoded[:,:,[-15,-14]] = np.log(y_encoded[:,:,[-15,-14]]) / y_encoded[:,:,[-2,-1]] # ln(w(gt) / w(anchor)) / w_variance, ln(h(gt) / h(anchor)) / h_variance (ln == natural logarithm)
# Offsets relative to the default anchor
anchorcx = y_encoded[:,:, -8]
anchorcy = y_encoded[:,:, -7]
anchorw = y_encoded[:,:, -6]
anchorh = y_encoded[:,:, -5]
anchorx1 = anchorcx - anchorw/2
anchory1 = anchorcy - anchorh/2
y_encoded[:, :, -13] -= anchorx1 # x1(gt) - x1(anchor)
y_encoded[:, :, -12] -= anchory1 # y1(gt) - y1(anchor)
y_encoded[:, :, [-13,-12]] /= y_encoded[:,:, [-6,-5]] * y_encoded[:,:,[-4,-3]] # scale by anchor w/h and the cx/cy variances
y_encoded[:, :, -11] -= (anchorcx+anchorw/2) # x2(gt) - x2(anchor)
y_encoded[:, :, -10] -= (anchorcy-anchorh/2) # y2(gt) - y2(anchor); the anchor's y2 is its top edge, same as y1
y_encoded[:, :, [-11, -10]] /= y_encoded[:, :, [-6, -5]] * y_encoded[:,:,[-4,-3]]
y_encoded[:, :, -9] /= y_encoded[:, :, -5] # h(gt) / h(anchor)
y_encoded[:, :, -9] = np.log(y_encoded[:, :, -9]) / y_encoded[:, :, -1] # ln(h(gt) / h(anchor)) / h_variance
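To see that the decoding in section 3 inverts this encoding, here is a tiny numpy round trip for the x1 and h channels (a standalone sketch with made-up numbers; var_x and var_h stand in for the cx and h variances):

import numpy as np

gt_x1, gt_h = 0.30, 0.40  # made-up ground truth (normalized coordinates)
anchor_cx, anchor_w, anchor_h = 0.35, 0.20, 0.45
var_x, var_h = 0.1, 0.2

anchor_x1 = anchor_cx - anchor_w / 2
# Encoding, mirroring the code above:
x1_off = (gt_x1 - anchor_x1) / (anchor_w * var_x)
h_off = np.log(gt_h / anchor_h) / var_h
# Decoding, mirroring section 3:
assert np.isclose(x1_off * var_x * anchor_w + anchor_x1, gt_x1)
assert np.isclose(np.exp(h_off * var_h) * anchor_h, gt_h)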
resnet_keras_ssd300.py: the localization output now has more channels, so the prediction layers must output the additional values as well:
# We predict 6 localization values for each box, hence the localization predictors have depth `n_boxes * 6`
# Output shape of the localization layers: `(batch, height, width, n_boxes * 6)`
conv4_3_norm_mbox_loc = Conv2D(n_boxes[0] * 6, (3, 3), padding='same', kernel_initializer='he_normal',
kernel_regularizer=l2(l2_reg), name='conv4_3_norm_mbox_loc')(conv4_3_norm)
fc7_mbox_loc = Conv2D(n_boxes[1] * 6, (3, 3), padding='same', kernel_initializer='he_normal',
kernel_regularizer=l2(l2_reg), name='fc7_mbox_loc')(fc7)
conv6_2_mbox_loc = Conv2D(n_boxes[2] * 6, (3, 3), padding='same', kernel_initializer='he_normal',
kernel_regularizer=l2(l2_reg), name='conv6_2_mbox_loc')(conv6_2)
conv7_2_mbox_loc = Conv2D(n_boxes[3] * 6, (3, 3), padding='same', kernel_initializer='he_normal',
kernel_regularizer=l2(l2_reg), name='conv7_2_mbox_loc')(conv7_2)
conv8_2_mbox_loc = Conv2D(n_boxes[4] * 6, (3, 3), padding='same', kernel_initializer='he_normal',
kernel_regularizer=l2(l2_reg), name='conv8_2_mbox_loc')(conv8_2)
conv9_2_mbox_loc = Conv2D(n_boxes[5] * 6, (3, 3), padding='same', kernel_initializer='he_normal',
kernel_regularizer=l2(l2_reg), name='conv9_2_mbox_loc')(conv9_2)
# Reshape the box predictions, yielding 3D tensors of shape `(batch, height * width * n_boxes, 6)`
# We want the six localization values isolated in the last axis to compute the smooth L1 loss
conv4_3_norm_mbox_loc_reshape = Reshape((-1, 6), name='conv4_3_norm_mbox_loc_reshape')(conv4_3_norm_mbox_loc)
fc7_mbox_loc_reshape = Reshape((-1, 6), name='fc7_mbox_loc_reshape')(fc7_mbox_loc)
conv6_2_mbox_loc_reshape = Reshape((-1, 6), name='conv6_2_mbox_loc_reshape')(conv6_2_mbox_loc)
conv7_2_mbox_loc_reshape = Reshape((-1, 6), name='conv7_2_mbox_loc_reshape')(conv7_2_mbox_loc)
conv8_2_mbox_loc_reshape = Reshape((-1, 6), name='conv8_2_mbox_loc_reshape')(conv8_2_mbox_loc)
conv9_2_mbox_loc_reshape = Reshape((-1, 6), name='conv9_2_mbox_loc_reshape')(conv9_2_mbox_loc)
3. Prediction decoding
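The decoding lives in the DecodeDetections layer (keras_layers/keras_layer_DecodeDetections.py in the original repo). The new offsets are inverted back into coordinates exactly as they were encoded above; myxmin, myymin, and myxmax are presumably the anchors' corner coordinates, computed from the anchor cx/cy/w/h channels in the same way as during encoding: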
x1 = y_pred[..., -13] * y_pred[..., -4] * y_pred[..., -6] + myxmin # x1_off * cx_variance * w(anchor) + x1(anchor)
y1 = y_pred[..., -12] * y_pred[..., -3] * y_pred[..., -5] + myymin # y1_off * cy_variance * h(anchor) + y1(anchor)
x2 = y_pred[..., -11] * y_pred[..., -4] * y_pred[..., -6] + myxmax # x2_off * cx_variance * w(anchor) + x2(anchor)
y2 = y_pred[..., -10] * y_pred[..., -3] * y_pred[..., -5] + myymin # the anchor's y2 is its top edge, i.e. its ymin
h = tf.exp(y_pred[...,-9] * y_pred[...,-1]) * y_pred[...,-5]       # exp(h_off * h_variance) * h(anchor)
# If the model predicts box coordinates relative to the image dimensions and they are supposed
# to be converted back to absolute coordinates, do that.
def normalized_coords():
    xmin1 = tf.expand_dims(xmin * self.tf_img_width, axis=-1)
    ymin1 = tf.expand_dims(ymin * self.tf_img_height, axis=-1)
    xmax1 = tf.expand_dims(xmax * self.tf_img_width, axis=-1)
    ymax1 = tf.expand_dims(ymax * self.tf_img_height, axis=-1)
    mx1 = tf.expand_dims(x1 * self.tf_img_width, axis=-1)
    my1 = tf.expand_dims(y1 * self.tf_img_height, axis=-1)
    mx2 = tf.expand_dims(x2 * self.tf_img_width, axis=-1)
    my2 = tf.expand_dims(y2 * self.tf_img_height, axis=-1)
    mh = tf.expand_dims(h * self.tf_img_height, axis=-1)
    return xmin1, ymin1, xmax1, ymax1, mx1, my1, mx2, my2, mh
def non_normalized_coords():
    return tf.expand_dims(xmin, axis=-1), tf.expand_dims(ymin, axis=-1), tf.expand_dims(xmax, axis=-1), tf.expand_dims(ymax, axis=-1), \
           tf.expand_dims(x1, axis=-1), tf.expand_dims(y1, axis=-1), tf.expand_dims(x2, axis=-1), tf.expand_dims(y2, axis=-1), tf.expand_dims(h, axis=-1)
xmin, ymin, xmax, ymax, x1, y1, x2, y2, h = tf.cond(self.tf_normalize_coords, normalized_coords, non_normalized_coords)
Remember to update n_classes as well as the output dimensions:
n_classes = y_pred.shape[2] - 6
....
box_coordinates = batch_item[...,-6:]
.....
def no_confident_predictions():
    return tf.constant(value=0.0, shape=(1,8))
.....
filtered_predictions = tf.reshape(tensor=filtered_single_classes, shape=(-1,8))
.....
def compute_output_shape(self, input_shape):
    batch_size, n_boxes, last_axis = input_shape
    return (batch_size, self.tf_top_k, 8) # Last axis: (class_ID, confidence, 6 box coordinates)
def filter_single_class(index):
    # From a tensor of shape (n_boxes, n_classes + 6 coordinates) extract
    # a tensor of shape (n_boxes, 1 + 1 + 6) that contains the class ID, the
    # confidence values for just one class (determined by `index`), and the
    # box coordinates.
    confidences = tf.expand_dims(batch_item[..., index], axis=-1)
    class_id = tf.fill(dims=tf.shape(confidences), value=tf.to_float(index))
    box_coordinates = batch_item[...,-6:] #**************
    single_class = tf.concat([class_id, confidences, box_coordinates], axis=-1)
    # Apply confidence thresholding with respect to the class defined by `index`.
    threshold_met = single_class[:,1] > self.tf_confidence_thresh
    single_class = tf.boolean_mask(tensor=single_class,
                                   mask=threshold_met)
    # If any boxes made the threshold, perform NMS.
    def perform_nms():
        scores = single_class[...,1]
        # `tf.image.non_max_suppression()` needs the box coordinates in the format `(ymin, xmin, ymax, xmax)`.
        # NMS runs on the axis-aligned box only; the extra fields are carried along via `tf.gather`.
        xmin = tf.expand_dims(single_class[...,-6], axis=-1) #**************
        ymin = tf.expand_dims(single_class[...,-5], axis=-1) #**************
        xmax = tf.expand_dims(single_class[...,-4], axis=-1) #**************
        ymax = tf.expand_dims(single_class[...,-3], axis=-1) #**************
        boxes = tf.concat(values=[ymin, xmin, ymax, xmax], axis=-1)
        maxima_indices = tf.image.non_max_suppression(boxes=boxes,
                                                      scores=scores,
                                                      max_output_size=self.tf_nms_max_output_size,
                                                      iou_threshold=self.iou_threshold,
                                                      name='non_maximum_suppresion')
        maxima = tf.gather(params=single_class,
                           indices=maxima_indices,
                           axis=0)
        return maxima
4. Data augmentation
data_augmentation_chain_original_ssd.py
Temporarily remove the augmentations that affect the label output (to be revisited and optimized later):
self.sequence = [
    # self.photometric_distortions,
    # self.expand,
    # self.random_crop,
    # self.random_flip,
    self.resize]
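Note that the Resize transform that is kept only rescales the four corner columns named in its labels_format, so it would leave the five new columns untouched; they have to be rescaled as well. A standalone sketch of the required arithmetic (a hypothetical helper, not part of the ssd_keras API):

import numpy as np

def rescale_labels(labels, img_height, img_width, out_height, out_width):
    """Rescale absolute label coordinates from (img_height, img_width) to
    (out_height, out_width). Column order as above:
    (class_id, xmin, ymin, xmax, ymax, x1, y1, x2, y2, h)."""
    labels = np.array(labels, dtype=np.float64)
    labels[:, [1, 3, 5, 7]] *= out_width / img_width       # x-like columns
    labels[:, [2, 4, 6, 8, 9]] *= out_height / img_height  # y-like columns and the height h
    return labels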