Replies: 1 comment
-
The OpenVINO documentation doesn't help very much; please check the Open Model Zoo link. Opening the original yolov2.cfg model from darknet in https://netron.app/, we can see that for a 608x608x3 input (the original layout is channel-first), the region layer takes a 19x19x425 input: a 19x19 array of cells, each cell containing 425 values. The region definition in yolov2.cfg:
This cfg entry is parsed in parser.c#L314, which calls make_region_layer in region_layer.c#L22. Ignoring the batch dimension, the input shape (425,19,19) can be reshaped as (5, (80+1+4), 19, 19), i.e. 5 anchors, each with 4 box coordinates, 1 objectness score and 80 class scores. As described in the paper, the following work must be performed to get the final box coordinates:
bx = δ(tx) + cx
by = δ(ty) + cy
bw = pw * exp(tw)
bh = ph * exp(th)
Pr(obj)*IOU((bx,by,bw,bh),obj)=δ(to)
//cx,cy comes from cell index
//pw,ph comes from cfg & anchor index
//δ is LOGISTIC
RegionYolo is not well defined
From the code of darknet & mkldnn, we can see that region-yolo is not a very well-defined layer: it only performs the most performance-demanding part of the work, the activation part (without even using the anchor data!):
tx' = δ(tx)
ty' = δ(ty)
tw' = tw
th' = th
to'=δ(to)
p1'~pc'=softmax(p1~pc) so the output size or region yolo layer dosn't change. from the code, region layer use LOGISTIC activation function to map linear output from last convolution layer into 0~1, there are 2 parts, first is int entry_index(layer l, int batch, int location, int entry)
{
// Computes a flat offset into the layer output for (batch, location, entry).
// location is split below into a group index n and a cell offset loc, using
// l.w*l.h as the size of one plane. Callers in forward_region_layer pass
// location = n*l.w*l.h, so n here recovers that anchor index and loc is 0.
int n = location / (l.w*l.h);
int loc = location % (l.w*l.h);
// Offset layout: l.outputs values per batch item, then for each n a run of
// (l.coords + l.classes + 1) planes of l.w*l.h values; entry selects the plane
// inside that run and loc selects the cell inside the plane.
return batch*l.outputs + n*l.w*l.h*(l.coords+l.classes+1) + entry*l.w*l.h + loc;
}
// Forward pass of the region layer (activation part only, as quoted here).
// Copies net.input into l.output, then, when GPU is not defined, applies the
// LOGISTIC activation in place to selected planes of every anchor and runs
// softmax_cpu over the class scores when l.softmax or l.softmax_tree is set.
// NOTE(review): this excerpt stops before the closing brace of the function;
// j and t are declared but not used in the visible part.
void forward_region_layer(const layer l, network net)
{
int i,j,b,t,n;
// Start from an unmodified copy of the input; activations below overwrite it in place.
memcpy(l.output, net.input, l.outputs*l.batch*sizeof(float));
#ifndef GPU
// One pass per batch item b and per anchor n (l.n anchors).
for (b = 0; b < l.batch; ++b){
for(n = 0; n < l.n; ++n){
//-------------------------- δ(tx),δ(ty)
// Entry 0 is the first plane of anchor n; 2*l.w*l.h values cover planes 0 and 1.
int index = entry_index(l, b, n*l.w*l.h, 0);
activate_array(l.output + index, 2*l.w*l.h, LOGISTIC);
//------δ(to)
// Plane l.coords follows the coordinate planes; it is skipped when l.background is set.
index = entry_index(l, b, n*l.w*l.h, l.coords);
if(!l.background) activate_array(l.output + index, l.w*l.h, LOGISTIC);
// The l.classes planes after it get LOGISTIC only when no softmax variant is enabled.
index = entry_index(l, b, n*l.w*l.h, l.coords + 1);
if(!l.softmax && !l.softmax_tree) activate_array(l.output + index, l.classes*l.w*l.h, LOGISTIC); //---δ(p)
}
}
if (l.softmax_tree){
// This inner i shadows the i declared at the top of the function.
int i;
// count starts just past the l.coords + 1 leading entries and advances by each group size.
int count = l.coords + 1;
for (i = 0; i < l.softmax_tree->groups; ++i) {
int group_size = l.softmax_tree->group_size[i];
// One softmax_cpu call per tree group, reading net.input and writing l.output.
softmax_cpu(net.input + count, group_size, l.batch, l.inputs, l.n*l.w*l.h, 1, l.n*l.w*l.h, l.temperature, l.output + count);
count += group_size;
}
} else if (l.softmax){
//---------------- softmax is applied across all 80 probabilities at same cell location
// Start offset is entry l.coords + 1, or l.coords when l.background is set.
// The call covers l.classes + l.background values for l.batch*l.n groups.
// NOTE(review): the l.w*l.h arguments look like the stride between class planes;
// confirm against the softmax_cpu signature.
int index = entry_index(l, 0, 0, l.coords + !l.background);
softmax_cpu(net.input + index, l.classes + l.background, l.batch*l.n, l.inputs/l.n, l.w*l.h, 1, l.w*l.h, 1, l.output + index);
}
#endif Comparing to OpenVINO MKLDNN plugin: void MKLDNNRegionYoloNode::execute(mkldnn::stream strm) {
// Runs the RegionYolo activations: converts the input into the output buffer,
// applies the logistic function in place to selected planes of every anchor,
// and, when do_softmax is set, runs the softmax kernel over the class scores.
// NOTE(review): mask, num, classes, coords, do_softmax, input_prec, output_prec
// and softmax_kernel are not declared in this excerpt; presumably node members.
const auto &inShape = getParentEdgeAt(0)->getMemory().GetShape();
const auto &inDims = inShape.getStaticDims();
// Input dimensions are read in B, IC, IH, IW order; a missing dimension counts as 1.
size_t B = (inShape.getRank() > 0) ? inDims[0] : 1;
size_t IC = (inShape.getRank() > 1) ? inDims[1] : 1;
size_t IH = (inShape.getRank() > 2) ? inDims[2] : 1;
size_t IW = (inShape.getRank() > 3) ? inDims[3] : 1;
size_t mask_size = mask.size();
// end_index: how many values after the coords planes get the logistic function.
// num_: number of anchors processed per batch item.
int end_index = 0;
int num_ = 0;
int output_size = 0;
if (do_softmax) {
// Region layer (Yolo v2)
// Only one IW*IH plane gets the logistic function here; the class scores
// are handled by the softmax step at the end.
end_index = IW * IH;
num_ = num;
output_size = B * IH * IW * IC; // different shape combinations with the same overall size;
} else {
// Yolo layer (Yolo v3)
// Logistic covers classes + 1 planes, and only mask_size anchors are used.
end_index = IW * IH * (classes + 1);
num_ = mask_size;
output_size = B * IH * IW * mask_size * (classes + coords + 1);
}
// Reject configurations whose computed size disagrees with the output memory.
if (output_size != getChildEdgeAt(0)->getMemoryPtr()->GetShape().getElementsCount())
IE_THROW() << "Incorrect layer configuration or output dimensions. " << output_size << " != "
<< getChildEdgeAt(0)->getMemoryPtr()->GetShape().getElementsCount();
// inputs_size: element count of one batch item; total_size: the first two planes of an anchor.
size_t inputs_size = IH * IW * num_ * (classes + coords + 1);
size_t total_size = 2 * IH * IW;
// Byte pointers are used, so offsets passed to the softmax kernel below are
// scaled by the precision size.
const auto *src_data = reinterpret_cast<const uint8_t *>(getParentEdgeAt(0)->getMemoryPtr()->GetPtr());
auto *dst_data = reinterpret_cast<uint8_t *>(getChildEdgeAt(0)->getMemoryPtr()->GetPtr());
// Fill the output with output_size elements from the input, passing both precisions.
// NOTE(review): presumably a precision-converting copy; confirm against cpu_convert.
cpu_convert(src_data, dst_data, getParentEdgeAt(0)->getMemory().getDesc().getPrecision(),
getChildEdgeAt(0)->getMemory().getDesc().getPrecision(), output_size);
// NOTE(review): b is int while B is size_t, so the comparison mixes signedness.
for (int b = 0; b < B; b++) {
for (int n = 0; n < num_; n++) {
// First element of anchor n in batch item b: logistic over total_size values.
size_t index = b * inputs_size + n * IW * IH * (classes + coords + 1);
calculate_logistic(index, total_size, dst_data);
// Skip the coords planes of the same anchor: logistic over end_index values.
index = b * inputs_size + IW * IH * (n * (classes + coords + 1) + coords);
calculate_logistic(index, end_index, dst_data);
}
}
if (do_softmax) {
// Class scores start after coords + 1 planes; batch_offset is the size of one anchor.
int index = IW * IH * (coords + 1);
int batch_offset = inputs_size / num;
// One kernel call per (batch item, anchor) pair, reading src_data and writing dst_data.
for (int b = 0; b < B * num; b++) {
softmax_kernel->execute(src_data + input_prec.size() * (index + b * batch_offset),
dst_data + output_prec.size() * (index + b * batch_offset), 1, classes, IH, IW);
}
}
} |
Beta Was this translation helpful? Give feedback.
-
What is RegionYolo in openvino?
Beta Was this translation helpful? Give feedback.
All reactions