35 files changed, 9800 insertions, 0 deletions
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
new file mode 100644
index 0000000..a7739be
--- /dev/null
+++ b/examples/CMakeLists.txt
@@ -0,0 +1,78 @@
+macro(ncnn_add_example name) # builds ${name}.cpp into an executable target called ${name}
+    add_executable(${name} ${name}.cpp)
+    if(OpenCV_FOUND)
+        target_include_directories(${name} PRIVATE ${OpenCV_INCLUDE_DIRS})
+        target_link_libraries(${name} PRIVATE ncnn ${OpenCV_LIBS})
+    elseif(NCNN_SIMPLEOCV)
+        target_compile_definitions(${name} PRIVATE USE_NCNN_SIMPLEOCV) # selects the simpleocv.h include path in the sources
+        target_link_libraries(${name} PRIVATE ncnn)
+    endif()
+
+    # add example to a virtual project group
+    set_property(TARGET ${name} PROPERTY FOLDER "examples")
+endmacro()
+
+if(NCNN_PIXEL) # without NCNN_PIXEL no example is built (see the warning in the else branch)
+    if(NOT NCNN_SIMPLEOCV) # try opencv_world first, then the module list with and without imgcodecs/videoio
+        find_package(OpenCV QUIET COMPONENTS opencv_world)
+        # for opencv 2.4 on ubuntu 16.04, there is no opencv_world but OpenCV_FOUND will be TRUE
+        if("${OpenCV_LIBS}" STREQUAL "")
+            set(OpenCV_FOUND FALSE)
+        endif()
+        if(NOT OpenCV_FOUND)
+            find_package(OpenCV QUIET COMPONENTS core highgui imgproc imgcodecs videoio)
+        endif()
+        if(NOT OpenCV_FOUND)
+            find_package(OpenCV QUIET COMPONENTS core highgui imgproc)
+        endif()
+    endif()
+
+    if(OpenCV_FOUND OR NCNN_SIMPLEOCV)
+        if(OpenCV_FOUND)
+            message(STATUS "OpenCV library: ${OpenCV_INSTALL_PATH}")
+            message(STATUS "    version: ${OpenCV_VERSION}")
+            message(STATUS "    libraries: ${OpenCV_LIBS}")
+            message(STATUS "    include path: ${OpenCV_INCLUDE_DIRS}")
+            # name the variable instead of expanding it: an empty value would otherwise leave if() malformed
+            if(OpenCV_VERSION_MAJOR GREATER 3)
+                set(CMAKE_CXX_STANDARD 11)
+            endif()
+        endif()
+
+        include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../src)
+        include_directories(${CMAKE_CURRENT_BINARY_DIR}/../src)
+
+        ncnn_add_example(squeezenet)
+        ncnn_add_example(squeezenet_c_api)
+        ncnn_add_example(fasterrcnn)
+        ncnn_add_example(rfcn)
+        ncnn_add_example(yolov2)
+        ncnn_add_example(yolov3)
+        ncnn_add_example(yolov5)
+        ncnn_add_example(yolov5_pnnx)
+        ncnn_add_example(yolov7_pnnx)
+        ncnn_add_example(yolov7)
+        ncnn_add_example(yolox)
+        ncnn_add_example(mobilenetv2ssdlite)
+        ncnn_add_example(mobilenetssd)
+        ncnn_add_example(squeezenetssd)
+        ncnn_add_example(shufflenetv2)
+        ncnn_add_example(peleenetssd_seg)
+        ncnn_add_example(simplepose)
+        ncnn_add_example(retinaface)
+        ncnn_add_example(yolact)
+        ncnn_add_example(nanodet)
+        ncnn_add_example(nanodetplus_pnnx)
+        ncnn_add_example(scrfd)
+        ncnn_add_example(scrfd_crowdhuman)
+        if(OpenCV_FOUND) # these three are built only against a real OpenCV, not with NCNN_SIMPLEOCV alone
+            ncnn_add_example(yolov4)
+            ncnn_add_example(rvm)
+            ncnn_add_example(p2pnet)
+        endif()
+    else()
+        message(WARNING "OpenCV not found and NCNN_SIMPLEOCV disabled, examples won't be built")
+    endif()
+else()
+    message(WARNING "NCNN_PIXEL not enabled, examples won't be built")
+endif()
diff --git a/examples/fasterrcnn.cpp b/examples/fasterrcnn.cpp
new file mode 100644
index 0000000..48aa106
--- /dev/null
+++ b/examples/fasterrcnn.cpp
@@ -0,0 +1,363 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#include <math.h>
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdio.h>
+
+struct Object // one detection produced by detect_fasterrcnn
+{
+    cv::Rect_<float> rect; // box in source-image pixels (rois are divided by the resize scale and clipped to the image)
+    int label;             // index of the best-scoring class; used to index class_names in draw_objects
+    float prob;            // score of that class; sort key of qsort_descent_inplace
+};
+
+static inline float intersection_area(const Object& a, const Object& b)
+{
+    // area of the rectangle produced by cv::Rect_'s operator& on the two boxes
+    return (a.rect & b.rect).area();
+}
+
+static void qsort_descent_inplace(std::vector<Object>& objects, int left, int right) // sorts objects[left..right] (both inclusive) by prob, highest first
+{
+    int i = left;
+    int j = right;
+    float p = objects[(left + right) / 2].prob;
+    // partition around the middle element's score p: higher scores move left, lower scores move right
+    while (i <= j)
+    {
+        while (objects[i].prob > p)
+            i++;
+
+        while (objects[j].prob < p)
+            j--;
+
+        if (i <= j)
+        {
+            // swap the pair that sits on the wrong sides of the pivot
+            std::swap(objects[i], objects[j]);
+
+            i++;
+            j--;
+        }
+    }
+    // recurse into both partitions; each call is its own OpenMP section
+    #pragma omp parallel sections
+    {
+        #pragma omp section
+        {
+            if (left < j) qsort_descent_inplace(objects, left, j);
+        }
+        #pragma omp section
+        {
+            if (i < right) qsort_descent_inplace(objects, i, right);
+        }
+    }
+}
+
+static void qsort_descent_inplace(std::vector<Object>& objects)
+{
+    // Sort the whole vector by descending prob. An empty vector is left untouched:
+    // it has no valid [0, size() - 1] index range to pass to the recursive overload.
+    if (!objects.empty())
+        qsort_descent_inplace(objects, 0, objects.size() - 1);
+}
+
+static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
+{
+    // Greedy NMS in index order (callers sort by descending prob first); picked receives the indices that are kept.
+    picked.clear();
+    const int n = faceobjects.size();
+
+    std::vector<float> areas(n);
+    for (int i = 0; i < n; i++)
+    {
+        areas[i] = faceobjects[i].rect.area();
+    }
+
+    for (int i = 0; i < n; i++)
+    {
+        const Object& a = faceobjects[i];
+        // keep drops to 0 at the first suppressing box, which also ends the scan of picked
+        int keep = 1;
+        for (int j = 0; keep && j < (int)picked.size(); j++)
+        {
+            const Object& b = faceobjects[picked[j]];
+            // unless agnostic is set, only boxes with the same label can suppress each other
+            if (!agnostic && a.label != b.label)
+                continue;
+
+            // intersection over union
+            float inter_area = intersection_area(a, b);
+            float union_area = areas[i] + areas[picked[j]] - inter_area;
+            // IoU = inter_area / union_area; overlapping a kept box by more than nms_threshold drops this box
+            if (inter_area / union_area > nms_threshold)
+                keep = 0;
+        }
+
+        if (keep)
+            picked.push_back(i);
+    }
+}
+
+static int detect_fasterrcnn(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    // Fills objects with at most max_per_image detections, best score first; returns 0, exits if a model file fails to load.
+    ncnn::Net fasterrcnn;
+    fasterrcnn.opt.use_vulkan_compute = true;
+
+    // original pretrained model from https://github.com/rbgirshick/py-faster-rcnn
+    // py-faster-rcnn/models/pascal_voc/ZF/faster_rcnn_alt_opt/faster_rcnn_test.pt
+    // https://dl.dropboxusercontent.com/s/o6ii098bu51d139/faster_rcnn_models.tgz?dl=0
+    // ZF_faster_rcnn_final.caffemodel
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+    if (fasterrcnn.load_param("ZF_faster_rcnn_final.param"))
+        exit(-1);
+    if (fasterrcnn.load_model("ZF_faster_rcnn_final.bin"))
+        exit(-1);
+
+    // hyper parameters taken from
+    // py-faster-rcnn/lib/fast_rcnn/config.py
+    // py-faster-rcnn/lib/fast_rcnn/test.py
+    const int target_size = 600; // __C.TEST.SCALES
+
+    const int max_per_image = 100;
+    const float confidence_thresh = 0.05f;
+
+    const float nms_threshold = 0.3f; // __C.TEST.NMS
+
+    // scale so that the shorter image side becomes target_size
+    int w = bgr.cols;
+    int h = bgr.rows;
+    float scale = 1.f;
+    if (w < h)
+    {
+        scale = (float)target_size / w;
+        w = target_size;
+        h = h * scale;
+    }
+    else
+    {
+        scale = (float)target_size / h;
+        h = target_size;
+        w = w * scale;
+    }
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows, w, h);
+
+    const float mean_vals[3] = {102.9801f, 115.9465f, 122.7717f};
+    in.substract_mean_normalize(mean_vals, 0);
+    // im_info carries the resized height, the resized width and the resize scale
+    ncnn::Mat im_info(3);
+    im_info[0] = h;
+    im_info[1] = w;
+    im_info[2] = scale;
+
+    // step1, extract feature and all rois
+    ncnn::Extractor ex1 = fasterrcnn.create_extractor();
+
+    ex1.input("data", in);
+    ex1.input("im_info", im_info);
+
+    ncnn::Mat conv5_relu5; // feature
+    ncnn::Mat rois;        // all rois
+    ex1.extract("conv5_relu5", conv5_relu5);
+    ex1.extract("rois", rois);
+
+    // step2, extract bbox and score for each roi
+    std::vector<std::vector<Object> > class_candidates;
+    for (int i = 0; i < rois.c; i++)
+    {
+        ncnn::Extractor ex2 = fasterrcnn.create_extractor();
+
+        ncnn::Mat roi = rois.channel(i); // get single roi
+        ex2.input("conv5_relu5", conv5_relu5);
+        ex2.input("rois", roi);
+
+        ncnn::Mat bbox_pred;
+        ncnn::Mat cls_prob;
+        ex2.extract("bbox_pred", bbox_pred);
+        ex2.extract("cls_prob", cls_prob);
+
+        int num_class = cls_prob.w;
+        class_candidates.resize(num_class);
+
+        // find class id with highest score (k keeps the roi index i unshadowed)
+        int label = 0;
+        float score = 0.f;
+        for (int k = 0; k < num_class; k++)
+        {
+            float class_score = cls_prob[k];
+            if (class_score > score)
+            {
+                label = k;
+                score = class_score;
+            }
+        }
+
+        // ignore background or low score
+        if (label == 0 || score <= confidence_thresh)
+            continue;
+
+        //         fprintf(stderr, "%d = %f\n", label, score);
+
+        // unscale to image size
+        float x1 = roi[0] / scale;
+        float y1 = roi[1] / scale;
+        float x2 = roi[2] / scale;
+        float y2 = roi[3] / scale;
+
+        float pb_w = x2 - x1 + 1;
+        float pb_h = y2 - y1 + 1;
+
+        // apply bbox regression
+        float dx = bbox_pred[label * 4];
+        float dy = bbox_pred[label * 4 + 1];
+        float dw = bbox_pred[label * 4 + 2];
+        float dh = bbox_pred[label * 4 + 3];
+
+        float cx = x1 + pb_w * 0.5f;
+        float cy = y1 + pb_h * 0.5f;
+
+        float obj_cx = cx + pb_w * dx;
+        float obj_cy = cy + pb_h * dy;
+
+        float obj_w = pb_w * exp(dw);
+        float obj_h = pb_h * exp(dh);
+
+        float obj_x1 = obj_cx - obj_w * 0.5f;
+        float obj_y1 = obj_cy - obj_h * 0.5f;
+        float obj_x2 = obj_cx + obj_w * 0.5f;
+        float obj_y2 = obj_cy + obj_h * 0.5f;
+
+        // clip
+        obj_x1 = std::max(std::min(obj_x1, (float)(bgr.cols - 1)), 0.f);
+        obj_y1 = std::max(std::min(obj_y1, (float)(bgr.rows - 1)), 0.f);
+        obj_x2 = std::max(std::min(obj_x2, (float)(bgr.cols - 1)), 0.f);
+        obj_y2 = std::max(std::min(obj_y2, (float)(bgr.rows - 1)), 0.f);
+
+        // append object
+        Object obj;
+        obj.rect = cv::Rect_<float>(obj_x1, obj_y1, obj_x2 - obj_x1 + 1, obj_y2 - obj_y1 + 1);
+        obj.label = label;
+        obj.prob = score;
+
+        class_candidates[label].push_back(obj);
+    }
+
+    // post process: per-class sort and NMS, then one global sort
+    objects.clear();
+    for (int i = 0; i < (int)class_candidates.size(); i++)
+    {
+        std::vector<Object>& candidates = class_candidates[i];
+
+        qsort_descent_inplace(candidates);
+
+        std::vector<int> picked;
+        nms_sorted_bboxes(candidates, picked, nms_threshold);
+
+        for (int j = 0; j < (int)picked.size(); j++)
+        {
+            int z = picked[j];
+            objects.push_back(candidates[z]);
+        }
+    }
+
+    qsort_descent_inplace(objects);
+    // keep only the max_per_image best detections (cast keeps the comparison signed)
+    if (max_per_image > 0 && max_per_image < (int)objects.size())
+    {
+        objects.resize(max_per_image);
+    }
+
+    return 0;
+}
+
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    // Logs every detection to stderr, draws its box and "<class> <score>%" caption on a copy of bgr, then shows the copy.
+    static const char* class_names[] = {"background",
+                                        "aeroplane", "bicycle", "bird", "boat",
+                                        "bottle", "bus", "car", "cat", "chair",
+                                        "cow", "diningtable", "dog", "horse",
+                                        "motorbike", "person", "pottedplant",
+                                        "sheep", "sofa", "train", "tvmonitor"
+                                       };
+    cv::Mat image = bgr.clone();
+
+    for (size_t i = 0; i < objects.size(); i++)
+    {
+        const Object& obj = objects[i];
+
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+
+        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));
+        // bounded write: the caption can never overrun text
+        char text[256];
+        snprintf(text, sizeof(text), "%s %.1f%%", class_names[obj.label], obj.prob * 100);
+
+        int baseLine = 0;
+        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+        // caption goes above the box, pulled back inside the image at the top and right edges
+        int x = obj.rect.x;
+        int y = obj.rect.y - label_size.height - baseLine;
+        if (y < 0)
+            y = 0;
+        if (x + label_size.width > image.cols)
+            x = image.cols - label_size.width;
+
+        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                      cv::Scalar(255, 255, 255), -1);
+
+        cv::putText(image, text, cv::Point(x, y + label_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+int main(int argc, char** argv)
+{
+    // exactly one argument is expected: the path of the image to run detection on
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+    const char* image_path = argv[1];
+
+    // load the image with imread flag 1 and stop if nothing could be read
+    cv::Mat bgr = cv::imread(image_path, 1);
+    if (bgr.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", image_path);
+        return -1;
+    }
+
+    std::vector<Object> detections;
+    detect_fasterrcnn(bgr, detections);
+    draw_objects(bgr, detections);
+
+    return 0;
+}
diff --git a/examples/mobilenetssd.cpp b/examples/mobilenetssd.cpp
new file mode 100644
index 0000000..59ea209
--- /dev/null
+++ b/examples/mobilenetssd.cpp
@@ -0,0 +1,154 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdio.h>
+#include <vector>
+
+struct Object // one detection produced by detect_mobilenet
+{
+    cv::Rect_<float> rect; // box in source-image pixels (network outputs scaled by the image width and height)
+    int label;             // class id read from the first value of an output row; indexes class_names in draw_objects
+    float prob;            // score read from the second value of an output row
+};
+
+static int detect_mobilenet(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    // Runs MobileNet-SSD on bgr and replaces the contents of objects with one entry per output row.
+    ncnn::Net net;
+    net.opt.use_vulkan_compute = true;
+
+    // model is converted from https://github.com/chuanqi305/MobileNet-SSD
+    // and can be downloaded from https://drive.google.com/open?id=0ByaKLD9QaPtucWk0Y0dha1VVY0U
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+    if (net.load_param("mobilenet_ssd_voc_ncnn.param") != 0)
+        exit(-1);
+    if (net.load_model("mobilenet_ssd_voc_ncnn.bin") != 0)
+        exit(-1);
+
+    // the network sees a target_size x target_size image; boxes are scaled back to the source size below
+    const int target_size = 300;
+    const int img_w = bgr.cols;
+    const int img_h = bgr.rows;
+
+    ncnn::Mat input = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, img_w, img_h, target_size, target_size);
+
+    // per-channel mean and scale handed to substract_mean_normalize
+    const float mean_vals[3] = {127.5f, 127.5f, 127.5f};
+    const float norm_vals[3] = {1.0 / 127.5, 1.0 / 127.5, 1.0 / 127.5};
+    input.substract_mean_normalize(mean_vals, norm_vals);
+
+    ncnn::Extractor extractor = net.create_extractor();
+    extractor.input("data", input);
+
+    ncnn::Mat detections;
+    extractor.extract("detection_out", detections);
+
+    //     printf("%d %d %d\n", detections.w, detections.h, detections.c);
+    // each output row is read as: label, score, then two box corners that get multiplied by the image size
+    objects.clear();
+    for (int row = 0; row < detections.h; row++)
+    {
+        const float* det = detections.row(row);
+        const float x1 = det[2] * img_w;
+        const float y1 = det[3] * img_h;
+
+        Object obj;
+        obj.label = (int)det[0];
+        obj.prob = det[1];
+        obj.rect = cv::Rect_<float>(x1, y1, det[4] * img_w - x1, det[5] * img_h - y1);
+
+        objects.push_back(obj);
+    }
+
+    return 0;
+}
+
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    // Logs every detection to stderr, draws its box and "<class> <score>%" caption on a copy of bgr, then shows the copy.
+    static const char* class_names[] = {"background",
+                                        "aeroplane", "bicycle", "bird", "boat",
+                                        "bottle", "bus", "car", "cat", "chair",
+                                        "cow", "diningtable", "dog", "horse",
+                                        "motorbike", "person", "pottedplant",
+                                        "sheep", "sofa", "train", "tvmonitor"
+                                       };
+    cv::Mat image = bgr.clone();
+
+    for (size_t i = 0; i < objects.size(); i++)
+    {
+        const Object& obj = objects[i];
+
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+
+        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));
+        // bounded write: the caption can never overrun text
+        char text[256];
+        snprintf(text, sizeof(text), "%s %.1f%%", class_names[obj.label], obj.prob * 100);
+
+        int baseLine = 0;
+        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+        // caption goes above the box, pulled back inside the image at the top and right edges
+        int x = obj.rect.x;
+        int y = obj.rect.y - label_size.height - baseLine;
+        if (y < 0)
+            y = 0;
+        if (x + label_size.width > image.cols)
+            x = image.cols - label_size.width;
+
+        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                      cv::Scalar(255, 255, 255), -1);
+
+        cv::putText(image, text, cv::Point(x, y + label_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+int main(int argc, char** argv)
+{
+    // exactly one argument is expected: the path of the image to run detection on
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+    const char* image_path = argv[1];
+
+    // load the image with imread flag 1 and stop if nothing could be read
+    cv::Mat bgr = cv::imread(image_path, 1);
+    if (bgr.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", image_path);
+        return -1;
+    }
+
+    std::vector<Object> detections;
+    detect_mobilenet(bgr, detections);
+    draw_objects(bgr, detections);
+
+    return 0;
+}
diff --git a/examples/mobilenetv2ssdlite.cpp b/examples/mobilenetv2ssdlite.cpp
new file mode 100644
index 0000000..e1650e1
--- /dev/null
+++ b/examples/mobilenetv2ssdlite.cpp
@@ -0,0 +1,161 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdio.h>
+#include <vector>
+
+class Noop : public ncnn::Layer // Layer subclass that adds no members or overrides of its own
+{
+};
+DEFINE_LAYER_CREATOR(Noop) // presumably defines Noop_layer_creator, which detect_mobilenetv2 registers as "Silence" -- confirm in ncnn's layer.h
+
+struct Object // one detection produced by detect_mobilenetv2
+{
+    cv::Rect_<float> rect; // box in source-image pixels (network outputs scaled by the image width and height)
+    int label;             // class id read from the first value of an output row; indexes class_names in draw_objects
+    float prob;            // score read from the second value of an output row
+};
+
+static int detect_mobilenetv2(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    // Runs MobileNetV2-SSDLite on bgr and replaces the contents of objects with one entry per output row.
+    ncnn::Net net;
+    net.opt.use_vulkan_compute = true;
+    // layers of type "Silence" in the model are created through the Noop creator
+    net.register_custom_layer("Silence", Noop_layer_creator);
+
+    // original pretrained model from https://github.com/chuanqi305/MobileNetv2-SSDLite
+    // https://github.com/chuanqi305/MobileNetv2-SSDLite/blob/master/ssdlite/voc/deploy.prototxt
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+    if (net.load_param("mobilenetv2_ssdlite_voc.param") != 0)
+        exit(-1);
+    if (net.load_model("mobilenetv2_ssdlite_voc.bin") != 0)
+        exit(-1);
+
+    // the network sees a target_size x target_size image; boxes are scaled back to the source size below
+    const int target_size = 300;
+    const int img_w = bgr.cols;
+    const int img_h = bgr.rows;
+
+    ncnn::Mat input = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, img_w, img_h, target_size, target_size);
+
+    // per-channel mean and scale handed to substract_mean_normalize
+    const float mean_vals[3] = {127.5f, 127.5f, 127.5f};
+    const float norm_vals[3] = {1.0 / 127.5, 1.0 / 127.5, 1.0 / 127.5};
+    input.substract_mean_normalize(mean_vals, norm_vals);
+
+    ncnn::Extractor extractor = net.create_extractor();
+    extractor.input("data", input);
+
+    ncnn::Mat detections;
+    extractor.extract("detection_out", detections);
+
+    //     printf("%d %d %d\n", detections.w, detections.h, detections.c);
+    // each output row is read as: label, score, then two box corners that get multiplied by the image size
+    objects.clear();
+    for (int row = 0; row < detections.h; row++)
+    {
+        const float* det = detections.row(row);
+        const float x1 = det[2] * img_w;
+        const float y1 = det[3] * img_h;
+
+        Object obj;
+        obj.label = (int)det[0];
+        obj.prob = det[1];
+        obj.rect = cv::Rect_<float>(x1, y1, det[4] * img_w - x1, det[5] * img_h - y1);
+
+        objects.push_back(obj);
+    }
+
+    return 0;
+}
+
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    // Logs every detection to stderr, draws its box and "<class> <score>%" caption on a copy of bgr, then shows the copy.
+    static const char* class_names[] = {"background",
+                                        "aeroplane", "bicycle", "bird", "boat",
+                                        "bottle", "bus", "car", "cat", "chair",
+                                        "cow", "diningtable", "dog", "horse",
+                                        "motorbike", "person", "pottedplant",
+                                        "sheep", "sofa", "train", "tvmonitor"
+                                       };
+    cv::Mat image = bgr.clone();
+
+    for (size_t i = 0; i < objects.size(); i++)
+    {
+        const Object& obj = objects[i];
+
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+
+        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));
+        // bounded write: the caption can never overrun text
+        char text[256];
+        snprintf(text, sizeof(text), "%s %.1f%%", class_names[obj.label], obj.prob * 100);
+
+        int baseLine = 0;
+        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+        // caption goes above the box, pulled back inside the image at the top and right edges
+        int x = obj.rect.x;
+        int y = obj.rect.y - label_size.height - baseLine;
+        if (y < 0)
+            y = 0;
+        if (x + label_size.width > image.cols)
+            x = image.cols - label_size.width;
+
+        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                      cv::Scalar(255, 255, 255), -1);
+
+        cv::putText(image, text, cv::Point(x, y + label_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+int main(int argc, char** argv)
+{
+    // exactly one argument is expected: the path of the image to run detection on
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+    const char* image_path = argv[1];
+
+    // load the image with imread flag 1 and stop if nothing could be read
+    cv::Mat bgr = cv::imread(image_path, 1);
+    if (bgr.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", image_path);
+        return -1;
+    }
+
+    std::vector<Object> detections;
+    detect_mobilenetv2(bgr, detections);
+    draw_objects(bgr, detections);
+
+    return 0;
+}
diff --git a/examples/mobilenetv3ssdlite.cpp b/examples/mobilenetv3ssdlite.cpp
new file mode 100644
index 0000000..724e501
--- /dev/null
+++ b/examples/mobilenetv3ssdlite.cpp
@@ -0,0 +1,175 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+#include "platform.h"
+#include <assert.h>
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdio.h>
+#include <vector>
+#if NCNN_VULKAN
+#include "gpu.h"
+#endif // NCNN_VULKAN
+
+template<class T>
+const T& clamp(const T& v, const T& lo, const T& hi)
+{
+    assert(!(hi < lo)); // the bounds must not be inverted
+    return (v < lo) ? lo : ((hi < v) ? hi : v); // a reference to lo, hi or v itself; nothing is copied
+}
+
+struct Object // one detection produced by detect_mobilenetv3
+{
+    cv::Rect_<float> rect; // box in source-image pixels (clamped network outputs rescaled by the image width and height)
+    int label;             // class id read from the first value of an output row; indexes class_names in draw_objects
+    float prob;            // score read from the second value of an output row; draw_objects shows only prob > 0.6
+};
+
+static int detect_mobilenetv3(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    // Runs MobileNetV3-SSDLite on bgr and replaces the contents of objects with one entry per output row.
+    ncnn::Net net;
+
+#if NCNN_VULKAN
+    net.opt.use_vulkan_compute = true;
+#endif // NCNN_VULKAN
+
+    // converted ncnn model from https://github.com/ujsyehao/mobilenetv3-ssd
+    if (net.load_param("./mobilenetv3_ssdlite_voc.param") != 0)
+        exit(-1);
+    if (net.load_model("./mobilenetv3_ssdlite_voc.bin") != 0)
+        exit(-1);
+
+    // the network sees a target_size x target_size image; boxes are mapped back to the source size below
+    const int target_size = 300;
+    const int img_w = bgr.cols;
+    const int img_h = bgr.rows;
+    const float max_coord = float(target_size - 1);
+
+    ncnn::Mat input = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, target_size, target_size);
+
+    // per-channel mean and scale handed to substract_mean_normalize
+    const float mean_vals[3] = {123.675f, 116.28f, 103.53f};
+    const float norm_vals[3] = {1.0f, 1.0f, 1.0f};
+    input.substract_mean_normalize(mean_vals, norm_vals);
+
+    ncnn::Extractor extractor = net.create_extractor();
+    extractor.input("input", input);
+
+    ncnn::Mat detections;
+    extractor.extract("detection_out", detections);
+
+    //     printf("%d %d %d\n", detections.w, detections.h, detections.c);
+    objects.clear();
+    for (int row = 0; row < detections.h; row++)
+    {
+        // row layout as read here: label, score, then two box corners
+        const float* det = detections.row(row);
+
+        // scale each corner by target_size, clamp it to [0, target_size - 1], then rescale to source-image pixels
+        const float x1 = clamp(det[2] * target_size, 0.f, max_coord) / target_size * img_w;
+        const float y1 = clamp(det[3] * target_size, 0.f, max_coord) / target_size * img_h;
+        const float x2 = clamp(det[4] * target_size, 0.f, max_coord) / target_size * img_w;
+        const float y2 = clamp(det[5] * target_size, 0.f, max_coord) / target_size * img_h;
+
+        Object obj;
+        obj.label = (int)det[0];
+        obj.prob = det[1];
+        // width and height are stored as the differences between the two corners
+        obj.rect = cv::Rect_<float>(x1, y1, x2 - x1, y2 - y1);
+
+        objects.push_back(obj);
+    }
+
+    return 0;
+}
+
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    // For every detection with prob > 0.6: logs it to stderr and draws its box and caption on a copy of bgr; then shows the copy.
+    static const char* class_names[] = {"background",
+                                        "aeroplane", "bicycle", "bird", "boat",
+                                        "bottle", "bus", "car", "cat", "chair",
+                                        "cow", "diningtable", "dog", "horse",
+                                        "motorbike", "person", "pottedplant",
+                                        "sheep", "sofa", "train", "tvmonitor"
+                                       };
+    cv::Mat image = bgr.clone();
+
+    for (size_t i = 0; i < objects.size(); i++)
+    {
+        if (objects[i].prob > 0.6)
+        {
+            const Object& obj = objects[i];
+
+            fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+                    obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+
+            cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));
+            // bounded write: the caption can never overrun text
+            char text[256];
+            snprintf(text, sizeof(text), "%s %.1f%%", class_names[obj.label], obj.prob * 100);
+
+            int baseLine = 0;
+            cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+            // caption goes above the box, pulled back inside the image at the top and right edges
+            int x = obj.rect.x;
+            int y = obj.rect.y - label_size.height - baseLine;
+            if (y < 0)
+                y = 0;
+            if (x + label_size.width > image.cols)
+                x = image.cols - label_size.width;
+
+            cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                          cv::Scalar(255, 255, 255), -1);
+
+            cv::putText(image, text, cv::Point(x, y + label_size.height),
+                        cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+        }
+    }
+
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+int main(int argc, char** argv)
+{
+    // exactly one argument is expected: the path of the image to run detection on
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+    const char* image_path = argv[1];
+
+    // load the image with imread flag 1 and stop if nothing could be read
+    cv::Mat bgr = cv::imread(image_path, 1);
+    if (bgr.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", image_path);
+        return -1;
+    }
+
+    std::vector<Object> detections;
+    detect_mobilenetv3(bgr, detections);
+    draw_objects(bgr, detections);
+
+    return 0;
+}
diff --git a/examples/nanodet.cpp b/examples/nanodet.cpp
new file mode 100644
index 0000000..2dafd90
--- /dev/null
+++ b/examples/nanodet.cpp
@@ -0,0 +1,425 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdlib.h>
+#include <float.h>
+#include <stdio.h>
+#include <vector>
+
+struct Object // one detection as produced by generate_proposals()
+{
+    cv::Rect_<float> rect; // box of the detection: x, y, width, height
+    int label;             // index of the class with the highest score
+    float prob;            // score of that class
+};
+
+static inline float intersection_area(const Object& a, const Object& b)
+{
+    // operator& on the two rects gives their common region; return its area
+    return (a.rect & b.rect).area();
+}
+
+// Sorts faceobjects[left..right] (both bounds inclusive) in place by descending prob.
+static void qsort_descent_inplace(std::vector<Object>& faceobjects, int left, int right)
+{
+    int i = left;
+    int j = right;
+    float p = faceobjects[(left + right) / 2].prob; // pivot: prob of the middle element
+    while (i <= j)
+    {
+        while (faceobjects[i].prob > p)
+            i++;
+
+        while (faceobjects[j].prob < p)
+            j--;
+
+        if (i <= j)
+        {
+            // swap the out-of-place pair, then move both cursors inwards
+            std::swap(faceobjects[i], faceobjects[j]);
+
+            i++;
+            j--;
+        }
+    }
+    // sort the two remaining sub-ranges; each recursive call is its own OpenMP section
+    #pragma omp parallel sections
+    {
+        #pragma omp section
+        {
+            if (left < j) qsort_descent_inplace(faceobjects, left, j);
+        }
+        #pragma omp section
+        {
+            if (i < right) qsort_descent_inplace(faceobjects, i, right);
+        }
+    }
+}
+
+static void qsort_descent_inplace(std::vector<Object>& faceobjects)
+{
+    // Whole-vector overload; an empty vector has no [0, size - 1] range to sort.
+    const bool has_items = !faceobjects.empty();
+    if (has_items)
+        qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1);
+}
+
+// Greedy NMS on boxes pre-sorted by descending prob; picked receives the indices of the kept boxes.
+static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
+{
+    picked.clear();
+
+    const int n = faceobjects.size();
+
+    std::vector<float> areas(n);
+    for (int i = 0; i < n; i++)
+        areas[i] = faceobjects[i].rect.area();
+
+    for (int i = 0; i < n; i++)
+    {
+        const Object& a = faceobjects[i];
+        bool keep = true;
+        for (int j = 0; j < (int)picked.size(); j++)
+        {
+            const Object& b = faceobjects[picked[j]];
+
+            if (!agnostic && a.label != b.label)
+                continue;
+
+            // intersection over union
+            float inter_area = intersection_area(a, b);
+            float union_area = areas[i] + areas[picked[j]] - inter_area;
+            if (inter_area / union_area > nms_threshold)
+            {
+                keep = false;
+                break; // one overlapping kept box is enough; skip the remaining checks
+            }
+        }
+
+        if (keep)
+            picked.push_back(i);
+    }
+}
+
+// Decodes one NanoDet output level: every grid cell whose best class score reaches prob_threshold yields one box.
+static void generate_proposals(const ncnn::Mat& cls_pred, const ncnn::Mat& dis_pred, int stride, const ncnn::Mat& in_pad, float prob_threshold, std::vector<Object>& objects)
+{
+    const int num_grid = cls_pred.h;
+
+    int num_grid_x;
+    int num_grid_y;
+    if (in_pad.w > in_pad.h)
+    {
+        num_grid_x = in_pad.w / stride;
+        num_grid_y = num_grid / num_grid_x;
+    }
+    else
+    {
+        num_grid_y = in_pad.h / stride;
+        num_grid_x = num_grid / num_grid_y;
+    }
+
+    const int num_class = cls_pred.w;
+    const int reg_max_1 = dis_pred.w / 4;
+
+    // the softmax layer is the same for every cell, so build it once instead of once per proposal
+    ncnn::Layer* softmax = ncnn::create_layer("Softmax");
+    if (!softmax)
+    {
+        fprintf(stderr, "create_layer Softmax failed\n");
+        return;
+    }
+
+    ncnn::ParamDict pd;
+    pd.set(0, 1); // axis
+    pd.set(1, 1);
+    softmax->load_param(pd);
+    ncnn::Option opt;
+    opt.num_threads = 1;
+    opt.use_packing_layout = false;
+    softmax->create_pipeline(opt);
+
+    for (int i = 0; i < num_grid_y; i++)
+    {
+        for (int j = 0; j < num_grid_x; j++)
+        {
+            const int idx = i * num_grid_x + j;
+            const float* scores = cls_pred.row(idx);
+
+            // find label with max score
+            int label = -1;
+            float score = -FLT_MAX;
+            for (int k = 0; k < num_class; k++)
+            {
+                if (scores[k] > score)
+                {
+                    label = k;
+                    score = scores[k];
+                }
+            }
+
+            if (!(score >= prob_threshold))
+                continue;
+
+            // view the cell's distances as 4 rows of reg_max_1 bins; softmax runs in place on dis_pred's data
+            ncnn::Mat bbox_pred(reg_max_1, 4, (void*)dis_pred.row(idx));
+            softmax->forward_inplace(bbox_pred, opt);
+
+            // expected bin index of each row, scaled by the stride
+            float pred_ltrb[4];
+            for (int k = 0; k < 4; k++)
+            {
+                float dis = 0.f;
+                const float* dis_after_sm = bbox_pred.row(k);
+                for (int l = 0; l < reg_max_1; l++)
+                    dis += l * dis_after_sm[l];
+                pred_ltrb[k] = dis * stride;
+            }
+
+            // the four distances are measured from the centre of the grid cell
+            float pb_cx = (j + 0.5f) * stride;
+            float pb_cy = (i + 0.5f) * stride;
+
+            float x0 = pb_cx - pred_ltrb[0];
+            float y0 = pb_cy - pred_ltrb[1];
+            float x1 = pb_cx + pred_ltrb[2];
+            float y1 = pb_cy + pred_ltrb[3];
+
+            Object obj;
+            obj.rect.x = x0;
+            obj.rect.y = y0;
+            obj.rect.width = x1 - x0;
+            obj.rect.height = y1 - y0;
+            obj.label = label;
+            obj.prob = score;
+            objects.push_back(obj);
+        }
+    }
+
+    softmax->destroy_pipeline(opt);
+    delete softmax;
+}
+
+// Runs NanoDet on bgr and fills objects with the boxes kept by NMS, in bgr's pixel coordinates; returns 0.
+static int detect_nanodet(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    ncnn::Net nanodet;
+    nanodet.opt.use_vulkan_compute = true;
+    // nanodet.opt.use_bf16_storage = true;
+
+    // original pretrained model from https://github.com/RangiLyu/nanodet
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+    if (nanodet.load_param("nanodet_m.param"))
+        exit(-1); // a non-zero result ends the whole process
+    if (nanodet.load_model("nanodet_m.bin"))
+        exit(-1);
+
+    int width = bgr.cols;
+    int height = bgr.rows;
+
+    const int target_size = 320;
+    const float prob_threshold = 0.4f;
+    const float nms_threshold = 0.5f;
+
+    // scale so that the longer side becomes target_size, keeping the aspect ratio
+    int w = width;
+    int h = height;
+    float scale = 1.f;
+    if (w > h)
+    {
+        scale = (float)target_size / w;
+        w = target_size;
+        h = h * scale;
+    }
+    else
+    {
+        scale = (float)target_size / h;
+        h = target_size;
+        w = w * scale;
+    }
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, width, height, w, h);
+
+    // pad both sides up to the next multiple of 32, with the border split evenly around the image
+    int wpad = (w + 31) / 32 * 32 - w;
+    int hpad = (h + 31) / 32 * 32 - h;
+    ncnn::Mat in_pad;
+    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 0.f);
+
+    const float mean_vals[3] = {103.53f, 116.28f, 123.675f};
+    const float norm_vals[3] = {0.017429f, 0.017507f, 0.017125f};
+    in_pad.substract_mean_normalize(mean_vals, norm_vals);
+
+    ncnn::Extractor ex = nanodet.create_extractor();
+
+    ex.input("input.1", in_pad);
+
+    std::vector<Object> proposals; // candidates collected from all three output levels
+
+    // stride 8
+    {
+        ncnn::Mat cls_pred;
+        ncnn::Mat dis_pred;
+        ex.extract("792", cls_pred);
+        ex.extract("795", dis_pred);
+
+        std::vector<Object> objects8;
+        generate_proposals(cls_pred, dis_pred, 8, in_pad, prob_threshold, objects8);
+
+        proposals.insert(proposals.end(), objects8.begin(), objects8.end());
+    }
+
+    // stride 16
+    {
+        ncnn::Mat cls_pred;
+        ncnn::Mat dis_pred;
+        ex.extract("814", cls_pred);
+        ex.extract("817", dis_pred);
+
+        std::vector<Object> objects16;
+        generate_proposals(cls_pred, dis_pred, 16, in_pad, prob_threshold, objects16);
+
+        proposals.insert(proposals.end(), objects16.begin(), objects16.end());
+    }
+
+    // stride 32
+    {
+        ncnn::Mat cls_pred;
+        ncnn::Mat dis_pred;
+        ex.extract("836", cls_pred);
+        ex.extract("839", dis_pred);
+
+        std::vector<Object> objects32;
+        generate_proposals(cls_pred, dis_pred, 32, in_pad, prob_threshold, objects32);
+
+        proposals.insert(proposals.end(), objects32.begin(), objects32.end());
+    }
+
+    // sort all proposals by score from highest to lowest
+    qsort_descent_inplace(proposals);
+
+    // apply nms with nms_threshold
+    std::vector<int> picked;
+    nms_sorted_bboxes(proposals, picked, nms_threshold);
+
+    int count = picked.size();
+
+    objects.resize(count);
+    for (int i = 0; i < count; i++)
+    {
+        objects[i] = proposals[picked[i]];
+
+        // undo the padding offset and the resize scale to get original image coordinates
+        float x0 = (objects[i].rect.x - (wpad / 2)) / scale;
+        float y0 = (objects[i].rect.y - (hpad / 2)) / scale;
+        float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale;
+        float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale;
+
+        // clip the corners to [0, width - 1] x [0, height - 1]
+        x0 = std::max(std::min(x0, (float)(width - 1)), 0.f);
+        y0 = std::max(std::min(y0, (float)(height - 1)), 0.f);
+        x1 = std::max(std::min(x1, (float)(width - 1)), 0.f);
+        y1 = std::max(std::min(y1, (float)(height - 1)), 0.f);
+
+        objects[i].rect.x = x0;
+        objects[i].rect.y = y0;
+        objects[i].rect.width = x1 - x0;
+        objects[i].rect.height = y1 - y0;
+    }
+
+    return 0;
+}
+
+// Logs each detection to stderr, draws its box and "<class> <score>%" label on a copy of bgr, then shows it.
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    static const char* class_names[] = {
+        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
+        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
+        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
+        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
+        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
+        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
+        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
+        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
+        "hair drier", "toothbrush"
+    };
+    const int num_names = sizeof(class_names) / sizeof(class_names[0]);
+
+    cv::Mat image = bgr.clone();
+
+    for (size_t i = 0; i < objects.size(); i++)
+    {
+        const Object& obj = objects[i];
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));
+
+        // a label outside the table must not be used to index it
+        const char* name = (obj.label >= 0 && obj.label < num_names) ? class_names[obj.label] : "unknown";
+        char text[256];
+        snprintf(text, sizeof(text), "%s %.1f%%", name, obj.prob * 100);
+
+        int baseLine = 0;
+        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+        int x = obj.rect.x;
+        int y = obj.rect.y - label_size.height - baseLine;
+        if (y < 0)
+            y = 0;
+        if (x + label_size.width > image.cols)
+            x = image.cols - label_size.width;
+        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                      cv::Scalar(255, 255, 255), -1);
+        cv::putText(image, text, cv::Point(x, y + label_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+int main(int argc, char** argv)
+{
+    // The image path is the one and only expected argument.
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    // Read the input image; an empty matrix means it could not be loaded.
+    const char* image_file = argv[1];
+    cv::Mat picture = cv::imread(image_file, 1);
+    if (picture.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", image_file);
+        return -1;
+    }
+
+    // Detect, then show the annotated image.
+    std::vector<Object> detections;
+    detect_nanodet(picture, detections);
+    draw_objects(picture, detections);
+    return 0;
+}
diff --git a/examples/nanodetplus_pnnx.cpp b/examples/nanodetplus_pnnx.cpp
new file mode 100644
index 0000000..7aa3ed1
--- /dev/null
+++ b/examples/nanodetplus_pnnx.cpp
@@ -0,0 +1,431 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdlib.h>
+#include <float.h>
+#include <stdio.h>
+#include <vector>
+
+struct Object // one detection as produced by generate_proposals()
+{
+    cv::Rect_<float> rect; // box of the detection: x, y, width, height
+    int label;             // index of the class with the highest score
+    float prob;            // sigmoid of that class's score
+};
+
+static inline float intersection_area(const Object& a, const Object& b)
+{
+    // operator& on the two rects gives their common region; return its area
+    return (a.rect & b.rect).area();
+}
+
+// Sorts faceobjects[left..right] (both bounds inclusive) in place by descending prob.
+static void qsort_descent_inplace(std::vector<Object>& faceobjects, int left, int right)
+{
+    int i = left;
+    int j = right;
+    float p = faceobjects[(left + right) / 2].prob; // pivot: prob of the middle element
+    while (i <= j)
+    {
+        while (faceobjects[i].prob > p)
+            i++;
+
+        while (faceobjects[j].prob < p)
+            j--;
+
+        if (i <= j)
+        {
+            // swap the out-of-place pair, then move both cursors inwards
+            std::swap(faceobjects[i], faceobjects[j]);
+
+            i++;
+            j--;
+        }
+    }
+    // sort the two remaining sub-ranges; each recursive call is its own OpenMP section
+    #pragma omp parallel sections
+    {
+        #pragma omp section
+        {
+            if (left < j) qsort_descent_inplace(faceobjects, left, j);
+        }
+        #pragma omp section
+        {
+            if (i < right) qsort_descent_inplace(faceobjects, i, right);
+        }
+    }
+}
+
+static void qsort_descent_inplace(std::vector<Object>& faceobjects)
+{
+    // Whole-vector overload; an empty vector has no [0, size - 1] range to sort.
+    const bool has_items = !faceobjects.empty();
+    if (has_items)
+        qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1);
+}
+
+// Greedy NMS on boxes pre-sorted by descending prob; picked receives the indices of the kept boxes.
+static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
+{
+    picked.clear();
+
+    const int n = faceobjects.size();
+
+    std::vector<float> areas(n);
+    for (int i = 0; i < n; i++)
+        areas[i] = faceobjects[i].rect.area();
+
+    for (int i = 0; i < n; i++)
+    {
+        const Object& a = faceobjects[i];
+        bool keep = true;
+        for (int j = 0; j < (int)picked.size(); j++)
+        {
+            const Object& b = faceobjects[picked[j]];
+
+            if (!agnostic && a.label != b.label)
+                continue;
+
+            // intersection over union
+            float inter_area = intersection_area(a, b);
+            float union_area = areas[i] + areas[picked[j]] - inter_area;
+            if (inter_area / union_area > nms_threshold)
+            {
+                keep = false;
+                break; // one overlapping kept box is enough; skip the remaining checks
+            }
+        }
+
+        if (keep)
+            picked.push_back(i);
+    }
+}
+
+static inline float sigmoid(float x)
+{
+    return 1.0f / (exp(-x) + 1.0f); // logistic function: maps any real x into (0, 1)
+}
+
+// Decodes one NanoDet-Plus output level: channels [0, num_class) hold class scores, the next 4 * reg_max_1 hold box distances.
+static void generate_proposals(const ncnn::Mat& pred, int stride, const ncnn::Mat& in_pad, float prob_threshold, std::vector<Object>& objects)
+{
+    const int num_grid_x = pred.w;
+    const int num_grid_y = pred.h;
+
+    const int num_class = 80; // number of classes. 80 for COCO
+    const int reg_max_1 = (pred.c - num_class) / 4;
+
+    // the softmax layer is the same for every cell, so build it once instead of once per proposal
+    ncnn::Layer* softmax = ncnn::create_layer("Softmax");
+    if (!softmax)
+    {
+        fprintf(stderr, "create_layer Softmax failed\n");
+        return;
+    }
+
+    ncnn::ParamDict pd;
+    pd.set(0, 1); // axis
+    pd.set(1, 1);
+    softmax->load_param(pd);
+
+    ncnn::Option opt;
+    opt.num_threads = 1;
+    opt.use_packing_layout = false;
+
+    softmax->create_pipeline(opt);
+
+    for (int i = 0; i < num_grid_y; i++)
+    {
+        for (int j = 0; j < num_grid_x; j++)
+        {
+            // find label with max score
+            int label = -1;
+            float score = -FLT_MAX;
+            for (int k = 0; k < num_class; k++)
+            {
+                float s = pred.channel(k).row(i)[j];
+                if (s > score)
+                {
+                    label = k;
+                    score = s;
+                }
+            }
+
+            // sigmoid is monotonic, so it only has to be applied to the winning score
+            score = sigmoid(score);
+
+            if (!(score >= prob_threshold))
+                continue;
+
+            // gather this cell's distance values into 4 rows of reg_max_1 bins
+            ncnn::Mat bbox_pred(reg_max_1, 4);
+            for (int k = 0; k < reg_max_1 * 4; k++)
+                bbox_pred[k] = pred.channel(num_class + k).row(i)[j];
+
+            softmax->forward_inplace(bbox_pred, opt);
+
+            float pred_ltrb[4];
+            for (int k = 0; k < 4; k++)
+            {
+                float dis = 0.f;
+                const float* dis_after_sm = bbox_pred.row(k);
+                for (int l = 0; l < reg_max_1; l++)
+                    dis += l * dis_after_sm[l];
+                pred_ltrb[k] = dis * stride;
+            }
+
+            float pb_cx = j * stride;
+            float pb_cy = i * stride;
+
+            float x0 = pb_cx - pred_ltrb[0];
+            float y0 = pb_cy - pred_ltrb[1];
+            float x1 = pb_cx + pred_ltrb[2];
+            float y1 = pb_cy + pred_ltrb[3];
+
+            Object obj;
+            obj.rect.x = x0;
+            obj.rect.y = y0;
+            obj.rect.width = x1 - x0;
+            obj.rect.height = y1 - y0;
+            obj.label = label;
+            obj.prob = score;
+
+            objects.push_back(obj);
+        }
+    }
+
+    softmax->destroy_pipeline(opt);
+    delete softmax;
+}
+
+// Runs NanoDet-Plus on bgr and fills objects with the boxes kept by NMS, in bgr's pixel coordinates; returns 0.
+static int detect_nanodet(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    ncnn::Net nanodet;
+    nanodet.opt.use_vulkan_compute = true;
+    // nanodet.opt.use_bf16_storage = true;
+
+    // original pretrained model from https://github.com/RangiLyu/nanodet
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+    //     nanodet.load_param("nanodet-plus-m_320.torchscript.ncnn.param");
+    //     nanodet.load_model("nanodet-plus-m_320.torchscript.ncnn.bin");
+    if (nanodet.load_param("nanodet-plus-m_416.torchscript.ncnn.param"))
+        exit(-1); // a non-zero result ends the whole process
+    if (nanodet.load_model("nanodet-plus-m_416.torchscript.ncnn.bin"))
+        exit(-1);
+
+    int width = bgr.cols;
+    int height = bgr.rows;
+
+    //     const int target_size = 320;
+    const int target_size = 416;
+    const float prob_threshold = 0.4f;
+    const float nms_threshold = 0.5f;
+
+    // scale so that the longer side becomes target_size, keeping the aspect ratio
+    int w = width;
+    int h = height;
+    float scale = 1.f;
+    if (w > h)
+    {
+        scale = (float)target_size / w;
+        w = target_size;
+        h = h * scale;
+    }
+    else
+    {
+        scale = (float)target_size / h;
+        h = target_size;
+        w = w * scale;
+    }
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, width, height, w, h);
+
+    // pad both sides up to the next multiple of 32, with the border split evenly around the image
+    int wpad = (w + 31) / 32 * 32 - w;
+    int hpad = (h + 31) / 32 * 32 - h;
+    ncnn::Mat in_pad;
+    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 0.f);
+
+    const float mean_vals[3] = {103.53f, 116.28f, 123.675f};
+    const float norm_vals[3] = {0.017429f, 0.017507f, 0.017125f};
+    in_pad.substract_mean_normalize(mean_vals, norm_vals);
+
+    ncnn::Extractor ex = nanodet.create_extractor();
+
+    ex.input("in0", in_pad);
+
+    std::vector<Object> proposals; // candidates collected from all four output levels
+
+    // stride 8
+    {
+        ncnn::Mat pred;
+        ex.extract("231", pred);
+
+        std::vector<Object> objects8;
+        generate_proposals(pred, 8, in_pad, prob_threshold, objects8);
+
+        proposals.insert(proposals.end(), objects8.begin(), objects8.end());
+    }
+
+    // stride 16
+    {
+        ncnn::Mat pred;
+        ex.extract("228", pred);
+
+        std::vector<Object> objects16;
+        generate_proposals(pred, 16, in_pad, prob_threshold, objects16);
+
+        proposals.insert(proposals.end(), objects16.begin(), objects16.end());
+    }
+
+    // stride 32
+    {
+        ncnn::Mat pred;
+        ex.extract("225", pred);
+
+        std::vector<Object> objects32;
+        generate_proposals(pred, 32, in_pad, prob_threshold, objects32);
+
+        proposals.insert(proposals.end(), objects32.begin(), objects32.end());
+    }
+
+    // stride 64
+    {
+        ncnn::Mat pred;
+        ex.extract("222", pred);
+
+        std::vector<Object> objects64;
+        generate_proposals(pred, 64, in_pad, prob_threshold, objects64);
+
+        proposals.insert(proposals.end(), objects64.begin(), objects64.end());
+    }
+
+    // sort all proposals by score from highest to lowest
+    qsort_descent_inplace(proposals);
+
+    // apply nms with nms_threshold
+    std::vector<int> picked;
+    nms_sorted_bboxes(proposals, picked, nms_threshold);
+
+    int count = picked.size();
+
+    objects.resize(count);
+    for (int i = 0; i < count; i++)
+    {
+        objects[i] = proposals[picked[i]];
+
+        // undo the padding offset and the resize scale to get original image coordinates
+        float x0 = (objects[i].rect.x - (wpad / 2)) / scale;
+        float y0 = (objects[i].rect.y - (hpad / 2)) / scale;
+        float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale;
+        float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale;
+
+        // clip the corners to [0, width - 1] x [0, height - 1]
+        x0 = std::max(std::min(x0, (float)(width - 1)), 0.f);
+        y0 = std::max(std::min(y0, (float)(height - 1)), 0.f);
+        x1 = std::max(std::min(x1, (float)(width - 1)), 0.f);
+        y1 = std::max(std::min(y1, (float)(height - 1)), 0.f);
+
+        objects[i].rect.x = x0;
+        objects[i].rect.y = y0;
+        objects[i].rect.width = x1 - x0;
+        objects[i].rect.height = y1 - y0;
+    }
+
+    return 0;
+}
+
+// Logs each detection to stderr, draws its box and "<class> <score>%" label on a copy of bgr, then shows it.
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    static const char* class_names[] = {
+        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
+        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
+        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
+        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
+        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
+        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
+        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
+        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
+        "hair drier", "toothbrush"
+    };
+    const int num_names = sizeof(class_names) / sizeof(class_names[0]);
+
+    cv::Mat image = bgr.clone();
+
+    for (size_t i = 0; i < objects.size(); i++)
+    {
+        const Object& obj = objects[i];
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));
+
+        // a label outside the table must not be used to index it
+        const char* name = (obj.label >= 0 && obj.label < num_names) ? class_names[obj.label] : "unknown";
+        char text[256];
+        snprintf(text, sizeof(text), "%s %.1f%%", name, obj.prob * 100);
+
+        int baseLine = 0;
+        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+        int x = obj.rect.x;
+        int y = obj.rect.y - label_size.height - baseLine;
+        if (y < 0)
+            y = 0;
+        if (x + label_size.width > image.cols)
+            x = image.cols - label_size.width;
+        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                      cv::Scalar(255, 255, 255), -1);
+        cv::putText(image, text, cv::Point(x, y + label_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+int main(int argc, char** argv)
+{
+    // The image path is the one and only expected argument.
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    // Read the input image; an empty matrix means it could not be loaded.
+    const char* image_file = argv[1];
+    cv::Mat picture = cv::imread(image_file, 1);
+    if (picture.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", image_file);
+        return -1;
+    }
+
+    // Detect, then show the annotated image.
+    std::vector<Object> detections;
+    detect_nanodet(picture, detections);
+    draw_objects(picture, detections);
+    return 0;
+}
diff --git a/examples/p2pnet.cpp b/examples/p2pnet.cpp
new file mode 100644
index 0000000..cee3077
--- /dev/null
+++ b/examples/p2pnet.cpp
@@ -0,0 +1,242 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdlib.h>
+#include <float.h>
+#include <stdio.h>
+#include <vector>
+
+struct CrowdPoint // one predicted point, filled in by detect_crowd()
+{
+    cv::Point pt; // predicted location, rescaled to the original image size
+    float prob;   // value taken from column 1 of the point's pred_scores row
+};
+
+// Replicates a set of anchor offsets over every cell of a w x h feature grid.
+//
+//   w, h                   grid size in cells
+//   stride                 size of one cell in input pixels
+//   anchor_points          interleaved (x, y) offsets relative to a cell centre
+//   shifted_anchor_points  output, interleaved (x, y); cells are visited row by row and
+//                          the offsets of one cell keep their order from anchor_points
+//
+// The centre of cell (col, row) is ((col + 0.5) * stride, (row + 0.5) * stride).
+//
+// anchor_points is only read, so it is taken by const reference instead of by value.
+static void shift(int w, int h, int stride, const std::vector<float>& anchor_points, std::vector<float>& shifted_anchor_points)
+{
+    // an empty grid has nothing to replicate
+    if (w <= 0 || h <= 0)
+    {
+        shifted_anchor_points.clear();
+        return;
+    }
+
+    // number of complete (x, y) pairs in anchor_points
+    const size_t num_anchors = anchor_points.size() / 2;
+
+    shifted_anchor_points.resize((size_t)w * h * num_anchors * 2, 0);
+
+    // next slot to write; advances in exactly the order described above
+    size_t out = 0;
+
+    for (int row = 0; row < h; row++)
+    {
+        // the arithmetic is done in double and then narrowed, as for the x centre below
+        const float cy = (row + 0.5) * stride;
+
+        for (int col = 0; col < w; col++)
+        {
+            const float cx = (col + 0.5) * stride;
+
+            for (size_t j = 0; j < num_anchors; j++)
+            {
+                const float x = anchor_points[j * 2] + cx;
+                const float y = anchor_points[j * 2 + 1] + cy;
+
+                shifted_anchor_points[out] = x;
+                shifted_anchor_points[out + 1] = y;
+                out += 2;
+            }
+        }
+    }
+}
+// Builds the row x line lattice of anchor offsets for one cell of size stride.
+//
+// The cell is split into `line` columns and `row` rows; each offset is the centre of one
+// sub-cell relative to the centre of the whole cell. Offsets are written to anchor_points
+// as interleaved (x, y), row by row.
+//
+// The half-cell shift uses stride * 0.5 rather than the integer stride / 2 so that odd
+// strides (e.g. stride 1 for pyramid level 0) are centred correctly; for even strides the
+// result is unchanged.
+static void generate_anchor_points(int stride, int row, int line, std::vector<float>& anchor_points)
+{
+    anchor_points.clear();
+    // nothing to generate for an empty lattice
+    if (row <= 0 || line <= 0)
+    {
+        return;
+    }
+
+    // size of one sub-cell along each axis
+    const float row_step = (float)stride / row;
+    const float line_step = (float)stride / line;
+    const double half_stride = stride * 0.5;
+
+    anchor_points.reserve((size_t)row * line * 2);
+
+    for (int i = 0; i < row; i++)
+    {
+        // centre of sub-row i, relative to the cell centre
+        const float y = (i + 0.5) * row_step - half_stride;
+
+        for (int j = 0; j < line; j++)
+        {
+            // centre of sub-column j, relative to the cell centre
+            const float x = (j + 0.5) * line_step - half_stride;
+
+            anchor_points.push_back(x);
+            anchor_points.push_back(y);
+        }
+    }
+}
+// Generates the anchor points of every requested pyramid level for an img_w x img_h input and concatenates them.
+// Level L (a small non-negative integer) uses stride 2^L and a grid of ceil(img / stride) cells per side.
+static void generate_anchor_points(int img_w, int img_h, const std::vector<int>& pyramid_levels, int row, int line, std::vector<float>& all_anchor_points)
+{
+    all_anchor_points.clear();
+
+    for (size_t i = 0; i < pyramid_levels.size(); i++)
+    {
+        // integer power of two and ceiling division; no floating-point pow/floor needed
+        const int stride = 1 << pyramid_levels[i];
+        const int grid_w = (img_w + stride - 1) / stride;
+        const int grid_h = (img_h + stride - 1) / stride;
+
+        std::vector<float> anchor_points;
+        generate_anchor_points(stride, row, line, anchor_points);
+
+        std::vector<float> shifted_anchor_points;
+        shift(grid_w, grid_h, stride, anchor_points, shifted_anchor_points);
+
+        all_anchor_points.insert(all_anchor_points.end(), shifted_anchor_points.begin(), shifted_anchor_points.end());
+    }
+}
+
+// Runs the P2PNet model on bgr and appends one CrowdPoint per predicted point to crowd_points; returns 0.
+static int detect_crowd(const cv::Mat& bgr, std::vector<CrowdPoint>& crowd_points)
+{
+    ncnn::Option opt;
+    opt.num_threads = 4;
+    opt.use_vulkan_compute = false;
+    opt.use_bf16_storage = false;
+    ncnn::Net net;
+    net.opt = opt;
+
+    // model is converted from
+    // https://github.com/TencentYoutuResearch/CrowdCounting-P2PNet
+    // the ncnn model  https://pan.baidu.com/s/1O1CBgvY6yJkrK8Npxx3VMg pwd: ezhx
+    if (net.load_param("p2pnet.param"))
+        exit(-1); // a non-zero result ends the whole process
+    if (net.load_model("p2pnet.bin"))
+        exit(-1);
+
+    int width = bgr.cols;
+    int height = bgr.rows;
+    // round each side down to a multiple of 128 (this is 0 for a side shorter than 128)
+    int new_width = width / 128 * 128;
+    int new_height = height / 128 * 128;
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, width, height, new_width, new_height);
+    // anchors for the single pyramid level 3 (stride 2^3 = 8), with a 2 x 2 lattice per cell
+    std::vector<int> pyramid_levels(1, 3);
+    std::vector<float> all_anchor_points;
+    generate_anchor_points(in.w, in.h, pyramid_levels, 2, 2, all_anchor_points);
+
+    ncnn::Mat anchor_points = ncnn::Mat(2, all_anchor_points.size() / 2, all_anchor_points.data());
+
+    ncnn::Extractor ex = net.create_extractor();
+    const float mean_vals1[3] = {123.675f, 116.28f, 103.53f};
+    const float norm_vals1[3] = {0.01712475f, 0.0175f, 0.01742919f};
+
+    in.substract_mean_normalize(mean_vals1, norm_vals1);
+
+    ex.input("input", in);
+    ex.input("anchor", anchor_points);
+
+    ncnn::Mat score, points;
+    ex.extract("pred_scores", score);
+    ex.extract("pred_points", points);
+    // one output row per predicted point; scale it back from the resized to the original image
+    for (int i = 0; i < points.h; i++)
+    {
+        float* score_data = score.row(i);
+        float* points_data = points.row(i);
+        CrowdPoint cp;
+        int x = points_data[0] / new_width * width;
+        int y = points_data[1] / new_height * height;
+        cp.pt = cv::Point(x, y);
+        cp.prob = score_data[1]; // column 1 of the score row is used as the probability
+        crowd_points.push_back(cp); // appended: existing entries of crowd_points are kept
+    }
+
+    return 0;
+}
+
+// Shows a copy of bgr with a filled circle on every point whose prob exceeds the threshold.
+static void draw_result(const cv::Mat& bgr, const std::vector<CrowdPoint>& crowd_points)
+{
+    cv::Mat image = bgr.clone();
+    const float threshold = 0.5f;
+    for (size_t i = 0; i < crowd_points.size(); i++)
+    {
+        const CrowdPoint& cp = crowd_points[i];
+        if (cp.prob > threshold)
+            cv::circle(image, cp.pt, 4, cv::Scalar(0, 0, 255), -1, 8, 0);
+    }
+    cv::imshow("image", image);
+    cv::waitKey();
+}
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    // Read the input image; an empty matrix means it could not be loaded.
+    const char* image_file = argv[1];
+    cv::Mat picture = cv::imread(image_file, 1);
+    if (picture.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", image_file);
+        return -1;
+    }
+
+    // Predict the crowd points, then show them on the image.
+    std::vector<CrowdPoint> found_points;
+    detect_crowd(picture, found_points);
+    draw_result(picture, found_points);
+    return 0;
+}
diff --git a/examples/peleenetssd_seg.cpp b/examples/peleenetssd_seg.cpp
new file mode 100644
index 0000000..84dc4d6
--- /dev/null
+++ b/examples/peleenetssd_seg.cpp
@@ -0,0 +1,198 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdio.h>
+#include <vector>
+
+struct Object // one detection produced by detect_peleenet
+{
+    cv::Rect_<float> rect; // bounding box (x, y, width, height) scaled to the source image size
+    int label;             // class id read from the network output; indexes class_names when drawn
+    float prob;            // score read from the network output
+};
+
+// Run the Pelee detection + segmentation model on bgr.
+// objects is overwritten with the detections scaled to the size of bgr; resized receives the "sigmoid"
+// blob bilinearly resized to the size of bgr. Exits if the model cannot be loaded; otherwise returns 0.
+static int detect_peleenet(const cv::Mat& bgr, std::vector<Object>& objects, ncnn::Mat& resized)
+{
+    // model is converted from https://github.com/eric612/MobileNet-YOLO
+    // and can be downloaded from https://drive.google.com/open?id=1Wt6jKv13sBRMHgrGAJYlOlRF-o80pC0g
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+    ncnn::Net peleenet;
+    peleenet.opt.use_vulkan_compute = true;
+    if (peleenet.load_param("pelee.param") || peleenet.load_model("pelee.bin"))
+        exit(-1);
+
+    // the whole frame is resized to target_size x target_size for the network input
+    const int target_size = 304;
+    const int img_w = bgr.cols;
+    const int img_h = bgr.rows;
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, img_w, img_h, target_size, target_size);
+
+    const float mean_vals[3] = {103.9f, 116.7f, 123.6f};
+    const float norm_vals[3] = {0.017f, 0.017f, 0.017f};
+    in.substract_mean_normalize(mean_vals, norm_vals);
+
+    ncnn::Extractor ex = peleenet.create_extractor();
+    ex.input("data", in);
+
+    ncnn::Mat out;
+    ex.extract("detection_out", out);
+    //     printf("%d %d %d\n", out.w, out.h, out.c);
+    // each output row: label, prob, then x0, y0, x1, y1 which are multiplied by the image size below
+    objects.clear();
+    for (int row = 0; row < out.h; row++)
+    {
+        const float* values = out.row(row);
+        const float left = values[2] * img_w;
+        const float top = values[3] * img_h;
+
+        Object object;
+        object.label = values[0];
+        object.prob = values[1];
+        object.rect.x = left;
+        object.rect.y = top;
+        object.rect.width = values[4] * img_w - left;
+        object.rect.height = values[5] * img_h - top;
+        objects.push_back(object);
+    }
+
+    // second output of the same extractor: the "sigmoid" blob, resized to the source resolution
+    ncnn::Mat seg_out;
+    ex.extract("sigmoid", seg_out);
+    resize_bilinear(seg_out, resized, img_w, img_h);
+    //resize_bicubic(seg_out,resized,img_w,img_h); // sharpness
+    return 0;
+}
+
+// Draw detections (blue box plus "<class> <score>%" caption) and the segmentation overlay on a copy
+// of bgr, then show it. map: per-class score planes (the caller passes them resized to bgr's size).
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects, ncnn::Mat map)
+{
+    static const char* class_names[] = {"background",
+                                        "person", "rider", "car", "bus",
+                                        "truck", "bike", "motor",
+                                        "traffic light", "traffic sign", "train"
+                                       };
+    const int class_count = sizeof(class_names) / sizeof(class_names[0]);
+
+    cv::Mat image = bgr.clone();
+    const int color[] = {128, 255, 128, 244, 35, 232}; // B, G, R triplets, one per map channel
+    const int color_count = sizeof(color) / sizeof(int);
+
+    for (size_t i = 0; i < objects.size(); i++)
+    {
+        const Object& obj = objects[i];
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));
+
+        // label is a model output: check it before using it as an array index
+        const char* class_name = (obj.label >= 0 && obj.label < class_count) ? class_names[obj.label] : "unknown";
+        char text[256];
+        sprintf(text, "%s %.1f%%", class_name, obj.prob * 100);
+
+        int baseLine = 0;
+        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+        int x = obj.rect.x;
+        int y = obj.rect.y - label_size.height - baseLine;
+        if (y < 0)
+            y = 0;
+        if (x + label_size.width > image.cols)
+            x = image.cols - label_size.width;
+
+        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                      cv::Scalar(255, 255, 255), -1);
+        cv::putText(image, text, cv::Point(x, y + label_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    // ncnn::Mat pads each channel to cstep elements, so planes are fetched with channel() instead of
+    // assuming they lie width * height floats apart; the loops are also bounded by the image size
+    const int size = map.c;
+    const int width = map.w < image.cols ? map.w : image.cols;
+    const int height = map.h < image.rows ? map.h : image.rows;
+    const float threshold = 0.45;
+    std::vector<const float*> planes(size);
+    for (int c = 0; c < size; c++)
+        planes[c] = (const float*)map.channel(c);
+
+    for (int i = 0; i < height; i++)
+    {
+        unsigned char* ptr1 = image.ptr<unsigned char>(i);
+        for (int j = 0; j < width; j++)
+        {
+            // strongest channel whose score is above the threshold, or -1 when there is none
+            const int offset = i * map.w + j;
+            float maxima = threshold;
+            int index = -1;
+            for (int c = 0; c < size; c++)
+            {
+                if (planes[c][offset] > maxima)
+                {
+                    maxima = planes[c][offset];
+                    index = c;
+                }
+            }
+            // blend 50/50 with the class colour; channels without a colour triplet are left as is
+            const int color_index = index * 3;
+            if (index > -1 && color_index + 2 < color_count)
+            {
+                unsigned char* px = ptr1 + j * 3;
+                px[0] = color[color_index] / 2 + px[0] / 2;
+                px[1] = color[color_index + 1] / 2 + px[1] / 2;
+                px[2] = color[color_index + 2] / 2 + px[2] / 2;
+            }
+        }
+    }
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+// Usage: <program> imagepath. Runs detect_peleenet on the image and displays boxes plus overlay.
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* const path = argv[1];
+    cv::Mat frame = cv::imread(path, 1);
+    if (frame.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", path);
+        return -1;
+    }
+
+    // detect_peleenet fills both outputs; its return value is not checked here
+    std::vector<Object> detections;
+    ncnn::Mat segmentation;
+    detect_peleenet(frame, detections, segmentation);
+
+    draw_objects(frame, detections, segmentation);
+    return 0;
+}
diff --git a/examples/retinaface.cpp b/examples/retinaface.cpp
new file mode 100644
index 0000000..e7f84e5
--- /dev/null
+++ b/examples/retinaface.cpp
@@ -0,0 +1,436 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdio.h>
+#include <vector>
+
+struct FaceObject // one face candidate produced by generate_proposals
+{
+    cv::Rect_<float> rect;   // face box: x, y, width, height
+    cv::Point2f landmark[5]; // five landmark points decoded together with the box
+    float prob;              // face score read from the score blob
+};
+
+// Area of the overlap of the two face boxes (the `&` of their rects).
+static inline float intersection_area(const FaceObject& a, const FaceObject& b)
+{
+    return (a.rect & b.rect).area();
+}
+
+// In-place quicksort of faceobjects[left..right] (both ends inclusive), highest prob first.
+// The pivot is the prob of the middle element; after partitioning, each side is sorted
+// recursively, and the two recursive calls are OpenMP sections so they may run in parallel.
+static void qsort_descent_inplace(std::vector<FaceObject>& faceobjects, int left, int right)
+{
+    const float pivot = faceobjects[(left + right) / 2].prob;
+    int lo = left;
+    int hi = right;
+
+    while (lo <= hi)
+    {
+        // advance each cursor past the entries that compare strictly above / below the pivot
+        while (faceobjects[lo].prob > pivot)
+            lo++;
+        while (faceobjects[hi].prob < pivot)
+            hi--;
+        if (lo > hi)
+            break;
+        // the cursors have not crossed: exchange the pair and step both
+        std::swap(faceobjects[lo], faceobjects[hi]);
+        lo++;
+        hi--;
+    }
+
+    #pragma omp parallel sections
+    {
+        #pragma omp section
+        {
+            if (left < hi) qsort_descent_inplace(faceobjects, left, hi);
+        }
+        #pragma omp section
+        {
+            if (lo < right) qsort_descent_inplace(faceobjects, lo, right);
+        }
+    }
+}
+
+// Sort the whole vector by descending prob; an empty vector is left alone.
+static void qsort_descent_inplace(std::vector<FaceObject>& faceobjects)
+{
+    // guard first: size() - 1 would wrap around on an empty vector
+    if (!faceobjects.empty())
+        qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1);
+}
+
+// Greedy non-maximum suppression. Boxes are visited in index order (the caller sorts them by
+// descending prob first); a box is kept unless its IoU with an already kept box exceeds
+// nms_threshold. picked is reset and then filled with the indices of the kept boxes.
+static void nms_sorted_bboxes(const std::vector<FaceObject>& faceobjects, std::vector<int>& picked, float nms_threshold)
+{
+    picked.clear();
+
+    const int n = faceobjects.size();
+    std::vector<float> areas(n); // per-box area, computed once up front
+    for (int i = 0; i < n; i++)
+        areas[i] = faceobjects[i].rect.area();
+
+    for (int i = 0; i < n; i++)
+    {
+        const FaceObject& a = faceobjects[i];
+
+        // the scan stops as soon as one kept box suppresses this one:
+        // further comparisons could not change the outcome
+        bool keep = true;
+        for (int j = 0; keep && j < (int)picked.size(); j++)
+        {
+            const FaceObject& b = faceobjects[picked[j]];
+
+            // intersection over union
+            const float inter_area = intersection_area(a, b);
+            const float union_area = areas[i] + areas[picked[j]] - inter_area;
+            keep = !(inter_area / union_area > nms_threshold);
+        }
+
+        if (keep)
+            picked.push_back(i);
+    }
+}
+
+// copy from src/layer/proposal.cpp
+// Build num_ratio * num_scale anchor boxes centred on the middle of a base_size x base_size cell.
+// Row (i * num_scale + j) of the returned Mat holds x0, y0, x1, y1 for ratios[i] and scales[j].
+static ncnn::Mat generate_anchors(int base_size, const ncnn::Mat& ratios, const ncnn::Mat& scales)
+{
+    const int num_ratio = ratios.w;
+    const int num_scale = scales.w;
+
+    // both centre coordinates are the same value: half of base_size
+    const float cx = base_size * 0.5f;
+    const float cy = cx;
+
+    ncnn::Mat anchors;
+    anchors.create(4, num_ratio * num_scale);
+
+    for (int i = 0; i < num_ratio; i++)
+    {
+        const float ar = ratios[i];
+        // base box for this ratio, rounded to whole numbers before scaling
+        const int r_w = round(base_size / sqrt(ar));
+        const int r_h = round(r_w * ar); //round(base_size * sqrt(ar));
+
+        for (int j = 0; j < num_scale; j++)
+        {
+            const float half_w = r_w * scales[j] * 0.5f;
+            const float half_h = r_h * scales[j] * 0.5f;
+
+            float* anchor = anchors.row(i * num_scale + j);
+            anchor[0] = cx - half_w;
+            anchor[1] = cy - half_h;
+            anchor[2] = cx + half_w;
+            anchor[3] = cy + half_h;
+        }
+    }
+
+    return anchors;
+}
+
+// Decode one stride level of the network output into face candidates and append them to faceobjects.
+// anchors: one x0, y0, x1, y1 row per anchor; the anchor is shifted by feat_stride for each cell.
+// A cell (i, j) produces a candidate when its score reaches prob_threshold; the box comes from the
+// bbox deltas and the five landmarks from the landmark offsets at the same cell.
+// Nothing is removed from faceobjects; candidates are only appended.
+static void generate_proposals(const ncnn::Mat& anchors, int feat_stride, const ncnn::Mat& score_blob, const ncnn::Mat& bbox_blob, const ncnn::Mat& landmark_blob, float prob_threshold, std::vector<FaceObject>& faceobjects)
+{
+    const int w = score_blob.w;
+    const int h = score_blob.h;
+
+    // generate face proposal from bbox deltas and shifted anchors
+    const int num_anchors = anchors.h;
+
+    for (int q = 0; q < num_anchors; q++)
+    {
+        const float* anchor = anchors.row(q);
+        const float anchor_w = anchor[2] - anchor[0];
+        const float anchor_h = anchor[3] - anchor[1];
+
+        // per anchor: scores come from channel num_anchors + q, boxes use 4 channels, landmarks 10
+        const ncnn::Mat score = score_blob.channel(q + num_anchors);
+        const ncnn::Mat bbox = bbox_blob.channel_range(q * 4, 4);
+        const ncnn::Mat landmark = landmark_blob.channel_range(q * 10, 10);
+
+        // shifted anchor: y advances once per row, x once per column
+        float anchor_y = anchor[1];
+        for (int i = 0; i < h; i++, anchor_y += feat_stride)
+        {
+            float anchor_x = anchor[0];
+            for (int j = 0; j < w; j++, anchor_x += feat_stride)
+            {
+                // flat offset of cell (i, j) inside one w x h channel
+                const int index = i * w + j;
+                const float prob = score[index];
+
+                // written as a negated >= so the threshold comparison itself stays the same
+                if (!(prob >= prob_threshold))
+                    continue;
+
+                // anchor centre at this cell
+                const float cx = anchor_x + anchor_w * 0.5f;
+                const float cy = anchor_y + anchor_h * 0.5f;
+
+                // apply center size
+                const float dx = bbox.channel(0)[index];
+                const float dy = bbox.channel(1)[index];
+                const float dw = bbox.channel(2)[index];
+                const float dh = bbox.channel(3)[index];
+
+                // the centre moves by a fraction of the anchor size; width and height scale by exp()
+                const float pb_cx = cx + anchor_w * dx;
+                const float pb_cy = cy + anchor_h * dy;
+                const float pb_w = anchor_w * exp(dw);
+                const float pb_h = anchor_h * exp(dh);
+
+                // corners of the decoded box
+                const float x0 = pb_cx - pb_w * 0.5f;
+                const float y0 = pb_cy - pb_h * 0.5f;
+                const float x1 = pb_cx + pb_w * 0.5f;
+                const float y1 = pb_cy + pb_h * 0.5f;
+
+                // assemble the candidate
+                FaceObject obj;
+                obj.rect.x = x0;
+                obj.rect.y = y0;
+                obj.rect.width = x1 - x0 + 1;
+                obj.rect.height = y1 - y0 + 1;
+                obj.prob = prob;
+
+                // landmark k reads its x offset from channel 2k and its y offset from channel 2k + 1
+                for (int k = 0; k < 5; k++)
+                {
+                    obj.landmark[k].x = cx + (anchor_w + 1) * landmark.channel(2 * k)[index];
+                    obj.landmark[k].y = cy + (anchor_h + 1) * landmark.channel(2 * k + 1)[index];
+                }
+
+                faceobjects.push_back(obj);
+            }
+        }
+    }
+}
+
+static int detect_retinaface(const cv::Mat& bgr, std::vector<FaceObject>& faceobjects)
+{
+    // Detect faces in bgr. faceobjects is resized to the number of faces kept after NMS, each box
+    // clipped to the image. Exits the process if the model files fail to load; otherwise returns 0.
+    ncnn::Net retinaface;
+    retinaface.opt.use_vulkan_compute = true;
+
+    // model is converted from
+    // https://github.com/deepinsight/insightface/tree/master/RetinaFace#retinaface-pretrained-models
+    // https://github.com/deepinsight/insightface/issues/669
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+    //     retinaface.load_param("retinaface-R50.param");
+    //     retinaface.load_model("retinaface-R50.bin");
+    if (retinaface.load_param("mnet.25-opt.param"))
+        exit(-1);
+    if (retinaface.load_model("mnet.25-opt.bin"))
+        exit(-1);
+
+    const float prob_threshold = 0.8f; // minimum score for a cell to become a proposal
+    const float nms_threshold = 0.4f;  // IoU above which a later box is suppressed
+
+    int img_w = bgr.cols;
+    int img_h = bgr.rows;
+    // pixels are converted from BGR to RGB; the image is passed at its own size
+    ncnn::Mat in = ncnn::Mat::from_pixels(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h);
+
+    ncnn::Extractor ex = retinaface.create_extractor();
+    ex.input("data", in);
+
+    std::vector<FaceObject> faceproposals; // proposals collected from all three strides
+
+    // stride 32
+    {
+        ncnn::Mat score_blob, bbox_blob, landmark_blob;
+        ex.extract("face_rpn_cls_prob_reshape_stride32", score_blob);
+        ex.extract("face_rpn_bbox_pred_stride32", bbox_blob);
+        ex.extract("face_rpn_landmark_pred_stride32", landmark_blob);
+        // one ratio and two scales give two anchors for this stride
+        const int base_size = 16;
+        const int feat_stride = 32;
+        ncnn::Mat ratios(1);
+        ratios[0] = 1.f;
+        ncnn::Mat scales(2);
+        scales[0] = 32.f;
+        scales[1] = 16.f;
+        ncnn::Mat anchors = generate_anchors(base_size, ratios, scales);
+
+        std::vector<FaceObject> faceobjects32;
+        generate_proposals(anchors, feat_stride, score_blob, bbox_blob, landmark_blob, prob_threshold, faceobjects32);
+
+        faceproposals.insert(faceproposals.end(), faceobjects32.begin(), faceobjects32.end());
+    }
+
+    // stride 16
+    {
+        ncnn::Mat score_blob, bbox_blob, landmark_blob;
+        ex.extract("face_rpn_cls_prob_reshape_stride16", score_blob);
+        ex.extract("face_rpn_bbox_pred_stride16", bbox_blob);
+        ex.extract("face_rpn_landmark_pred_stride16", landmark_blob);
+        // same base_size and ratio as above, with scales 8 and 4
+        const int base_size = 16;
+        const int feat_stride = 16;
+        ncnn::Mat ratios(1);
+        ratios[0] = 1.f;
+        ncnn::Mat scales(2);
+        scales[0] = 8.f;
+        scales[1] = 4.f;
+        ncnn::Mat anchors = generate_anchors(base_size, ratios, scales);
+
+        std::vector<FaceObject> faceobjects16;
+        generate_proposals(anchors, feat_stride, score_blob, bbox_blob, landmark_blob, prob_threshold, faceobjects16);
+
+        faceproposals.insert(faceproposals.end(), faceobjects16.begin(), faceobjects16.end());
+    }
+
+    // stride 8
+    {
+        ncnn::Mat score_blob, bbox_blob, landmark_blob;
+        ex.extract("face_rpn_cls_prob_reshape_stride8", score_blob);
+        ex.extract("face_rpn_bbox_pred_stride8", bbox_blob);
+        ex.extract("face_rpn_landmark_pred_stride8", landmark_blob);
+        // same base_size and ratio as above, with scales 2 and 1
+        const int base_size = 16;
+        const int feat_stride = 8;
+        ncnn::Mat ratios(1);
+        ratios[0] = 1.f;
+        ncnn::Mat scales(2);
+        scales[0] = 2.f;
+        scales[1] = 1.f;
+        ncnn::Mat anchors = generate_anchors(base_size, ratios, scales);
+
+        std::vector<FaceObject> faceobjects8;
+        generate_proposals(anchors, feat_stride, score_blob, bbox_blob, landmark_blob, prob_threshold, faceobjects8);
+
+        faceproposals.insert(faceproposals.end(), faceobjects8.begin(), faceobjects8.end());
+    }
+
+    // sort all proposals by score from highest to lowest
+    qsort_descent_inplace(faceproposals);
+
+    // apply nms with nms_threshold
+    std::vector<int> picked;
+    nms_sorted_bboxes(faceproposals, picked, nms_threshold);
+
+    int face_count = picked.size();
+    // copy the surviving proposals out in picked order
+    faceobjects.resize(face_count);
+    for (int i = 0; i < face_count; i++)
+    {
+        faceobjects[i] = faceproposals[picked[i]];
+
+        // clip to image size
+        float x0 = faceobjects[i].rect.x;
+        float y0 = faceobjects[i].rect.y;
+        float x1 = x0 + faceobjects[i].rect.width;
+        float y1 = y0 + faceobjects[i].rect.height;
+        // every corner is clamped to [0, img_w - 1] x [0, img_h - 1]
+        x0 = std::max(std::min(x0, (float)img_w - 1), 0.f);
+        y0 = std::max(std::min(y0, (float)img_h - 1), 0.f);
+        x1 = std::max(std::min(x1, (float)img_w - 1), 0.f);
+        y1 = std::max(std::min(y1, (float)img_h - 1), 0.f);
+        // only the rect is rewritten; the landmarks are left as decoded
+        faceobjects[i].rect.x = x0;
+        faceobjects[i].rect.y = y0;
+        faceobjects[i].rect.width = x1 - x0;
+        faceobjects[i].rect.height = y1 - y0;
+    }
+
+    return 0;
+}
+
+// Draw each face on a copy of bgr (green box, five yellow landmark dots, score caption), log it to
+// stderr, then show the image.
+static void draw_faceobjects(const cv::Mat& bgr, const std::vector<FaceObject>& faceobjects)
+{
+    cv::Mat image = bgr.clone();
+
+    for (size_t i = 0; i < faceobjects.size(); i++)
+    {
+        const FaceObject& face = faceobjects[i];
+
+        fprintf(stderr, "%.5f at %.2f %.2f %.2f x %.2f\n", face.prob,
+                face.rect.x, face.rect.y, face.rect.width, face.rect.height);
+
+        // outline and landmarks first, caption afterwards
+        cv::rectangle(image, face.rect, cv::Scalar(0, 255, 0));
+        for (int k = 0; k < 5; k++)
+            cv::circle(image, face.landmark[k], 2, cv::Scalar(0, 255, 255), -1);
+
+        char text[256];
+        sprintf(text, "%.1f%%", face.prob * 100);
+
+        int baseLine = 0;
+        const cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+        // caption goes above the box; keep it inside the top and right image edges
+        int x = face.rect.x;
+        int y = face.rect.y - label_size.height - baseLine;
+        if (y < 0)
+            y = 0;
+        if (x + label_size.width > image.cols)
+            x = image.cols - label_size.width;
+
+        // white backing rectangle, then black text on top of it
+        const cv::Size box_size(label_size.width, label_size.height + baseLine);
+        cv::rectangle(image, cv::Rect(cv::Point(x, y), box_size), cv::Scalar(255, 255, 255), -1);
+        cv::putText(image, text, cv::Point(x, y + label_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+// Usage: <program> imagepath. Runs detect_retinaface on the image and displays the faces found.
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* const path = argv[1];
+    cv::Mat frame = cv::imread(path, 1);
+    if (frame.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", path);
+        return -1;
+    }
+
+    // the detector's return value is not checked here
+    std::vector<FaceObject> faces;
+    detect_retinaface(frame, faces);
+    draw_faceobjects(frame, faces);
+
+    return 0;
+}
diff --git a/examples/rfcn.cpp b/examples/rfcn.cpp
new file mode 100644
index 0000000..9854647
--- /dev/null
+++ b/examples/rfcn.cpp
@@ -0,0 +1,362 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#include <math.h>
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdio.h>
+
+struct Object // one detection produced by detect_rfcn
+{
+    cv::Rect_<float> rect; // box in source-image pixels, corners clipped to the image
+    int label;             // index of the highest-scoring class (label 0 is skipped as background)
+    float prob;            // score of that class
+};
+
+// Area of the overlap of the two object boxes (the `&` of their rects).
+static inline float intersection_area(const Object& a, const Object& b)
+{
+    return (a.rect & b.rect).area();
+}
+
+// In-place quicksort of objects[left..right] (both ends inclusive), highest prob first.
+// The pivot is the prob of the middle element; after partitioning, each side is sorted
+// recursively, and the two recursive calls are OpenMP sections so they may run in parallel.
+static void qsort_descent_inplace(std::vector<Object>& objects, int left, int right)
+{
+    const float pivot = objects[(left + right) / 2].prob;
+    int lo = left;
+    int hi = right;
+
+    while (lo <= hi)
+    {
+        // advance each cursor past the entries that compare strictly above / below the pivot
+        while (objects[lo].prob > pivot)
+            lo++;
+        while (objects[hi].prob < pivot)
+            hi--;
+        if (lo > hi)
+            break;
+        // the cursors have not crossed: exchange the pair and step both
+        std::swap(objects[lo], objects[hi]);
+        lo++;
+        hi--;
+    }
+
+    #pragma omp parallel sections
+    {
+        #pragma omp section
+        {
+            if (left < hi) qsort_descent_inplace(objects, left, hi);
+        }
+        #pragma omp section
+        {
+            if (lo < right) qsort_descent_inplace(objects, lo, right);
+        }
+    }
+}
+
+// Sort the whole vector by descending prob; an empty vector is left alone.
+static void qsort_descent_inplace(std::vector<Object>& objects)
+{
+    // guard first: size() - 1 would wrap around on an empty vector
+    if (!objects.empty())
+        qsort_descent_inplace(objects, 0, objects.size() - 1);
+}
+
+// Greedy non-maximum suppression. Boxes are visited in index order (the caller sorts them by
+// descending prob first); a box is kept unless its IoU with an already kept box exceeds
+// nms_threshold. Boxes with different labels are only compared when agnostic is true.
+// picked is reset and then filled with the indices of the kept boxes.
+static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
+{
+    picked.clear();
+
+    const int n = faceobjects.size();
+    std::vector<float> areas(n); // per-box area, computed once up front
+    for (int i = 0; i < n; i++)
+        areas[i] = faceobjects[i].rect.area();
+
+    for (int i = 0; i < n; i++)
+    {
+        const Object& a = faceobjects[i];
+
+        // the scan stops once a kept box suppresses this one: later comparisons cannot change that
+        bool keep = true;
+        for (int j = 0; keep && j < (int)picked.size(); j++)
+        {
+            const Object& b = faceobjects[picked[j]];
+
+            if (!agnostic && a.label != b.label)
+                continue;
+
+            // intersection over union
+            const float inter_area = intersection_area(a, b);
+            const float union_area = areas[i] + areas[picked[j]] - inter_area;
+            keep = !(inter_area / union_area > nms_threshold);
+        }
+
+        if (keep)
+            picked.push_back(i);
+    }
+}
+
+static int detect_rfcn(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    // Run R-FCN on bgr. objects is overwritten with the detections left after per-class NMS, sorted by
+    // descending prob and capped at max_per_image. Exits if the model cannot be loaded; returns 0.
+    ncnn::Net rfcn;
+    rfcn.opt.use_vulkan_compute = true;
+
+    // original pretrained model from https://github.com/YuwenXiong/py-R-FCN
+    // https://github.com/YuwenXiong/py-R-FCN/blob/master/models/pascal_voc/ResNet-50/rfcn_end2end/test_agnostic.prototxt
+    // https://1drv.ms/u/s!AoN7vygOjLIQqUWHpY67oaC7mopf
+    // resnet50_rfcn_final.caffemodel
+    if (rfcn.load_param("rfcn_end2end.param"))
+        exit(-1);
+    if (rfcn.load_model("rfcn_end2end.bin"))
+        exit(-1);
+
+    const int target_size = 224; // length the shorter image side is scaled to
+    const int max_per_image = 100;
+    const float confidence_thresh = 0.6f; // CONF_THRESH
+
+    const float nms_threshold = 0.3f; // NMS_THRESH
+
+    // scale to target detect size
+    int w = bgr.cols;
+    int h = bgr.rows;
+    float scale = 1.f;
+    if (w < h)
+    {
+        scale = (float)target_size / w;
+        w = target_size;
+        h = h * scale;
+    }
+    else
+    {
+        scale = (float)target_size / h;
+        h = target_size;
+        w = w * scale;
+    }
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows, w, h);
+    // only mean values are supplied; the norm argument is passed as 0
+    const float mean_vals[3] = {102.9801f, 115.9465f, 122.7717f};
+    in.substract_mean_normalize(mean_vals, 0);
+    // im_info carries the resized height, the resized width and the scale factor
+    ncnn::Mat im_info(3);
+    im_info[0] = h;
+    im_info[1] = w;
+    im_info[2] = scale;
+
+    // step1, extract feature and all rois
+    ncnn::Extractor ex1 = rfcn.create_extractor();
+
+    ex1.input("data", in);
+    ex1.input("im_info", im_info);
+
+    ncnn::Mat rfcn_cls;
+    ncnn::Mat rfcn_bbox;
+    ncnn::Mat rois; // all rois
+    ex1.extract("rfcn_cls", rfcn_cls);
+    ex1.extract("rfcn_bbox", rfcn_bbox);
+    ex1.extract("rois", rois);
+
+    // step2, extract bbox and score for each roi
+    std::vector<std::vector<Object> > class_candidates;
+    for (int i = 0; i < rois.c; i++)
+    {
+        ncnn::Extractor ex2 = rfcn.create_extractor();
+        // a fresh extractor is fed the step1 blobs plus one roi
+        ncnn::Mat roi = rois.channel(i); // get single roi
+        ex2.input("rfcn_cls", rfcn_cls);
+        ex2.input("rfcn_bbox", rfcn_bbox);
+        ex2.input("rois", roi);
+
+        ncnn::Mat bbox_pred;
+        ncnn::Mat cls_prob;
+        ex2.extract("bbox_pred", bbox_pred);
+        ex2.extract("cls_prob", cls_prob);
+
+        int num_class = cls_prob.w;
+        class_candidates.resize(num_class);
+
+        // find class id with highest score
+        int label = 0;
+        float score = 0.f;
+        for (int k = 0; k < num_class; k++) // k, not i: the roi index of the outer loop stays visible
+        {
+            float class_score = cls_prob[k];
+            if (class_score > score)
+            {
+                label = k;
+                score = class_score;
+            }
+        }
+
+        // ignore background or low score
+        if (label == 0 || score <= confidence_thresh)
+            continue;
+
+        //         fprintf(stderr, "%d = %f\n", label, score);
+
+        // unscale to image size
+        float x1 = roi[0] / scale;
+        float y1 = roi[1] / scale;
+        float x2 = roi[2] / scale;
+        float y2 = roi[3] / scale;
+
+        float pb_w = x2 - x1 + 1;
+        float pb_h = y2 - y1 + 1;
+
+        // apply bbox regression (the deltas are read from elements 4..7 of bbox_pred)
+        float dx = bbox_pred[4];
+        float dy = bbox_pred[4 + 1];
+        float dw = bbox_pred[4 + 2];
+        float dh = bbox_pred[4 + 3];
+
+        float cx = x1 + pb_w * 0.5f;
+        float cy = y1 + pb_h * 0.5f;
+
+        float obj_cx = cx + pb_w * dx;
+        float obj_cy = cy + pb_h * dy;
+
+        float obj_w = pb_w * exp(dw);
+        float obj_h = pb_h * exp(dh);
+
+        float obj_x1 = obj_cx - obj_w * 0.5f;
+        float obj_y1 = obj_cy - obj_h * 0.5f;
+        float obj_x2 = obj_cx + obj_w * 0.5f;
+        float obj_y2 = obj_cy + obj_h * 0.5f;
+
+        // clip
+        obj_x1 = std::max(std::min(obj_x1, (float)(bgr.cols - 1)), 0.f);
+        obj_y1 = std::max(std::min(obj_y1, (float)(bgr.rows - 1)), 0.f);
+        obj_x2 = std::max(std::min(obj_x2, (float)(bgr.cols - 1)), 0.f);
+        obj_y2 = std::max(std::min(obj_y2, (float)(bgr.rows - 1)), 0.f);
+
+        // append object
+        Object obj;
+        obj.rect = cv::Rect_<float>(obj_x1, obj_y1, obj_x2 - obj_x1 + 1, obj_y2 - obj_y1 + 1);
+        obj.label = label;
+        obj.prob = score;
+
+        class_candidates[label].push_back(obj);
+    }
+
+    // post process
+    objects.clear();
+    for (int i = 0; i < (int)class_candidates.size(); i++)
+    {
+        std::vector<Object>& candidates = class_candidates[i];
+
+        qsort_descent_inplace(candidates);
+
+        std::vector<int> picked;
+        nms_sorted_bboxes(candidates, picked, nms_threshold);
+
+        for (int j = 0; j < (int)picked.size(); j++)
+        {
+            int z = picked[j];
+            objects.push_back(candidates[z]);
+        }
+    }
+
+    qsort_descent_inplace(objects);
+    // the cast keeps both sides of the comparison signed
+    if (max_per_image > 0 && max_per_image < (int)objects.size())
+    {
+        objects.resize(max_per_image);
+    }
+
+    return 0;
+}
+
+// Draw each detection on a copy of bgr (blue box plus "<class> <score>%" caption), log it to stderr,
+// then show the image. Labels outside class_names are captioned "unknown" instead of being indexed.
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    static const char* class_names[] = {"background",
+                                        "aeroplane", "bicycle", "bird", "boat",
+                                        "bottle", "bus", "car", "cat", "chair",
+                                        "cow", "diningtable", "dog", "horse",
+                                        "motorbike", "person", "pottedplant",
+                                        "sheep", "sofa", "train", "tvmonitor"
+                                       };
+    const int class_count = sizeof(class_names) / sizeof(class_names[0]);
+
+    cv::Mat image = bgr.clone();
+
+    for (size_t i = 0; i < objects.size(); i++)
+    {
+        const Object& obj = objects[i];
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));
+        // label is a model output: check it before using it as an array index
+        const char* class_name = (obj.label >= 0 && obj.label < class_count) ? class_names[obj.label] : "unknown";
+        char text[256];
+        sprintf(text, "%s %.1f%%", class_name, obj.prob * 100);
+
+        int baseLine = 0;
+        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+        int x = obj.rect.x;
+        int y = obj.rect.y - label_size.height - baseLine;
+        if (y < 0)
+            y = 0;
+        if (x + label_size.width > image.cols)
+            x = image.cols - label_size.width;
+
+        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                      cv::Scalar(255, 255, 255), -1);
+        cv::putText(image, text, cv::Point(x, y + label_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+// Usage: <program> imagepath. Runs detect_rfcn on the image and displays the detections.
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* const path = argv[1];
+    cv::Mat frame = cv::imread(path, 1);
+    if (frame.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", path);
+        return -1;
+    }
+
+    // the detector's return value is not checked here
+    std::vector<Object> detections;
+    detect_rfcn(frame, detections);
+    draw_objects(frame, detections);
+
+    return 0;
+}
diff --git a/examples/rvm.cpp b/examples/rvm.cpp
new file mode 100644
index 0000000..7e12a60
--- /dev/null
+++ b/examples/rvm.cpp
@@ -0,0 +1,134 @@
+#include "net.h"
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdlib.h>
+#include <float.h>
+#include <stdio.h>
+#include <vector>
+
+static void draw_objects(const cv::Mat& bgr, const cv::Mat& fgr, const cv::Mat& pha)
+{
+    cv::Mat pha8U, fgr8U;
+    fgr.convertTo(fgr8U, CV_8UC3, 255.0, 0);
+    pha.convertTo(pha8U, CV_8UC1, 255.0, 0);
+
+    // blend the 8-bit foreground over a constant background colour, weighted per pixel by alpha
+    const int background[3] = {155, 255, 120};
+    cv::Mat comp;
+    cv::resize(bgr, comp, pha.size(), 0, 0, 1);
+    for (int row = 0; row < pha8U.rows; row++)
+    {
+        for (int col = 0; col < pha8U.cols; col++)
+        {
+            const float alpha = (float)pha8U.at<uchar>(row, col) / 255;
+            cv::Vec3b& dst = comp.at<cv::Vec3b>(row, col);
+            for (int c = 0; c < 3; c++)
+                dst[c] = fgr8U.at<cv::Vec3b>(row, col)[c] * alpha + (1 - alpha) * background[c];
+        }
+    }
+
+    cv::imshow("pha", pha8U);
+    cv::imshow("fgr", fgr8U);
+    cv::imshow("comp", comp);
+    cv::waitKey(0);
+}
+static int detect_rvm(const cv::Mat& bgr, cv::Mat& pha, cv::Mat& fgr)
+{
+    // Runs one RVM inference on bgr; pha receives the alpha matte and fgr the foreground, both as float Mats.
+    const float downsample_ratio = 0.5f;
+    const int target_width = 512;
+    const int target_height = 512;
+
+    ncnn::Net net;
+    net.opt.use_vulkan_compute = false;
+    // original pretrained model from https://github.com/PeterL1n/RobustVideoMatting
+    // ncnn model https://pan.baidu.com/s/11iEY2RGfzWFtce8ue7T3JQ password: d9t6
+    if (net.load_param("rvm_512.param"))
+        exit(-1);
+    if (net.load_model("rvm_512.bin"))
+        exit(-1);
+
+    // if you use another input size, please change the shapes of these recurrent inputs
+    ncnn::Mat r1i = ncnn::Mat(128, 128, 16);
+    ncnn::Mat r2i = ncnn::Mat(64, 64, 20);
+    ncnn::Mat r3i = ncnn::Mat(32, 32, 40);
+    ncnn::Mat r4i = ncnn::Mat(16, 16, 64);
+    r1i.fill(0.0f);
+    r2i.fill(0.0f);
+    r3i.fill(0.0f);
+    r4i.fill(0.0f);
+
+    ncnn::Extractor ex = net.create_extractor();
+    const float mean_vals1[3] = {123.675f, 116.28f, 103.53f};
+    const float norm_vals1[3] = {0.01712475f, 0.0175f, 0.01742919f};
+    const float mean_vals2[3] = {0, 0, 0};
+    const float norm_vals2[3] = {1 / 255.0, 1 / 255.0, 1 / 255.0};
+    ncnn::Mat ncnn_in2 = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, bgr.cols, bgr.rows, target_width, target_height);
+    ncnn::Mat ncnn_in1 = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, bgr.cols, bgr.rows, target_width * downsample_ratio, target_height * downsample_ratio);
+    ncnn_in1.substract_mean_normalize(mean_vals1, norm_vals1);
+    ncnn_in2.substract_mean_normalize(mean_vals2, norm_vals2);
+
+    ex.input("src1", ncnn_in1);
+    ex.input("src2", ncnn_in2);
+    ex.input("r1i", r1i);
+    ex.input("r2i", r2i);
+    ex.input("r3i", r3i);
+    ex.input("r4i", r4i);
+
+    // for video matting, these outputs would be fed back as the inputs of the next inference
+    ex.extract("r4o", r4i);
+    ex.extract("r3o", r3i);
+    ex.extract("r2o", r2i);
+    ex.extract("r1o", r1i);
+
+    ncnn::Mat pha_, fgr_;
+    ex.extract("pha", pha_);
+    ex.extract("fgr", fgr_);
+
+    cv::Mat cv_pha = cv::Mat(pha_.h, pha_.w, CV_32FC1, (float*)pha_.data);
+    cv::Mat cv_fgr = cv::Mat(fgr_.h, fgr_.w, CV_32FC3);
+    // ncnn channel planes start cstep elements apart; cstep may be larger than w * h because of alignment padding
+    float* fgr_data = (float*)fgr_.data;
+    for (int i = 0; i < fgr_.h; i++)
+    {
+        for (int j = 0; j < fgr_.w; j++)
+        {
+            cv_fgr.at<cv::Vec3f>(i, j)[2] = fgr_data[0 * fgr_.cstep + i * fgr_.w + j];
+            cv_fgr.at<cv::Vec3f>(i, j)[1] = fgr_data[1 * fgr_.cstep + i * fgr_.w + j];
+            cv_fgr.at<cv::Vec3f>(i, j)[0] = fgr_data[2 * fgr_.cstep + i * fgr_.w + j];
+        }
+    }
+
+    cv_pha.copyTo(pha);
+    cv_fgr.copyTo(fgr);
+
+    return 0;
+}
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* const image_file = argv[1];
+    cv::Mat input = cv::imread(image_file, 1);
+    if (input.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", image_file);
+        return -1;
+    }
+
+    // infer the foreground and alpha matte, then display them with the composite
+    cv::Mat foreground;
+    cv::Mat alpha_matte;
+    detect_rvm(input, alpha_matte, foreground);
+    draw_objects(input, foreground, alpha_matte);
+    return 0;
+}
diff --git a/examples/scrfd.cpp b/examples/scrfd.cpp
new file mode 100644
index 0000000..8b06ecb
--- /dev/null
+++ b/examples/scrfd.cpp
@@ -0,0 +1,436 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdio.h>
+#include <vector>
+
+struct FaceObject // one detection: a bounding box and its score
+{
+    cv::Rect_<float> rect; // box stored as x, y, width, height (float)
+    float prob; // score read from the network's score blob
+};
+
+static inline float intersection_area(const FaceObject& a, const FaceObject& b)
+{
+    // operator& yields the rectangle shared by both boxes; its area is the intersection
+    return (a.rect & b.rect).area();
+}
+
+static void qsort_descent_inplace(std::vector<FaceObject>& faceobjects, int left, int right)
+{
+    // Quicksort by descending prob: partition [left, right] around the score of
+    // the middle element, then sort the two resulting sub-ranges.
+    const float pivot = faceobjects[(left + right) / 2].prob;
+    int lo = left;
+    int hi = right;
+
+    while (lo <= hi)
+    {
+        // skip entries that are already on the correct side of the pivot
+        for (; faceobjects[lo].prob > pivot; lo++) {}
+        for (; faceobjects[hi].prob < pivot; hi--) {}
+
+        // cursors crossed: nothing left to exchange
+        if (lo > hi)
+            break;
+
+        std::swap(faceobjects[lo], faceobjects[hi]);
+        lo++;
+        hi--;
+    }
+
+    // [left, hi] and [lo, right] do not overlap, so each gets its own section
+    #pragma omp parallel sections
+    {
+        #pragma omp section
+        {
+            if (left < hi) qsort_descent_inplace(faceobjects, left, hi);
+        }
+        #pragma omp section
+        {
+            if (lo < right) qsort_descent_inplace(faceobjects, lo, right);
+        }
+    }
+}
+
+static void qsort_descent_inplace(std::vector<FaceObject>& faceobjects)
+{
+    // Sort the whole vector by descending prob; an empty vector has no
+    // valid [0, size - 1] range, so it is left untouched.
+    if (!faceobjects.empty())
+        qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1);
+}
+
+static void nms_sorted_bboxes(const std::vector<FaceObject>& faceobjects, std::vector<int>& picked, float nms_threshold)
+{
+    // Greedy non-maximum suppression. Expects faceobjects ordered by descending
+    // score (callers sort first); picked receives the indices of the boxes kept.
+    picked.clear();
+
+    const int n = faceobjects.size();
+
+    std::vector<float> areas(n);
+    for (int i = 0; i < n; i++)
+        areas[i] = faceobjects[i].rect.area();
+
+    for (int i = 0; i < n; i++)
+    {
+        const FaceObject& a = faceobjects[i];
+
+        // once an already-picked box suppresses this one the verdict is final, so stop scanning
+        int keep = 1;
+        for (int j = 0; keep && j < (int)picked.size(); j++)
+        {
+            const FaceObject& b = faceobjects[picked[j]];
+
+            // intersection over union
+            float inter_area = intersection_area(a, b);
+            float union_area = areas[i] + areas[picked[j]] - inter_area;
+            if (inter_area / union_area > nms_threshold)
+                keep = 0;
+        }
+
+        if (keep)
+            picked.push_back(i);
+    }
+}
+
+// insightface/detection/scrfd/mmdet/core/anchor/anchor_generator.py gen_single_level_base_anchors()
+static ncnn::Mat generate_anchors(int base_size, const ncnn::Mat& ratios, const ncnn::Mat& scales)
+{
+    // Builds one origin-centred anchor per (ratio, scale) pair; each row of the
+    // returned Mat holds [x0, y0, x1, y1].
+    const int ratio_count = ratios.w;
+    const int scale_count = scales.w;
+    const float center_x = 0;
+    const float center_y = 0;
+
+    ncnn::Mat anchors;
+    anchors.create(4, ratio_count * scale_count);
+
+    for (int ri = 0; ri < ratio_count; ri++)
+    {
+        const float aspect = ratios[ri];
+
+        // rounded base width, and the height derived from that rounded width
+        const int base_w = round(base_size / sqrt(aspect));
+        const int base_h = round(base_w * aspect); // not round(base_size * sqrt(aspect))
+
+        for (int si = 0; si < scale_count; si++)
+        {
+            const float factor = scales[si];
+            const float scaled_w = base_w * factor;
+            const float scaled_h = base_h * factor;
+
+            float* box = anchors.row(ri * scale_count + si);
+            box[0] = center_x - scaled_w * 0.5f;
+            box[1] = center_y - scaled_h * 0.5f;
+            box[2] = center_x + scaled_w * 0.5f;
+            box[3] = center_y + scaled_h * 0.5f;
+        }
+    }
+
+    return anchors;
+}
+
+static void generate_proposals(const ncnn::Mat& anchors, int feat_stride, const ncnn::Mat& score_blob, const ncnn::Mat& bbox_blob, float prob_threshold, std::vector<FaceObject>& faceobjects)
+{
+    // For every anchor and every feature-map cell whose score reaches prob_threshold,
+    // decode the predicted distances into a box and append it to faceobjects.
+    const int feat_w = score_blob.w;
+    const int feat_h = score_blob.h;
+    const int anchor_count = anchors.h;
+
+    for (int a = 0; a < anchor_count; a++)
+    {
+        const float* base_anchor = anchors.row(a);
+        const float anchor_w = base_anchor[2] - base_anchor[0];
+        const float anchor_h = base_anchor[3] - base_anchor[1];
+
+        // score plane of this anchor plus its four consecutive bbox planes
+        const ncnn::Mat score = score_blob.channel(a);
+        const ncnn::Mat bbox = bbox_blob.channel_range(a * 4, 4);
+        const float* dist_left = bbox.channel(0);
+        const float* dist_top = bbox.channel(1);
+        const float* dist_right = bbox.channel(2);
+        const float* dist_bottom = bbox.channel(3);
+
+        // the anchor is shifted by feat_stride for each cell, row by row
+        float anchor_y = base_anchor[1];
+        for (int row = 0; row < feat_h; row++, anchor_y += feat_stride)
+        {
+            float anchor_x = base_anchor[0];
+
+            for (int col = 0; col < feat_w; col++, anchor_x += feat_stride)
+            {
+                const int index = row * feat_w + col;
+                const float prob = score[index];
+
+                // low-scoring cells produce no proposal
+                if (!(prob >= prob_threshold))
+                    continue;
+
+                // insightface/detection/scrfd/mmdet/models/dense_heads/scrfd_head.py _get_bboxes_single()
+                const float dx = dist_left[index] * feat_stride;
+                const float dy = dist_top[index] * feat_stride;
+                const float dw = dist_right[index] * feat_stride;
+                const float dh = dist_bottom[index] * feat_stride;
+
+                // insightface/detection/scrfd/mmdet/core/bbox/transforms.py distance2bbox()
+                const float cx = anchor_x + anchor_w * 0.5f;
+                const float cy = anchor_y + anchor_h * 0.5f;
+
+                // corners are offset from the anchor centre by the decoded distances
+                const float x0 = cx - dx;
+                const float y0 = cy - dy;
+                const float x1 = cx + dw;
+                const float y1 = cy + dh;
+
+                FaceObject obj;
+                obj.rect.x = x0;
+                obj.rect.y = y0;
+                obj.rect.width = x1 - x0 + 1;
+                obj.rect.height = y1 - y0 + 1;
+                obj.prob = prob;
+
+                faceobjects.push_back(obj);
+            }
+        }
+    }
+}
+
+static int detect_scrfd(const cv::Mat& bgr, std::vector<FaceObject>& faceobjects)
+{
+    // Detects faces in bgr and stores them in faceobjects in original-image coordinates; returns 0 (exits the process if the model cannot be loaded).
+    ncnn::Net scrfd;
+    scrfd.opt.use_vulkan_compute = true;
+
+    // model is converted from
+    // https://github.com/deepinsight/insightface/tree/master/detection/scrfd
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+    if (scrfd.load_param("scrfd_500m-opt2.param"))
+        exit(-1);
+    if (scrfd.load_model("scrfd_500m-opt2.bin"))
+        exit(-1);
+
+    int width = bgr.cols;
+    int height = bgr.rows;
+
+    // insightface/detection/scrfd/configs/scrfd/scrfd_500m.py
+    const int target_size = 640;
+    const float prob_threshold = 0.3f;
+    const float nms_threshold = 0.45f;
+
+    // resize so that the longer side becomes target_size, keeping the aspect ratio
+    int w = width;
+    int h = height;
+    float scale = 1.f;
+    if (w > h)
+    {
+        scale = (float)target_size / w;
+        w = target_size;
+        h = h * scale;
+    }
+    else
+    {
+        scale = (float)target_size / h;
+        h = target_size;
+        w = w * scale;
+    }
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, width, height, w, h);
+
+    // pad width and height up to the next multiple of 32, split between both sides
+    int wpad = (w + 31) / 32 * 32 - w;
+    int hpad = (h + 31) / 32 * 32 - h;
+    ncnn::Mat in_pad;
+    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 0.f);
+
+    const float mean_vals[3] = {127.5f, 127.5f, 127.5f};
+    const float norm_vals[3] = {1 / 128.f, 1 / 128.f, 1 / 128.f};
+    in_pad.substract_mean_normalize(mean_vals, norm_vals);
+
+    ncnn::Extractor ex = scrfd.create_extractor();
+
+    ex.input("input.1", in_pad);
+
+    std::vector<FaceObject> faceproposals;
+
+    // stride 8 (label follows the feat_stride used below)
+    {
+        ncnn::Mat score_blob, bbox_blob;
+        ex.extract("412", score_blob);
+        ex.extract("415", bbox_blob);
+
+        const int base_size = 16;
+        const int feat_stride = 8;
+        ncnn::Mat ratios(1);
+        ratios[0] = 1.f;
+        ncnn::Mat scales(2);
+        scales[0] = 1.f;
+        scales[1] = 2.f;
+        ncnn::Mat anchors = generate_anchors(base_size, ratios, scales);
+
+        std::vector<FaceObject> faceobjects8;
+        generate_proposals(anchors, feat_stride, score_blob, bbox_blob, prob_threshold, faceobjects8);
+
+        faceproposals.insert(faceproposals.end(), faceobjects8.begin(), faceobjects8.end());
+    }
+
+    // stride 16
+    {
+        ncnn::Mat score_blob, bbox_blob;
+        ex.extract("474", score_blob);
+        ex.extract("477", bbox_blob);
+
+        const int base_size = 64;
+        const int feat_stride = 16;
+        ncnn::Mat ratios(1);
+        ratios[0] = 1.f;
+        ncnn::Mat scales(2);
+        scales[0] = 1.f;
+        scales[1] = 2.f;
+        ncnn::Mat anchors = generate_anchors(base_size, ratios, scales);
+
+        std::vector<FaceObject> faceobjects16;
+        generate_proposals(anchors, feat_stride, score_blob, bbox_blob, prob_threshold, faceobjects16);
+
+        faceproposals.insert(faceproposals.end(), faceobjects16.begin(), faceobjects16.end());
+    }
+
+    // stride 32 (label follows the feat_stride used below)
+    {
+        ncnn::Mat score_blob, bbox_blob;
+        ex.extract("536", score_blob);
+        ex.extract("539", bbox_blob);
+
+        const int base_size = 256;
+        const int feat_stride = 32;
+        ncnn::Mat ratios(1);
+        ratios[0] = 1.f;
+        ncnn::Mat scales(2);
+        scales[0] = 1.f;
+        scales[1] = 2.f;
+        ncnn::Mat anchors = generate_anchors(base_size, ratios, scales);
+
+        std::vector<FaceObject> faceobjects32;
+        generate_proposals(anchors, feat_stride, score_blob, bbox_blob, prob_threshold, faceobjects32);
+
+        faceproposals.insert(faceproposals.end(), faceobjects32.begin(), faceobjects32.end());
+    }
+
+    // sort all proposals by score from highest to lowest
+    qsort_descent_inplace(faceproposals);
+
+    // apply nms with nms_threshold
+    std::vector<int> picked;
+    nms_sorted_bboxes(faceproposals, picked, nms_threshold);
+
+    int face_count = picked.size();
+
+    faceobjects.resize(face_count);
+    for (int i = 0; i < face_count; i++)
+    {
+        faceobjects[i] = faceproposals[picked[i]];
+
+        // adjust offset to original unpadded
+        float x0 = (faceobjects[i].rect.x - (wpad / 2)) / scale;
+        float y0 = (faceobjects[i].rect.y - (hpad / 2)) / scale;
+        float x1 = (faceobjects[i].rect.x + faceobjects[i].rect.width - (wpad / 2)) / scale;
+        float y1 = (faceobjects[i].rect.y + faceobjects[i].rect.height - (hpad / 2)) / scale;
+
+        x0 = std::max(std::min(x0, (float)width - 1), 0.f);
+        y0 = std::max(std::min(y0, (float)height - 1), 0.f);
+        x1 = std::max(std::min(x1, (float)width - 1), 0.f);
+        y1 = std::max(std::min(y1, (float)height - 1), 0.f);
+
+        faceobjects[i].rect.x = x0;
+        faceobjects[i].rect.y = y0;
+        faceobjects[i].rect.width = x1 - x0;
+        faceobjects[i].rect.height = y1 - y0;
+    }
+
+    return 0;
+}
+
+static void draw_faceobjects(const cv::Mat& bgr, const std::vector<FaceObject>& faceobjects)
+{
+    // draw on a clone of the input rather than on bgr itself
+    cv::Mat canvas = bgr.clone();
+    const int font = cv::FONT_HERSHEY_SIMPLEX;
+
+    for (size_t k = 0; k < faceobjects.size(); k++)
+    {
+        const FaceObject& face = faceobjects[k];
+        const cv::Rect_<float>& box = face.rect;
+
+        fprintf(stderr, "%.5f at %.2f %.2f %.2f x %.2f\n", face.prob, box.x, box.y, box.width, box.height);
+        cv::rectangle(canvas, box, cv::Scalar(0, 255, 0));
+
+        // the label is the score formatted as a percentage with one decimal
+        char label[256];
+        sprintf(label, "%.1f%%", face.prob * 100);
+
+        int baseline = 0;
+        const cv::Size text_size = cv::getTextSize(label, font, 0.5, 1, &baseline);
+
+        // place the label above the box, clamped at the top and right image edges
+        int label_x = box.x;
+        int label_y = box.y - text_size.height - baseline;
+        label_y = label_y < 0 ? 0 : label_y;
+        if (label_x + text_size.width > canvas.cols)
+            label_x = canvas.cols - text_size.width;
+
+        const cv::Rect backdrop(cv::Point(label_x, label_y), cv::Size(text_size.width, text_size.height + baseline));
+        cv::rectangle(canvas, backdrop, cv::Scalar(255, 255, 255), -1);
+        cv::putText(canvas, label, cv::Point(label_x, label_y + text_size.height), font, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", canvas);
+    cv::waitKey(0);
+}
+
+int main(int argc, char** argv)
+{
+    // the only accepted argument is the path of the image to process
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* const image_file = argv[1];
+    cv::Mat input = cv::imread(image_file, 1);
+    if (input.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", image_file);
+        return -1;
+    }
+
+    // detect, then show the detections drawn over the input image
+    std::vector<FaceObject> detections;
+    detect_scrfd(input, detections);
+    draw_faceobjects(input, detections);
+
+    return 0;
+}
diff --git a/examples/scrfd_crowdhuman.cpp b/examples/scrfd_crowdhuman.cpp
new file mode 100644
index 0000000..7a4d683
--- /dev/null
+++ b/examples/scrfd_crowdhuman.cpp
@@ -0,0 +1,473 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdio.h>
+#include <vector>
+
+struct FaceObject // one detection: a bounding box and its score
+{
+    cv::Rect_<float> rect; // box stored as x, y, width, height (float)
+    float prob; // score read from the network's score blob
+};
+
+static inline float intersection_area(const FaceObject& a, const FaceObject& b)
+{
+    // operator& yields the rectangle shared by both boxes; its area is the intersection
+    return (a.rect & b.rect).area();
+}
+
+static void qsort_descent_inplace(std::vector<FaceObject>& faceobjects, int left, int right)
+{
+    // Quicksort by descending prob: partition [left, right] around the score of
+    // the middle element, then sort the two resulting sub-ranges.
+    const float pivot = faceobjects[(left + right) / 2].prob;
+    int lo = left;
+    int hi = right;
+
+    while (lo <= hi)
+    {
+        // skip entries that are already on the correct side of the pivot
+        for (; faceobjects[lo].prob > pivot; lo++) {}
+        for (; faceobjects[hi].prob < pivot; hi--) {}
+
+        // cursors crossed: nothing left to exchange
+        if (lo > hi)
+            break;
+
+        std::swap(faceobjects[lo], faceobjects[hi]);
+        lo++;
+        hi--;
+    }
+
+    // [left, hi] and [lo, right] do not overlap, so each gets its own section
+    #pragma omp parallel sections
+    {
+        #pragma omp section
+        {
+            if (left < hi) qsort_descent_inplace(faceobjects, left, hi);
+        }
+        #pragma omp section
+        {
+            if (lo < right) qsort_descent_inplace(faceobjects, lo, right);
+        }
+    }
+}
+
+static void qsort_descent_inplace(std::vector<FaceObject>& faceobjects)
+{
+    // Sort the whole vector by descending prob; an empty vector has no
+    // valid [0, size - 1] range, so it is left untouched.
+    if (!faceobjects.empty())
+        qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1);
+}
+
+static void nms_sorted_bboxes(const std::vector<FaceObject>& faceobjects, std::vector<int>& picked, float nms_threshold)
+{
+    // Greedy non-maximum suppression. Expects faceobjects ordered by descending
+    // score (callers sort first); picked receives the indices of the boxes kept.
+    picked.clear();
+
+    const int n = faceobjects.size();
+
+    std::vector<float> areas(n);
+    for (int i = 0; i < n; i++)
+        areas[i] = faceobjects[i].rect.area();
+
+    for (int i = 0; i < n; i++)
+    {
+        const FaceObject& a = faceobjects[i];
+
+        // once an already-picked box suppresses this one the verdict is final, so stop scanning
+        int keep = 1;
+        for (int j = 0; keep && j < (int)picked.size(); j++)
+        {
+            const FaceObject& b = faceobjects[picked[j]];
+
+            // intersection over union
+            float inter_area = intersection_area(a, b);
+            float union_area = areas[i] + areas[picked[j]] - inter_area;
+            if (inter_area / union_area > nms_threshold)
+                keep = 0;
+        }
+
+        if (keep)
+            picked.push_back(i);
+    }
+}
+
+// insightface/detection/scrfd/mmdet/core/anchor/anchor_generator.py gen_single_level_base_anchors()
+static ncnn::Mat generate_anchors(int base_size, const ncnn::Mat& ratios, const ncnn::Mat& scales)
+{
+    // Builds one origin-centred anchor per (ratio, scale) pair; each row of the
+    // returned Mat holds [x0, y0, x1, y1].
+    const int ratio_count = ratios.w;
+    const int scale_count = scales.w;
+    const float center_x = 0;
+    const float center_y = 0;
+
+    ncnn::Mat anchors;
+    anchors.create(4, ratio_count * scale_count);
+
+    for (int ri = 0; ri < ratio_count; ri++)
+    {
+        const float aspect = ratios[ri];
+
+        // rounded base width, and the height derived from that rounded width
+        const int base_w = round(base_size / sqrt(aspect));
+        const int base_h = round(base_w * aspect); // not round(base_size * sqrt(aspect))
+
+        for (int si = 0; si < scale_count; si++)
+        {
+            const float factor = scales[si];
+            const float scaled_w = base_w * factor;
+            const float scaled_h = base_h * factor;
+
+            float* box = anchors.row(ri * scale_count + si);
+            box[0] = center_x - scaled_w * 0.5f;
+            box[1] = center_y - scaled_h * 0.5f;
+            box[2] = center_x + scaled_w * 0.5f;
+            box[3] = center_y + scaled_h * 0.5f;
+        }
+    }
+
+    return anchors;
+}
+
+static void generate_proposals(const ncnn::Mat& anchors, int feat_stride, const ncnn::Mat& score_blob, const ncnn::Mat& bbox_blob, float prob_threshold, std::vector<FaceObject>& faceobjects)
+{
+    // For every anchor and every feature-map cell whose score reaches prob_threshold,
+    // decode the predicted distances into a box and append it to faceobjects.
+    const int feat_w = score_blob.w;
+    const int feat_h = score_blob.h;
+    const int anchor_count = anchors.h;
+
+    for (int a = 0; a < anchor_count; a++)
+    {
+        const float* base_anchor = anchors.row(a);
+        const float anchor_w = base_anchor[2] - base_anchor[0];
+        const float anchor_h = base_anchor[3] - base_anchor[1];
+
+        // score plane of this anchor plus its four consecutive bbox planes
+        const ncnn::Mat score = score_blob.channel(a);
+        const ncnn::Mat bbox = bbox_blob.channel_range(a * 4, 4);
+        const float* dist_left = bbox.channel(0);
+        const float* dist_top = bbox.channel(1);
+        const float* dist_right = bbox.channel(2);
+        const float* dist_bottom = bbox.channel(3);
+
+        // the anchor is shifted by feat_stride for each cell, row by row
+        float anchor_y = base_anchor[1];
+        for (int row = 0; row < feat_h; row++, anchor_y += feat_stride)
+        {
+            float anchor_x = base_anchor[0];
+
+            for (int col = 0; col < feat_w; col++, anchor_x += feat_stride)
+            {
+                const int index = row * feat_w + col;
+                const float prob = score[index];
+
+                // low-scoring cells produce no proposal
+                if (!(prob >= prob_threshold))
+                    continue;
+
+                // insightface/detection/scrfd/mmdet/models/dense_heads/scrfd_head.py _get_bboxes_single()
+                const float dx = dist_left[index] * feat_stride;
+                const float dy = dist_top[index] * feat_stride;
+                const float dw = dist_right[index] * feat_stride;
+                const float dh = dist_bottom[index] * feat_stride;
+
+                // insightface/detection/scrfd/mmdet/core/bbox/transforms.py distance2bbox()
+                const float cx = anchor_x + anchor_w * 0.5f;
+                const float cy = anchor_y + anchor_h * 0.5f;
+
+                // corners are offset from the anchor centre by the decoded distances
+                const float x0 = cx - dx;
+                const float y0 = cy - dy;
+                const float x1 = cx + dw;
+                const float y1 = cy + dh;
+
+                FaceObject obj;
+                obj.rect.x = x0;
+                obj.rect.y = y0;
+                obj.rect.width = x1 - x0 + 1;
+                obj.rect.height = y1 - y0 + 1;
+                obj.prob = prob;
+
+                faceobjects.push_back(obj);
+            }
+        }
+    }
+}
+
+static int detect_scrfd(const cv::Mat& bgr, std::vector<FaceObject>& faceobjects)
+{
+    // Runs the detector on bgr and stores the results in faceobjects in original-image coordinates; returns 0 (exits the process if the model cannot be loaded).
+    ncnn::Net scrfd;
+    scrfd.opt.use_vulkan_compute = true;
+
+    // InsightFace does not provide a trained scrfd_crowdhuman model,
+    // but the example author shares one for detecting cat faces that you can try:
+    // https://drive.google.com/file/d/1JogkKa0f_09HkENbCnXy9hRYxm35wKTn
+
+    if (scrfd.load_param("scrfd_crowdhuman.param"))
+        exit(-1);
+    if (scrfd.load_model("scrfd_crowdhuman.bin"))
+        exit(-1);
+
+    int width = bgr.cols;
+    int height = bgr.rows;
+
+    const int target_size = 640;
+    const float prob_threshold = 0.3f;
+    const float nms_threshold = 0.45f;
+
+    // resize so that the longer side becomes target_size, keeping the aspect ratio
+    int w = width;
+    int h = height;
+    float scale = 1.f;
+    if (w > h)
+    {
+        scale = (float)target_size / w;
+        w = target_size;
+        h = h * scale;
+    }
+    else
+    {
+        scale = (float)target_size / h;
+        h = target_size;
+        w = w * scale;
+    }
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, width, height, w, h);
+
+    // pad width and height up to the next multiple of 32, split between both sides
+    int wpad = (w + 31) / 32 * 32 - w;
+    int hpad = (h + 31) / 32 * 32 - h;
+    ncnn::Mat in_pad;
+    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 0.f);
+
+    const float mean_vals[3] = {127.5f, 127.5f, 127.5f};
+    const float norm_vals[3] = {1 / 128.f, 1 / 128.f, 1 / 128.f};
+    in_pad.substract_mean_normalize(mean_vals, norm_vals);
+
+    ncnn::Extractor ex = scrfd.create_extractor();
+
+    ex.input("input.1", in_pad);
+
+    std::vector<FaceObject> faceproposals;
+
+    // stride 8
+    {
+        ncnn::Mat score_blob, bbox_blob;
+        ex.extract("490", score_blob);
+        ex.extract("493", bbox_blob);
+
+        const int base_size = 8;
+        const int feat_stride = 8;
+        ncnn::Mat ratios(1);
+        ratios[0] = 2.f;
+        ncnn::Mat scales(1);
+        scales[0] = 3.f;
+        ncnn::Mat anchors = generate_anchors(base_size, ratios, scales);
+
+        std::vector<FaceObject> faceobjects8;
+        generate_proposals(anchors, feat_stride, score_blob, bbox_blob, prob_threshold, faceobjects8);
+
+        faceproposals.insert(faceproposals.end(), faceobjects8.begin(), faceobjects8.end());
+    }
+
+    // stride 16
+    {
+        ncnn::Mat score_blob, bbox_blob;
+        ex.extract("510", score_blob);
+        ex.extract("513", bbox_blob);
+
+        const int base_size = 16;
+        const int feat_stride = 16;
+        ncnn::Mat ratios(1);
+        ratios[0] = 2.f;
+        ncnn::Mat scales(1);
+        scales[0] = 3.f;
+        ncnn::Mat anchors = generate_anchors(base_size, ratios, scales);
+
+        std::vector<FaceObject> faceobjects16;
+        generate_proposals(anchors, feat_stride, score_blob, bbox_blob, prob_threshold, faceobjects16);
+
+        faceproposals.insert(faceproposals.end(), faceobjects16.begin(), faceobjects16.end());
+    }
+
+    // stride 32
+    {
+        ncnn::Mat score_blob, bbox_blob;
+        ex.extract("530", score_blob);
+        ex.extract("533", bbox_blob);
+
+        const int base_size = 32;
+        const int feat_stride = 32;
+        ncnn::Mat ratios(1);
+        ratios[0] = 2.f;
+        ncnn::Mat scales(1);
+        scales[0] = 3.f;
+        ncnn::Mat anchors = generate_anchors(base_size, ratios, scales);
+
+        std::vector<FaceObject> faceobjects32;
+        generate_proposals(anchors, feat_stride, score_blob, bbox_blob, prob_threshold, faceobjects32);
+
+        faceproposals.insert(faceproposals.end(), faceobjects32.begin(), faceobjects32.end());
+    }
+
+    // stride 64
+    {
+        ncnn::Mat score_blob, bbox_blob;
+        ex.extract("550", score_blob);
+        ex.extract("553", bbox_blob);
+
+        const int base_size = 64;
+        const int feat_stride = 64;
+        ncnn::Mat ratios(1);
+        ratios[0] = 2.f;
+        ncnn::Mat scales(1);
+        scales[0] = 3.f;
+        ncnn::Mat anchors = generate_anchors(base_size, ratios, scales);
+
+        std::vector<FaceObject> faceobjects64;
+        generate_proposals(anchors, feat_stride, score_blob, bbox_blob, prob_threshold, faceobjects64);
+
+        faceproposals.insert(faceproposals.end(), faceobjects64.begin(), faceobjects64.end());
+    }
+
+    // stride 128
+    {
+        ncnn::Mat score_blob, bbox_blob;
+        ex.extract("570", score_blob);
+        ex.extract("573", bbox_blob);
+
+        const int base_size = 128;
+        const int feat_stride = 128;
+        ncnn::Mat ratios(1);
+        ratios[0] = 2.f;
+        ncnn::Mat scales(1);
+        scales[0] = 3.f;
+        ncnn::Mat anchors = generate_anchors(base_size, ratios, scales);
+
+        std::vector<FaceObject> faceobjects128;
+        generate_proposals(anchors, feat_stride, score_blob, bbox_blob, prob_threshold, faceobjects128);
+
+        faceproposals.insert(faceproposals.end(), faceobjects128.begin(), faceobjects128.end());
+    }
+
+    // sort all proposals by score from highest to lowest
+    qsort_descent_inplace(faceproposals);
+
+    // apply nms with nms_threshold
+    std::vector<int> picked;
+    nms_sorted_bboxes(faceproposals, picked, nms_threshold);
+
+    int face_count = picked.size();
+
+    faceobjects.resize(face_count);
+    for (int i = 0; i < face_count; i++)
+    {
+        faceobjects[i] = faceproposals[picked[i]];
+
+        // adjust offset to original unpadded
+        float x0 = (faceobjects[i].rect.x - (wpad / 2)) / scale;
+        float y0 = (faceobjects[i].rect.y - (hpad / 2)) / scale;
+        float x1 = (faceobjects[i].rect.x + faceobjects[i].rect.width - (wpad / 2)) / scale;
+        float y1 = (faceobjects[i].rect.y + faceobjects[i].rect.height - (hpad / 2)) / scale;
+
+        x0 = std::max(std::min(x0, (float)width - 1), 0.f);
+        y0 = std::max(std::min(y0, (float)height - 1), 0.f);
+        x1 = std::max(std::min(x1, (float)width - 1), 0.f);
+        y1 = std::max(std::min(y1, (float)height - 1), 0.f);
+
+        faceobjects[i].rect.x = x0;
+        faceobjects[i].rect.y = y0;
+        faceobjects[i].rect.width = x1 - x0;
+        faceobjects[i].rect.height = y1 - y0;
+    }
+
+    return 0;
+}
+
+static void draw_faceobjects(const cv::Mat& bgr, const std::vector<FaceObject>& faceobjects)
+{
+    // draw on a clone of the input rather than on bgr itself
+    cv::Mat canvas = bgr.clone();
+    const int font = cv::FONT_HERSHEY_SIMPLEX;
+
+    for (size_t k = 0; k < faceobjects.size(); k++)
+    {
+        const FaceObject& face = faceobjects[k];
+        const cv::Rect_<float>& box = face.rect;
+
+        fprintf(stderr, "%.5f at %.2f %.2f %.2f x %.2f\n", face.prob, box.x, box.y, box.width, box.height);
+        cv::rectangle(canvas, box, cv::Scalar(0, 255, 0));
+
+        // the label is the score formatted as a percentage with one decimal
+        char label[256];
+        sprintf(label, "%.1f%%", face.prob * 100);
+
+        int baseline = 0;
+        const cv::Size text_size = cv::getTextSize(label, font, 0.5, 1, &baseline);
+
+        // place the label above the box, clamped at the top and right image edges
+        int label_x = box.x;
+        int label_y = box.y - text_size.height - baseline;
+        label_y = label_y < 0 ? 0 : label_y;
+        if (label_x + text_size.width > canvas.cols)
+            label_x = canvas.cols - text_size.width;
+
+        const cv::Rect backdrop(cv::Point(label_x, label_y), cv::Size(text_size.width, text_size.height + baseline));
+        cv::rectangle(canvas, backdrop, cv::Scalar(255, 255, 255), -1);
+        cv::putText(canvas, label, cv::Point(label_x, label_y + text_size.height), font, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", canvas);
+    cv::waitKey(0);
+}
+
+int main(int argc, char** argv)
+{
+    // the only accepted argument is the path of the image to process
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* const image_file = argv[1];
+    cv::Mat input = cv::imread(image_file, 1);
+    if (input.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", image_file);
+        return -1;
+    }
+
+    // detect, then show the detections drawn over the input image
+    std::vector<FaceObject> detections;
+    detect_scrfd(input, detections);
+    draw_faceobjects(input, detections);
+
+    return 0;
+}
diff --git a/examples/shufflenetv2.cpp b/examples/shufflenetv2.cpp
new file mode 100644
index 0000000..eaf3ec8
--- /dev/null
+++ b/examples/shufflenetv2.cpp
@@ -0,0 +1,125 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#include <algorithm>
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#endif
+#include <stdio.h>
+#include <vector>
+
+// Classify bgr with ShuffleNet v2 x0.5: takes the "fc" blob, applies softmax and
+// writes the flattened result to cls_scores. Exits with -1 if model files fail to load.
+static int detect_shufflenetv2(const cv::Mat& bgr, std::vector<float>& cls_scores)
+{
+    ncnn::Net net;
+    net.opt.use_vulkan_compute = true;
+
+    // https://github.com/miaow1988/ShuffleNet_V2_pytorch_caffe
+    // models can be downloaded from https://github.com/miaow1988/ShuffleNet_V2_pytorch_caffe/releases
+    if (net.load_param("shufflenet_v2_x0.5.param") || net.load_model("shufflenet_v2_x0.5.bin"))
+        exit(-1);
+
+    // resize the BGR pixels to 224x224 and scale each channel by 1/255 (no mean subtraction)
+    const int target_size = 224;
+    ncnn::Mat input = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows, target_size, target_size);
+
+    const float scale = 1 / 255.f;
+    const float norm_vals[3] = {scale, scale, scale};
+    input.substract_mean_normalize(0, norm_vals);
+
+    ncnn::Extractor extractor = net.create_extractor();
+    extractor.input("data", input);
+
+    ncnn::Mat scores;
+    extractor.extract("fc", scores);
+
+    // manually call softmax on the fc output
+    // convert result into probability
+    // skip if your model already has softmax operation
+    {
+        ncnn::Layer* softmax_layer = ncnn::create_layer("Softmax");
+
+        ncnn::ParamDict empty_params;
+        softmax_layer->load_param(empty_params);
+        softmax_layer->forward_inplace(scores, net.opt);
+
+        delete softmax_layer;
+    }
+
+    // flatten to one dimension so every element is reachable through scores[k]
+    scores = scores.reshape(scores.w * scores.h * scores.c);
+
+    const int count = scores.w;
+    cls_scores.resize(count);
+    for (int k = 0; k < count; ++k)
+        cls_scores[k] = scores[k];
+
+    return 0;
+}
+
+// Print the topk highest scores to stderr as "<index> = <score>", best first.
+// topk is clamped to [0, cls_scores.size()], so asking for more entries than
+// exist (or for an empty vector) never moves the sort range past end(). Returns 0.
+static int print_topk(const std::vector<float>& cls_scores, int topk)
+{
+    const int size = (int)cls_scores.size();
+    topk = std::max(0, std::min(topk, size));
+
+    // partial sort topk with index
+    std::vector<std::pair<float, int> > vec(size);
+    for (int i = 0; i < size; i++)
+    {
+        vec[i] = std::make_pair(cls_scores[i], i);
+    }
+
+    // pairs compare by score first, so greater<> yields descending scores
+    std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
+                      std::greater<std::pair<float, int> >());
+
+    for (int i = 0; i < topk; i++)
+        fprintf(stderr, "%d = %f\n", vec[i].second, vec[i].first);
+
+    return 0;
+}
+
+// Usage: <program> [imagepath]. Reads the image, runs detect_shufflenetv2 and prints
+// the top 3 scores. Returns -1 on wrong argument count or unreadable image, else 0.
+int main(int argc, char** argv)
+{
+    const bool has_image_arg = (argc == 2);
+    if (!has_image_arg)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* const path = argv[1];
+    cv::Mat picture = cv::imread(path, 1);
+    if (picture.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", path);
+        return -1;
+    }
+
+    std::vector<float> scores;
+    detect_shufflenetv2(picture, scores);
+    print_topk(scores, 3);
+    return 0;
+}
diff --git a/examples/simplepose.cpp b/examples/simplepose.cpp
new file mode 100644
index 0000000..867d54f
--- /dev/null
+++ b/examples/simplepose.cpp
@@ -0,0 +1,167 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#include <algorithm>
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdio.h>
+#include <vector>
+
+struct KeyPoint // one joint located by detect_posenet
+{
+    cv::Point2f p; // heatmap peak position, scaled by detect_posenet to source-image pixels
+    float prob;    // heatmap value at that peak; draw_pose skips entries below 0.2f
+};
+
+// Run the pose network on bgr; for each output channel push the heatmap peak (scaled
+// to source-image pixels) and its value. Exits with -1 if the model fails to load.
+static int detect_posenet(const cv::Mat& bgr, std::vector<KeyPoint>& keypoints)
+{
+    ncnn::Net net;
+    net.opt.use_vulkan_compute = true;
+
+    // the simple baseline human pose estimation from gluon-cv
+    // https://gluon-cv.mxnet.io/build/examples_pose/demo_simple_pose.html
+    // mxnet model exported via
+    //      pose_net.hybridize()
+    //      pose_net.export('pose')
+    // then mxnet2ncnn
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+    if (net.load_param("pose.param") || net.load_model("pose.bin"))
+        exit(-1);
+
+    const int src_w = bgr.cols;
+    const int src_h = bgr.rows;
+
+    // BGR -> RGB conversion and resize to the 192x256 network input
+    ncnn::Mat input = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, src_w, src_h, 192, 256);
+
+    // transforms.ToTensor(),
+    // transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
+    // R' = (R / 255 - 0.485) / 0.229 = (R - 0.485 * 255) / 0.229 / 255
+    // G' = (G / 255 - 0.456) / 0.224 = (G - 0.456 * 255) / 0.224 / 255
+    // B' = (B / 255 - 0.406) / 0.225 = (B - 0.406 * 255) / 0.225 / 255
+    const float mean_vals[3] = {0.485f * 255.f, 0.456f * 255.f, 0.406f * 255.f};
+    const float norm_vals[3] = {1 / 0.229f / 255.f, 1 / 0.224f / 255.f, 1 / 0.225f / 255.f};
+    input.substract_mean_normalize(mean_vals, norm_vals);
+
+    ncnn::Extractor extractor = net.create_extractor();
+    extractor.input("data", input);
+
+    ncnn::Mat heatmaps;
+    extractor.extract("conv3_fwd", heatmaps);
+
+    // resolve point from heatmap
+    keypoints.clear();
+    const int map_w = heatmaps.w;
+    const int map_h = heatmaps.h;
+    for (int joint = 0; joint < heatmaps.c; ++joint)
+    {
+        const ncnn::Mat heat = heatmaps.channel(joint);
+
+        // strict '>' keeps the first peak in row-major order; values <= 0 never win
+        float best = 0.f;
+        int best_col = 0;
+        int best_row = 0;
+        for (int row = 0; row < map_h; ++row)
+        {
+            const float* line = heat.row(row);
+            for (int col = 0; col < map_w; ++col)
+            {
+                if (!(line[col] > best))
+                    continue;
+                best = line[col];
+                best_col = col;
+                best_row = row;
+            }
+        }
+
+        // scale the peak from heatmap cells to source-image pixels
+        KeyPoint found;
+        found.p = cv::Point2f(best_col * src_w / (float)map_w, best_row * src_h / (float)map_h);
+        found.prob = best;
+        keypoints.push_back(found);
+    }
+
+    return 0;
+}
+
+// Draw the skeleton (bones, then joints) on a clone of bgr and pass it to cv::imshow.
+// Bones whose joint indices are not present in keypoints are skipped, so a short
+// (or empty) keypoints vector is never indexed out of range.
+static void draw_pose(const cv::Mat& bgr, const std::vector<KeyPoint>& keypoints)
+{
+    cv::Mat image = bgr.clone();
+
+    // draw bone: each entry is a pair of keypoint indices joined by a line
+    static const int joint_pairs[16][2] = {
+        {0, 1}, {1, 3}, {0, 2}, {2, 4}, {5, 6}, {5, 7}, {7, 9}, {6, 8}, {8, 10}, {5, 11}, {6, 12}, {11, 12}, {11, 13}, {12, 14}, {13, 15}, {14, 16}
+    };
+    const int num_keypoints = (int)keypoints.size();
+    for (int i = 0; i < 16; i++)
+    {
+        const int a = joint_pairs[i][0];
+        const int b = joint_pairs[i][1];
+        if (a >= num_keypoints || b >= num_keypoints)
+            continue;
+        if (keypoints[a].prob < 0.2f || keypoints[b].prob < 0.2f)
+            continue;
+        cv::line(image, keypoints[a].p, keypoints[b].p, cv::Scalar(255, 0, 0), 2);
+    }
+
+    // draw joint: every keypoint is logged, those with prob below 0.2 are not drawn
+    for (size_t i = 0; i < keypoints.size(); i++)
+    {
+        const KeyPoint& keypoint = keypoints[i];
+        fprintf(stderr, "%.2f %.2f = %.5f\n", keypoint.p.x, keypoint.p.y, keypoint.prob);
+        if (keypoint.prob < 0.2f)
+            continue;
+        cv::circle(image, keypoint.p, 3, cv::Scalar(0, 255, 0), -1);
+    }
+
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+// Usage: <program> [imagepath]. Reads the image, runs detect_posenet and passes the
+// keypoints to draw_pose. Returns -1 on wrong argument count or unreadable image.
+int main(int argc, char** argv)
+{
+    const bool has_image_arg = (argc == 2);
+    if (!has_image_arg)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* const path = argv[1];
+    cv::Mat picture = cv::imread(path, 1);
+    if (picture.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", path);
+        return -1;
+    }
+
+    std::vector<KeyPoint> joints;
+    detect_posenet(picture, joints);
+    draw_pose(picture, joints);
+    return 0;
+}
diff --git a/examples/squeezencnn/README.md b/examples/squeezencnn/README.md
new file mode 100644
index 0000000..010eb41
--- /dev/null
+++ b/examples/squeezencnn/README.md
@@ -0,0 +1 @@
+The SqueezeNet Android example project has been moved to https://github.com/nihui/ncnn-android-squeezenet
diff --git a/examples/squeezenet.cpp b/examples/squeezenet.cpp
new file mode 100644
index 0000000..a026c13
--- /dev/null
+++ b/examples/squeezenet.cpp
@@ -0,0 +1,108 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#include <algorithm>
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#endif
+#include <stdio.h>
+#include <vector>
+
+// Classify bgr with SqueezeNet v1.1 and copy the "prob" blob into cls_scores.
+// Exits the process with -1 if the param or model file fails to load; returns 0.
+static int detect_squeezenet(const cv::Mat& bgr, std::vector<float>& cls_scores)
+{
+    ncnn::Net net;
+    net.opt.use_vulkan_compute = true;
+
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+    if (net.load_param("squeezenet_v1.1.param") || net.load_model("squeezenet_v1.1.bin"))
+        exit(-1);
+
+    // resize the BGR pixels to 227x227, then subtract the per-channel means (no scaling)
+    const int target_size = 227;
+    ncnn::Mat input = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows, target_size, target_size);
+
+    const float channel_means[3] = {104.f, 117.f, 123.f};
+    input.substract_mean_normalize(channel_means, 0);
+
+    ncnn::Extractor extractor = net.create_extractor();
+    extractor.input("data", input);
+
+    ncnn::Mat prob;
+    extractor.extract("prob", prob);
+
+    // copy prob.w values of the output blob into the caller's vector
+    const int count = prob.w;
+    cls_scores.resize(count);
+    for (int k = 0; k < count; ++k)
+        cls_scores[k] = prob[k];
+
+    return 0;
+}
+
+// Print the topk highest scores to stderr as "<index> = <score>", best first.
+// topk is clamped to [0, cls_scores.size()], so asking for more entries than
+// exist (or for an empty vector) never moves the sort range past end(). Returns 0.
+static int print_topk(const std::vector<float>& cls_scores, int topk)
+{
+    const int size = (int)cls_scores.size();
+    topk = std::max(0, std::min(topk, size));
+
+    // partial sort topk with index
+    std::vector<std::pair<float, int> > vec(size);
+    for (int i = 0; i < size; i++)
+    {
+        vec[i] = std::make_pair(cls_scores[i], i);
+    }
+
+    // pairs compare by score first, so greater<> yields descending scores
+    std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
+                      std::greater<std::pair<float, int> >());
+
+    for (int i = 0; i < topk; i++)
+        fprintf(stderr, "%d = %f\n", vec[i].second, vec[i].first);
+
+    return 0;
+}
+
+// Usage: <program> [imagepath]. Reads the image, runs detect_squeezenet and prints
+// the top 3 scores. Returns -1 on wrong argument count or unreadable image, else 0.
+int main(int argc, char** argv)
+{
+    const bool has_image_arg = (argc == 2);
+    if (!has_image_arg)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* const path = argv[1];
+    cv::Mat picture = cv::imread(path, 1);
+    if (picture.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", path);
+        return -1;
+    }
+
+    std::vector<float> scores;
+    detect_squeezenet(picture, scores);
+    print_topk(scores, 3);
+    return 0;
+}
diff --git a/examples/squeezenet_c_api.cpp b/examples/squeezenet_c_api.cpp
new file mode 100644
index 0000000..851a590
--- /dev/null
+++ b/examples/squeezenet_c_api.cpp
@@ -0,0 +1,123 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "c_api.h"
+
+#include <algorithm>
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#endif
+#include <stdio.h>
+#include <vector>
+
+// Classify bgr with SqueezeNet v1.1 through the ncnn C API and copy the "prob" output
+// into cls_scores. Exits with -1 if the param or model file fails to load; returns 0.
+static int detect_squeezenet(const cv::Mat& bgr, std::vector<float>& cls_scores)
+{
+    ncnn_net_t net = ncnn_net_create();
+
+    ncnn_option_t options = ncnn_option_create();
+    ncnn_option_set_use_vulkan_compute(options, 1);
+    ncnn_net_set_option(net, options);
+
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+    if (ncnn_net_load_param(net, "squeezenet_v1.1.param") || ncnn_net_load_model(net, "squeezenet_v1.1.bin"))
+        exit(-1);
+
+    // NOTE(review): 5th argument bgr.cols * 3 is presumably the row stride in bytes, which
+    // assumes tightly packed 3-byte pixels; confirm against c_api.h
+    ncnn_mat_t input = ncnn_mat_from_pixels_resize(bgr.data, NCNN_MAT_PIXEL_BGR, bgr.cols, bgr.rows, bgr.cols * 3, 227, 227, NULL);
+
+    const float channel_means[3] = {104.f, 117.f, 123.f};
+    ncnn_mat_substract_mean_normalize(input, channel_means, 0);
+
+    ncnn_extractor_t extractor = ncnn_extractor_create(net);
+
+    ncnn_extractor_input(extractor, "data", input);
+
+    ncnn_mat_t output;
+    ncnn_extractor_extract(extractor, "prob", &output);
+
+    // copy the scores out before the output mat is destroyed below
+    const int count = ncnn_mat_get_w(output);
+    const float* values = (const float*)ncnn_mat_get_data(output);
+
+    cls_scores.resize(count);
+    for (int k = 0; k < count; ++k)
+        cls_scores[k] = values[k];
+
+    // release every handle created above: mats, extractor, option, then the net
+    ncnn_mat_destroy(input);
+    ncnn_mat_destroy(output);
+
+    ncnn_extractor_destroy(extractor);
+    ncnn_option_destroy(options);
+
+    ncnn_net_destroy(net);
+
+    return 0;
+}
+
+// Print the topk highest scores to stderr as "<index> = <score>", best first.
+// topk is clamped to [0, cls_scores.size()], so asking for more entries than
+// exist (or for an empty vector) never moves the sort range past end(). Returns 0.
+static int print_topk(const std::vector<float>& cls_scores, int topk)
+{
+    const int size = (int)cls_scores.size();
+    topk = std::max(0, std::min(topk, size));
+
+    // partial sort topk with index
+    std::vector<std::pair<float, int> > vec(size);
+    for (int i = 0; i < size; i++)
+    {
+        vec[i] = std::make_pair(cls_scores[i], i);
+    }
+
+    // pairs compare by score first, so greater<> yields descending scores
+    std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
+                      std::greater<std::pair<float, int> >());
+
+    for (int i = 0; i < topk; i++)
+        fprintf(stderr, "%d = %f\n", vec[i].second, vec[i].first);
+
+    return 0;
+}
+
+// Usage: <program> [imagepath]. Reads the image, runs detect_squeezenet and prints
+// the top 3 scores. Returns -1 on wrong argument count or unreadable image, else 0.
+int main(int argc, char** argv)
+{
+    const bool has_image_arg = (argc == 2);
+    if (!has_image_arg)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* const path = argv[1];
+    cv::Mat picture = cv::imread(path, 1);
+    if (picture.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", path);
+        return -1;
+    }
+
+    std::vector<float> scores;
+    detect_squeezenet(picture, scores);
+    print_topk(scores, 3);
+    return 0;
+}
diff --git a/examples/squeezenet_v1.1.bin b/examples/squeezenet_v1.1.bin
new file mode 100644
index 0000000..2b39bf8
--- /dev/null
+++ b/examples/squeezenet_v1.1.bin
diff --git a/examples/squeezenet_v1.1.caffemodel b/examples/squeezenet_v1.1.caffemodel
new file mode 100644
index 0000000..9d2fc33
--- /dev/null
+++ b/examples/squeezenet_v1.1.caffemodel
diff --git a/examples/squeezenet_v1.1.param b/examples/squeezenet_v1.1.param
new file mode 100644
index 0000000..e239058
--- /dev/null
+++ b/examples/squeezenet_v1.1.param
@@ -0,0 +1,77 @@
+7767517
+75 83
+Input            data             0 1 data 0=227 1=227 2=3
+Convolution      conv1            1 1 data conv1 0=64 1=3 2=1 3=2 4=0 5=1 6=1728
+ReLU             relu_conv1       1 1 conv1 conv1_relu_conv1 0=0.000000
+Pooling          pool1            1 1 conv1_relu_conv1 pool1 0=0 1=3 2=2 3=0 4=0
+Convolution      fire2/squeeze1x1 1 1 pool1 fire2/squeeze1x1 0=16 1=1 2=1 3=1 4=0 5=1 6=1024
+ReLU             fire2/relu_squeeze1x1 1 1 fire2/squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1 0=0.000000
+Split            splitncnn_0      1 2 fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1
+Convolution      fire2/expand1x1  1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=1024
+ReLU             fire2/relu_expand1x1 1 1 fire2/expand1x1 fire2/expand1x1_fire2/relu_expand1x1 0=0.000000
+Convolution      fire2/expand3x3  1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3 0=64 1=3 2=1 3=1 4=1 5=1 6=9216
+ReLU             fire2/relu_expand3x3 1 1 fire2/expand3x3 fire2/expand3x3_fire2/relu_expand3x3 0=0.000000
+Concat           fire2/concat     2 1 fire2/expand1x1_fire2/relu_expand1x1 fire2/expand3x3_fire2/relu_expand3x3 fire2/concat 0=0
+Convolution      fire3/squeeze1x1 1 1 fire2/concat fire3/squeeze1x1 0=16 1=1 2=1 3=1 4=0 5=1 6=2048
+ReLU             fire3/relu_squeeze1x1 1 1 fire3/squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1 0=0.000000
+Split            splitncnn_1      1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1
+Convolution      fire3/expand1x1  1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=1024
+ReLU             fire3/relu_expand1x1 1 1 fire3/expand1x1 fire3/expand1x1_fire3/relu_expand1x1 0=0.000000
+Convolution      fire3/expand3x3  1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3 0=64 1=3 2=1 3=1 4=1 5=1 6=9216
+ReLU             fire3/relu_expand3x3 1 1 fire3/expand3x3 fire3/expand3x3_fire3/relu_expand3x3 0=0.000000
+Concat           fire3/concat     2 1 fire3/expand1x1_fire3/relu_expand1x1 fire3/expand3x3_fire3/relu_expand3x3 fire3/concat 0=0
+Pooling          pool3            1 1 fire3/concat pool3 0=0 1=3 2=2 3=0 4=0
+Convolution      fire4/squeeze1x1 1 1 pool3 fire4/squeeze1x1 0=32 1=1 2=1 3=1 4=0 5=1 6=4096
+ReLU             fire4/relu_squeeze1x1 1 1 fire4/squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1 0=0.000000
+Split            splitncnn_2      1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1
+Convolution      fire4/expand1x1  1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=4096
+ReLU             fire4/relu_expand1x1 1 1 fire4/expand1x1 fire4/expand1x1_fire4/relu_expand1x1 0=0.000000
+Convolution      fire4/expand3x3  1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3 0=128 1=3 2=1 3=1 4=1 5=1 6=36864
+ReLU             fire4/relu_expand3x3 1 1 fire4/expand3x3 fire4/expand3x3_fire4/relu_expand3x3 0=0.000000
+Concat           fire4/concat     2 1 fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat 0=0
+Convolution      fire5/squeeze1x1 1 1 fire4/concat fire5/squeeze1x1 0=32 1=1 2=1 3=1 4=0 5=1 6=8192
+ReLU             fire5/relu_squeeze1x1 1 1 fire5/squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1 0=0.000000
+Split            splitncnn_3      1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1
+Convolution      fire5/expand1x1  1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=4096
+ReLU             fire5/relu_expand1x1 1 1 fire5/expand1x1 fire5/expand1x1_fire5/relu_expand1x1 0=0.000000
+Convolution      fire5/expand3x3  1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/expand3x3 0=128 1=3 2=1 3=1 4=1 5=1 6=36864
+ReLU             fire5/relu_expand3x3 1 1 fire5/expand3x3 fire5/expand3x3_fire5/relu_expand3x3 0=0.000000
+Concat           fire5/concat     2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat 0=0
+Pooling          pool5            1 1 fire5/concat pool5 0=0 1=3 2=2 3=0 4=0
+Convolution      fire6/squeeze1x1 1 1 pool5 fire6/squeeze1x1 0=48 1=1 2=1 3=1 4=0 5=1 6=12288
+ReLU             fire6/relu_squeeze1x1 1 1 fire6/squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1 0=0.000000
+Split            splitncnn_4      1 2 fire6/squeeze1x1_fire6/relu_squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1
+Convolution      fire6/expand1x1  1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 fire6/expand1x1 0=192 1=1 2=1 3=1 4=0 5=1 6=9216
+ReLU             fire6/relu_expand1x1 1 1 fire6/expand1x1 fire6/expand1x1_fire6/relu_expand1x1 0=0.000000
+Convolution      fire6/expand3x3  1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=82944
+ReLU             fire6/relu_expand3x3 1 1 fire6/expand3x3 fire6/expand3x3_fire6/relu_expand3x3 0=0.000000
+Concat           fire6/concat     2 1 fire6/expand1x1_fire6/relu_expand1x1 fire6/expand3x3_fire6/relu_expand3x3 fire6/concat 0=0
+Convolution      fire7/squeeze1x1 1 1 fire6/concat fire7/squeeze1x1 0=48 1=1 2=1 3=1 4=0 5=1 6=18432
+ReLU             fire7/relu_squeeze1x1 1 1 fire7/squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1 0=0.000000
+Split            splitncnn_5      1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1
+Convolution      fire7/expand1x1  1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 fire7/expand1x1 0=192 1=1 2=1 3=1 4=0 5=1 6=9216
+ReLU             fire7/relu_expand1x1 1 1 fire7/expand1x1 fire7/expand1x1_fire7/relu_expand1x1 0=0.000000
+Convolution      fire7/expand3x3  1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=82944
+ReLU             fire7/relu_expand3x3 1 1 fire7/expand3x3 fire7/expand3x3_fire7/relu_expand3x3 0=0.000000
+Concat           fire7/concat     2 1 fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat 0=0
+Convolution      fire8/squeeze1x1 1 1 fire7/concat fire8/squeeze1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=24576
+ReLU             fire8/relu_squeeze1x1 1 1 fire8/squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1 0=0.000000
+Split            splitncnn_6      1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1
+Convolution      fire8/expand1x1  1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 fire8/expand1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=16384
+ReLU             fire8/relu_expand1x1 1 1 fire8/expand1x1 fire8/expand1x1_fire8/relu_expand1x1 0=0.000000
+Convolution      fire8/expand3x3  1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3 0=256 1=3 2=1 3=1 4=1 5=1 6=147456
+ReLU             fire8/relu_expand3x3 1 1 fire8/expand3x3 fire8/expand3x3_fire8/relu_expand3x3 0=0.000000
+Concat           fire8/concat     2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat 0=0
+Convolution      fire9/squeeze1x1 1 1 fire8/concat fire9/squeeze1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=32768
+ReLU             fire9/relu_squeeze1x1 1 1 fire9/squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1 0=0.000000
+Split            splitncnn_7      1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1
+Convolution      fire9/expand1x1  1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=16384
+ReLU             fire9/relu_expand1x1 1 1 fire9/expand1x1 fire9/expand1x1_fire9/relu_expand1x1 0=0.000000
+Convolution      fire9/expand3x3  1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3 0=256 1=3 2=1 3=1 4=1 5=1 6=147456
+ReLU             fire9/relu_expand3x3 1 1 fire9/expand3x3 fire9/expand3x3_fire9/relu_expand3x3 0=0.000000
+Concat           fire9/concat     2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 fire9/concat 0=0
+Dropout          drop9            1 1 fire9/concat fire9/concat_drop9
+Convolution      conv10           1 1 fire9/concat_drop9 conv10 0=1000 1=1 2=1 3=1 4=1 5=1 6=512000
+ReLU             relu_conv10      1 1 conv10 conv10_relu_conv10 0=0.000000
+Pooling          pool10           1 1 conv10_relu_conv10 pool10 0=1 1=0 2=1 3=0 4=1
+Softmax          prob             1 1 pool10 prob 0=0
diff --git a/examples/squeezenet_v1.1.param.bin b/examples/squeezenet_v1.1.param.bin
new file mode 100644
index 0000000..b43d2ac
--- /dev/null
+++ b/examples/squeezenet_v1.1.param.bin
diff --git a/examples/squeezenet_v1.1.prototxt b/examples/squeezenet_v1.1.prototxt
new file mode 100644
index 0000000..7dc9853
--- /dev/null
+++ b/examples/squeezenet_v1.1.prototxt
@@ -0,0 +1,548 @@
+name: "squeezenet_v1.1_deploy"
+
+layer {
+  name: "data"
+  type: "Input"
+  top: "data"
+  input_param { shape: { dim: 1 dim: 3 dim: 227 dim: 227 } }
+}
+layer {
+  name: "conv1"
+  type: "Convolution"
+  bottom: "data"
+  top: "conv1"
+  convolution_param {
+    num_output: 64
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "relu_conv1"
+  type: "ReLU"
+  bottom: "conv1"
+  top: "conv1"
+}
+layer {
+  name: "pool1"
+  type: "Pooling"
+  bottom: "conv1"
+  top: "pool1"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "fire2/squeeze1x1"
+  type: "Convolution"
+  bottom: "pool1"
+  top: "fire2/squeeze1x1"
+  convolution_param {
+    num_output: 16
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire2/relu_squeeze1x1"
+  type: "ReLU"
+  bottom: "fire2/squeeze1x1"
+  top: "fire2/squeeze1x1"
+}
+layer {
+  name: "fire2/expand1x1"
+  type: "Convolution"
+  bottom: "fire2/squeeze1x1"
+  top: "fire2/expand1x1"
+  convolution_param {
+    num_output: 64
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire2/relu_expand1x1"
+  type: "ReLU"
+  bottom: "fire2/expand1x1"
+  top: "fire2/expand1x1"
+}
+layer {
+  name: "fire2/expand3x3"
+  type: "Convolution"
+  bottom: "fire2/squeeze1x1"
+  top: "fire2/expand3x3"
+  convolution_param {
+    num_output: 64
+    pad: 1
+    kernel_size: 3
+  }
+}
+layer {
+  name: "fire2/relu_expand3x3"
+  type: "ReLU"
+  bottom: "fire2/expand3x3"
+  top: "fire2/expand3x3"
+}
+layer {
+  name: "fire2/concat"
+  type: "Concat"
+  bottom: "fire2/expand1x1"
+  bottom: "fire2/expand3x3"
+  top: "fire2/concat"
+}
+layer {
+  name: "fire3/squeeze1x1"
+  type: "Convolution"
+  bottom: "fire2/concat"
+  top: "fire3/squeeze1x1"
+  convolution_param {
+    num_output: 16
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire3/relu_squeeze1x1"
+  type: "ReLU"
+  bottom: "fire3/squeeze1x1"
+  top: "fire3/squeeze1x1"
+}
+layer {
+  name: "fire3/expand1x1"
+  type: "Convolution"
+  bottom: "fire3/squeeze1x1"
+  top: "fire3/expand1x1"
+  convolution_param {
+    num_output: 64
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire3/relu_expand1x1"
+  type: "ReLU"
+  bottom: "fire3/expand1x1"
+  top: "fire3/expand1x1"
+}
+layer {
+  name: "fire3/expand3x3"
+  type: "Convolution"
+  bottom: "fire3/squeeze1x1"
+  top: "fire3/expand3x3"
+  convolution_param {
+    num_output: 64
+    pad: 1
+    kernel_size: 3
+  }
+}
+layer {
+  name: "fire3/relu_expand3x3"
+  type: "ReLU"
+  bottom: "fire3/expand3x3"
+  top: "fire3/expand3x3"
+}
+layer {
+  name: "fire3/concat"
+  type: "Concat"
+  bottom: "fire3/expand1x1"
+  bottom: "fire3/expand3x3"
+  top: "fire3/concat"
+}
+layer {
+  name: "pool3"
+  type: "Pooling"
+  bottom: "fire3/concat"
+  top: "pool3"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "fire4/squeeze1x1"
+  type: "Convolution"
+  bottom: "pool3"
+  top: "fire4/squeeze1x1"
+  convolution_param {
+    num_output: 32
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire4/relu_squeeze1x1"
+  type: "ReLU"
+  bottom: "fire4/squeeze1x1"
+  top: "fire4/squeeze1x1"
+}
+layer {
+  name: "fire4/expand1x1"
+  type: "Convolution"
+  bottom: "fire4/squeeze1x1"
+  top: "fire4/expand1x1"
+  convolution_param {
+    num_output: 128
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire4/relu_expand1x1"
+  type: "ReLU"
+  bottom: "fire4/expand1x1"
+  top: "fire4/expand1x1"
+}
+layer {
+  name: "fire4/expand3x3"
+  type: "Convolution"
+  bottom: "fire4/squeeze1x1"
+  top: "fire4/expand3x3"
+  convolution_param {
+    num_output: 128
+    pad: 1
+    kernel_size: 3
+  }
+}
+layer {
+  name: "fire4/relu_expand3x3"
+  type: "ReLU"
+  bottom: "fire4/expand3x3"
+  top: "fire4/expand3x3"
+}
+layer {
+  name: "fire4/concat"
+  type: "Concat"
+  bottom: "fire4/expand1x1"
+  bottom: "fire4/expand3x3"
+  top: "fire4/concat"
+}
+layer {
+  name: "fire5/squeeze1x1"
+  type: "Convolution"
+  bottom: "fire4/concat"
+  top: "fire5/squeeze1x1"
+  convolution_param {
+    num_output: 32
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire5/relu_squeeze1x1"
+  type: "ReLU"
+  bottom: "fire5/squeeze1x1"
+  top: "fire5/squeeze1x1"
+}
+layer {
+  name: "fire5/expand1x1"
+  type: "Convolution"
+  bottom: "fire5/squeeze1x1"
+  top: "fire5/expand1x1"
+  convolution_param {
+    num_output: 128
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire5/relu_expand1x1"
+  type: "ReLU"
+  bottom: "fire5/expand1x1"
+  top: "fire5/expand1x1"
+}
+layer {
+  name: "fire5/expand3x3"
+  type: "Convolution"
+  bottom: "fire5/squeeze1x1"
+  top: "fire5/expand3x3"
+  convolution_param {
+    num_output: 128
+    pad: 1
+    kernel_size: 3
+  }
+}
+layer {
+  name: "fire5/relu_expand3x3"
+  type: "ReLU"
+  bottom: "fire5/expand3x3"
+  top: "fire5/expand3x3"
+}
+layer {
+  name: "fire5/concat"
+  type: "Concat"
+  bottom: "fire5/expand1x1"
+  bottom: "fire5/expand3x3"
+  top: "fire5/concat"
+}
+layer {
+  name: "pool5"
+  type: "Pooling"
+  bottom: "fire5/concat"
+  top: "pool5"
+  pooling_param {
+    pool: MAX
+    kernel_size: 3
+    stride: 2
+  }
+}
+layer {
+  name: "fire6/squeeze1x1"
+  type: "Convolution"
+  bottom: "pool5"
+  top: "fire6/squeeze1x1"
+  convolution_param {
+    num_output: 48
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire6/relu_squeeze1x1"
+  type: "ReLU"
+  bottom: "fire6/squeeze1x1"
+  top: "fire6/squeeze1x1"
+}
+layer {
+  name: "fire6/expand1x1"
+  type: "Convolution"
+  bottom: "fire6/squeeze1x1"
+  top: "fire6/expand1x1"
+  convolution_param {
+    num_output: 192
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire6/relu_expand1x1"
+  type: "ReLU"
+  bottom: "fire6/expand1x1"
+  top: "fire6/expand1x1"
+}
+layer {
+  name: "fire6/expand3x3"
+  type: "Convolution"
+  bottom: "fire6/squeeze1x1"
+  top: "fire6/expand3x3"
+  convolution_param {
+    num_output: 192
+    pad: 1
+    kernel_size: 3
+  }
+}
+layer {
+  name: "fire6/relu_expand3x3"
+  type: "ReLU"
+  bottom: "fire6/expand3x3"
+  top: "fire6/expand3x3"
+}
+layer {
+  name: "fire6/concat"
+  type: "Concat"
+  bottom: "fire6/expand1x1"
+  bottom: "fire6/expand3x3"
+  top: "fire6/concat"
+}
+layer {
+  name: "fire7/squeeze1x1"
+  type: "Convolution"
+  bottom: "fire6/concat"
+  top: "fire7/squeeze1x1"
+  convolution_param {
+    num_output: 48
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire7/relu_squeeze1x1"
+  type: "ReLU"
+  bottom: "fire7/squeeze1x1"
+  top: "fire7/squeeze1x1"
+}
+layer {
+  name: "fire7/expand1x1"
+  type: "Convolution"
+  bottom: "fire7/squeeze1x1"
+  top: "fire7/expand1x1"
+  convolution_param {
+    num_output: 192
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire7/relu_expand1x1"
+  type: "ReLU"
+  bottom: "fire7/expand1x1"
+  top: "fire7/expand1x1"
+}
+layer {
+  name: "fire7/expand3x3"
+  type: "Convolution"
+  bottom: "fire7/squeeze1x1"
+  top: "fire7/expand3x3"
+  convolution_param {
+    num_output: 192
+    pad: 1
+    kernel_size: 3
+  }
+}
+layer {
+  name: "fire7/relu_expand3x3"
+  type: "ReLU"
+  bottom: "fire7/expand3x3"
+  top: "fire7/expand3x3"
+}
+layer {
+  name: "fire7/concat"
+  type: "Concat"
+  bottom: "fire7/expand1x1"
+  bottom: "fire7/expand3x3"
+  top: "fire7/concat"
+}
+layer {
+  name: "fire8/squeeze1x1"
+  type: "Convolution"
+  bottom: "fire7/concat"
+  top: "fire8/squeeze1x1"
+  convolution_param {
+    num_output: 64
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire8/relu_squeeze1x1"
+  type: "ReLU"
+  bottom: "fire8/squeeze1x1"
+  top: "fire8/squeeze1x1"
+}
+layer {
+  name: "fire8/expand1x1"
+  type: "Convolution"
+  bottom: "fire8/squeeze1x1"
+  top: "fire8/expand1x1"
+  convolution_param {
+    num_output: 256
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire8/relu_expand1x1"
+  type: "ReLU"
+  bottom: "fire8/expand1x1"
+  top: "fire8/expand1x1"
+}
+layer {
+  name: "fire8/expand3x3"
+  type: "Convolution"
+  bottom: "fire8/squeeze1x1"
+  top: "fire8/expand3x3"
+  convolution_param {
+    num_output: 256
+    pad: 1
+    kernel_size: 3
+  }
+}
+layer {
+  name: "fire8/relu_expand3x3"
+  type: "ReLU"
+  bottom: "fire8/expand3x3"
+  top: "fire8/expand3x3"
+}
+layer {
+  name: "fire8/concat"
+  type: "Concat"
+  bottom: "fire8/expand1x1"
+  bottom: "fire8/expand3x3"
+  top: "fire8/concat"
+}
+layer {
+  name: "fire9/squeeze1x1"
+  type: "Convolution"
+  bottom: "fire8/concat"
+  top: "fire9/squeeze1x1"
+  convolution_param {
+    num_output: 64
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire9/relu_squeeze1x1"
+  type: "ReLU"
+  bottom: "fire9/squeeze1x1"
+  top: "fire9/squeeze1x1"
+}
+layer {
+  name: "fire9/expand1x1"
+  type: "Convolution"
+  bottom: "fire9/squeeze1x1"
+  top: "fire9/expand1x1"
+  convolution_param {
+    num_output: 256
+    kernel_size: 1
+  }
+}
+layer {
+  name: "fire9/relu_expand1x1"
+  type: "ReLU"
+  bottom: "fire9/expand1x1"
+  top: "fire9/expand1x1"
+}
+layer {
+  name: "fire9/expand3x3"
+  type: "Convolution"
+  bottom: "fire9/squeeze1x1"
+  top: "fire9/expand3x3"
+  convolution_param {
+    num_output: 256
+    pad: 1
+    kernel_size: 3
+  }
+}
+layer {
+  name: "fire9/relu_expand3x3"
+  type: "ReLU"
+  bottom: "fire9/expand3x3"
+  top: "fire9/expand3x3"
+}
+layer {
+  name: "fire9/concat"
+  type: "Concat"
+  bottom: "fire9/expand1x1"
+  bottom: "fire9/expand3x3"
+  top: "fire9/concat"
+}
+layer {
+  name: "drop9"
+  type: "Dropout"
+  bottom: "fire9/concat"
+  top: "fire9/concat"
+  dropout_param {
+    dropout_ratio: 0.5
+  }
+}
+layer {
+  name: "conv10"
+  type: "Convolution"
+  bottom: "fire9/concat"
+  top: "conv10"
+  convolution_param {
+    num_output: 1000
+    pad: 1
+    kernel_size: 1
+  }
+}
+layer {
+  name: "relu_conv10"
+  type: "ReLU"
+  bottom: "conv10"
+  top: "conv10"
+}
+layer {
+  name: "pool10"
+  type: "Pooling"
+  bottom: "conv10"
+  top: "pool10"
+  pooling_param {
+    pool: AVE
+    global_pooling: true
+  }
+}
+layer {
+  name: "prob"
+  type: "Softmax"
+  bottom: "pool10"
+  top: "prob"
+}
diff --git a/examples/squeezenetssd.cpp b/examples/squeezenetssd.cpp
new file mode 100644
index 0000000..c233b5b
--- /dev/null
+++ b/examples/squeezenetssd.cpp
@@ -0,0 +1,154 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdio.h>
+#include <vector>
+
+struct Object // one detection, as filled in by detect_squeezenet()
+{
+    cv::Rect_<float> rect; // box in source-image pixels (x, y, width, height)
+    int label;             // class id read from the network output; draw_objects() uses it to index class_names
+    float prob;            // detection score read from the network output; draw_objects() prints it as a percentage
+};
+
+// Runs the SqueezeNet-SSD model on bgr and fills objects with one entry per row of the output blob.
+// Exits the process with -1 if the param or model file fails to load; otherwise returns 0.
+static int detect_squeezenet(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    // original pretrained model from https://github.com/chuanqi305/SqueezeNet-SSD
+    // squeezenet_ssd_voc_deploy.prototxt
+    // https://drive.google.com/open?id=0B3gersZ2cHIxdGpyZlZnbEQ5Snc
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+    ncnn::Net net;
+    net.opt.use_vulkan_compute = true;
+    if (net.load_param("squeezenet_ssd_voc.param") != 0)
+        exit(-1);
+    if (net.load_model("squeezenet_ssd_voc.bin") != 0)
+        exit(-1);
+
+    // Resize the source pixels to the 300x300 input fed to the network.
+    const int input_side = 300;
+    ncnn::Mat input = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, bgr.cols, bgr.rows,
+                                                    input_side, input_side);
+
+    // Subtract the per-channel means; no norm values are supplied (second argument is 0).
+    const float channel_means[3] = {104.f, 117.f, 123.f};
+    input.substract_mean_normalize(channel_means, 0);
+
+    ncnn::Extractor extractor = net.create_extractor();
+    extractor.input("data", input);
+
+    ncnn::Mat detections;
+    extractor.extract("detection_out", detections);
+
+    // Source dimensions, used below to scale the box corners read from each row.
+    const int src_w = bgr.cols;
+    const int src_h = bgr.rows;
+
+    objects.clear();
+    for (int row = 0; row < detections.h; ++row)
+    {
+        // fields as consumed here: [0] label, [1] score, [2..5] left, top, right, bottom
+        const float* v = detections.row(row);
+        Object det;
+        det.label = v[0];
+        det.prob = v[1];
+        det.rect.x = v[2] * src_w;
+        det.rect.y = v[3] * src_h;
+        det.rect.width = v[4] * src_w - det.rect.x;
+        det.rect.height = v[5] * src_h - det.rect.y;
+        objects.push_back(det);
+    }
+
+    return 0;
+}
+
+// Draws every detection (box plus a "<class> <score>%" caption) on a copy of bgr and logs it to
+// stderr, then displays the copy with cv::imshow and waits in cv::waitKey(0).
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    static const char* class_names[] = {"background",
+                                        "aeroplane", "bicycle", "bird", "boat",
+                                        "bottle", "bus", "car", "cat", "chair",
+                                        "cow", "diningtable", "dog", "horse",
+                                        "motorbike", "person", "pottedplant",
+                                        "sheep", "sofa", "train", "tvmonitor"};
+    const int num_classes = sizeof(class_names) / sizeof(class_names[0]);
+
+    cv::Mat image = bgr.clone();
+
+    for (size_t i = 0; i < objects.size(); i++)
+    {
+        const Object& obj = objects[i];
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+
+        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));
+        // obj.label is model output: fall back to "unknown" rather than index class_names out of range.
+        const bool known_label = obj.label >= 0 && obj.label < num_classes;
+        char text[256];
+        sprintf(text, "%s %.1f%%", known_label ? class_names[obj.label] : "unknown", obj.prob * 100);
+
+        int baseLine = 0;
+        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+        int x = obj.rect.x;
+        int y = obj.rect.y - label_size.height - baseLine; // caption goes just above the box
+        if (y < 0) // do not let the caption start above the top edge
+            y = 0;
+        if (x + label_size.width > image.cols) // shift left when the caption would pass the right edge
+            x = image.cols - label_size.width;
+
+        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                      cv::Scalar(255, 255, 255), -1);
+        cv::putText(image, text, cv::Point(x, y + label_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+// Entry point: takes exactly one argument, the path of the image to run detection on.
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    cv::Mat image = cv::imread(argv[1], 1);
+    if (image.empty())
+    {
+        // an empty Mat is treated as a failed load
+        fprintf(stderr, "cv::imread %s failed\n", argv[1]);
+        return -1;
+    }
+
+    // Detect, then visualize; the status returned by detect_squeezenet is not checked.
+    std::vector<Object> detections;
+    detect_squeezenet(image, detections);
+    draw_objects(image, detections);
+
+    return 0;
+}
diff --git a/examples/synset_words.txt b/examples/synset_words.txt
new file mode 100644
index 0000000..1308bd8
--- /dev/null
+++ b/examples/synset_words.txt
@@ -0,0 +1,1000 @@
+n01440764 tench, Tinca tinca
+n01443537 goldfish, Carassius auratus
+n01484850 great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias
+n01491361 tiger shark, Galeocerdo cuvieri
+n01494475 hammerhead, hammerhead shark
+n01496331 electric ray, crampfish, numbfish, torpedo
+n01498041 stingray
+n01514668 cock
+n01514859 hen
+n01518878 ostrich, Struthio camelus
+n01530575 brambling, Fringilla montifringilla
+n01531178 goldfinch, Carduelis carduelis
+n01532829 house finch, linnet, Carpodacus mexicanus
+n01534433 junco, snowbird
+n01537544 indigo bunting, indigo finch, indigo bird, Passerina cyanea
+n01558993 robin, American robin, Turdus migratorius
+n01560419 bulbul
+n01580077 jay
+n01582220 magpie
+n01592084 chickadee
+n01601694 water ouzel, dipper
+n01608432 kite
+n01614925 bald eagle, American eagle, Haliaeetus leucocephalus
+n01616318 vulture
+n01622779 great grey owl, great gray owl, Strix nebulosa
+n01629819 European fire salamander, Salamandra salamandra
+n01630670 common newt, Triturus vulgaris
+n01631663 eft
+n01632458 spotted salamander, Ambystoma maculatum
+n01632777 axolotl, mud puppy, Ambystoma mexicanum
+n01641577 bullfrog, Rana catesbeiana
+n01644373 tree frog, tree-frog
+n01644900 tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui
+n01664065 loggerhead, loggerhead turtle, Caretta caretta
+n01665541 leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea
+n01667114 mud turtle
+n01667778 terrapin
+n01669191 box turtle, box tortoise
+n01675722 banded gecko
+n01677366 common iguana, iguana, Iguana iguana
+n01682714 American chameleon, anole, Anolis carolinensis
+n01685808 whiptail, whiptail lizard
+n01687978 agama
+n01688243 frilled lizard, Chlamydosaurus kingi
+n01689811 alligator lizard
+n01692333 Gila monster, Heloderma suspectum
+n01693334 green lizard, Lacerta viridis
+n01694178 African chameleon, Chamaeleo chamaeleon
+n01695060 Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis
+n01697457 African crocodile, Nile crocodile, Crocodylus niloticus
+n01698640 American alligator, Alligator mississipiensis
+n01704323 triceratops
+n01728572 thunder snake, worm snake, Carphophis amoenus
+n01728920 ringneck snake, ring-necked snake, ring snake
+n01729322 hognose snake, puff adder, sand viper
+n01729977 green snake, grass snake
+n01734418 king snake, kingsnake
+n01735189 garter snake, grass snake
+n01737021 water snake
+n01739381 vine snake
+n01740131 night snake, Hypsiglena torquata
+n01742172 boa constrictor, Constrictor constrictor
+n01744401 rock python, rock snake, Python sebae
+n01748264 Indian cobra, Naja naja
+n01749939 green mamba
+n01751748 sea snake
+n01753488 horned viper, cerastes, sand viper, horned asp, Cerastes cornutus
+n01755581 diamondback, diamondback rattlesnake, Crotalus adamanteus
+n01756291 sidewinder, horned rattlesnake, Crotalus cerastes
+n01768244 trilobite
+n01770081 harvestman, daddy longlegs, Phalangium opilio
+n01770393 scorpion
+n01773157 black and gold garden spider, Argiope aurantia
+n01773549 barn spider, Araneus cavaticus
+n01773797 garden spider, Aranea diademata
+n01774384 black widow, Latrodectus mactans
+n01774750 tarantula
+n01775062 wolf spider, hunting spider
+n01776313 tick
+n01784675 centipede
+n01795545 black grouse
+n01796340 ptarmigan
+n01797886 ruffed grouse, partridge, Bonasa umbellus
+n01798484 prairie chicken, prairie grouse, prairie fowl
+n01806143 peacock
+n01806567 quail
+n01807496 partridge
+n01817953 African grey, African gray, Psittacus erithacus
+n01818515 macaw
+n01819313 sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita
+n01820546 lorikeet
+n01824575 coucal
+n01828970 bee eater
+n01829413 hornbill
+n01833805 hummingbird
+n01843065 jacamar
+n01843383 toucan
+n01847000 drake
+n01855032 red-breasted merganser, Mergus serrator
+n01855672 goose
+n01860187 black swan, Cygnus atratus
+n01871265 tusker
+n01872401 echidna, spiny anteater, anteater
+n01873310 platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus
+n01877812 wallaby, brush kangaroo
+n01882714 koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus
+n01883070 wombat
+n01910747 jellyfish
+n01914609 sea anemone, anemone
+n01917289 brain coral
+n01924916 flatworm, platyhelminth
+n01930112 nematode, nematode worm, roundworm
+n01943899 conch
+n01944390 snail
+n01945685 slug
+n01950731 sea slug, nudibranch
+n01955084 chiton, coat-of-mail shell, sea cradle, polyplacophore
+n01968897 chambered nautilus, pearly nautilus, nautilus
+n01978287 Dungeness crab, Cancer magister
+n01978455 rock crab, Cancer irroratus
+n01980166 fiddler crab
+n01981276 king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica
+n01983481 American lobster, Northern lobster, Maine lobster, Homarus americanus
+n01984695 spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish
+n01985128 crayfish, crawfish, crawdad, crawdaddy
+n01986214 hermit crab
+n01990800 isopod
+n02002556 white stork, Ciconia ciconia
+n02002724 black stork, Ciconia nigra
+n02006656 spoonbill
+n02007558 flamingo
+n02009229 little blue heron, Egretta caerulea
+n02009912 American egret, great white heron, Egretta albus
+n02011460 bittern
+n02012849 crane
+n02013706 limpkin, Aramus pictus
+n02017213 European gallinule, Porphyrio porphyrio
+n02018207 American coot, marsh hen, mud hen, water hen, Fulica americana
+n02018795 bustard
+n02025239 ruddy turnstone, Arenaria interpres
+n02027492 red-backed sandpiper, dunlin, Erolia alpina
+n02028035 redshank, Tringa totanus
+n02033041 dowitcher
+n02037110 oystercatcher, oyster catcher
+n02051845 pelican
+n02056570 king penguin, Aptenodytes patagonica
+n02058221 albatross, mollymawk
+n02066245 grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus
+n02071294 killer whale, killer, orca, grampus, sea wolf, Orcinus orca
+n02074367 dugong, Dugong dugon
+n02077923 sea lion
+n02085620 Chihuahua
+n02085782 Japanese spaniel
+n02085936 Maltese dog, Maltese terrier, Maltese
+n02086079 Pekinese, Pekingese, Peke
+n02086240 Shih-Tzu
+n02086646 Blenheim spaniel
+n02086910 papillon
+n02087046 toy terrier
+n02087394 Rhodesian ridgeback
+n02088094 Afghan hound, Afghan
+n02088238 basset, basset hound
+n02088364 beagle
+n02088466 bloodhound, sleuthhound
+n02088632 bluetick
+n02089078 black-and-tan coonhound
+n02089867 Walker hound, Walker foxhound
+n02089973 English foxhound
+n02090379 redbone
+n02090622 borzoi, Russian wolfhound
+n02090721 Irish wolfhound
+n02091032 Italian greyhound
+n02091134 whippet
+n02091244 Ibizan hound, Ibizan Podenco
+n02091467 Norwegian elkhound, elkhound
+n02091635 otterhound, otter hound
+n02091831 Saluki, gazelle hound
+n02092002 Scottish deerhound, deerhound
+n02092339 Weimaraner
+n02093256 Staffordshire bullterrier, Staffordshire bull terrier
+n02093428 American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier
+n02093647 Bedlington terrier
+n02093754 Border terrier
+n02093859 Kerry blue terrier
+n02093991 Irish terrier
+n02094114 Norfolk terrier
+n02094258 Norwich terrier
+n02094433 Yorkshire terrier
+n02095314 wire-haired fox terrier
+n02095570 Lakeland terrier
+n02095889 Sealyham terrier, Sealyham
+n02096051 Airedale, Airedale terrier
+n02096177 cairn, cairn terrier
+n02096294 Australian terrier
+n02096437 Dandie Dinmont, Dandie Dinmont terrier
+n02096585 Boston bull, Boston terrier
+n02097047 miniature schnauzer
+n02097130 giant schnauzer
+n02097209 standard schnauzer
+n02097298 Scotch terrier, Scottish terrier, Scottie
+n02097474 Tibetan terrier, chrysanthemum dog
+n02097658 silky terrier, Sydney silky
+n02098105 soft-coated wheaten terrier
+n02098286 West Highland white terrier
+n02098413 Lhasa, Lhasa apso
+n02099267 flat-coated retriever
+n02099429 curly-coated retriever
+n02099601 golden retriever
+n02099712 Labrador retriever
+n02099849 Chesapeake Bay retriever
+n02100236 German short-haired pointer
+n02100583 vizsla, Hungarian pointer
+n02100735 English setter
+n02100877 Irish setter, red setter
+n02101006 Gordon setter
+n02101388 Brittany spaniel
+n02101556 clumber, clumber spaniel
+n02102040 English springer, English springer spaniel
+n02102177 Welsh springer spaniel
+n02102318 cocker spaniel, English cocker spaniel, cocker
+n02102480 Sussex spaniel
+n02102973 Irish water spaniel
+n02104029 kuvasz
+n02104365 schipperke
+n02105056 groenendael
+n02105162 malinois
+n02105251 briard
+n02105412 kelpie
+n02105505 komondor
+n02105641 Old English sheepdog, bobtail
+n02105855 Shetland sheepdog, Shetland sheep dog, Shetland
+n02106030 collie
+n02106166 Border collie
+n02106382 Bouvier des Flandres, Bouviers des Flandres
+n02106550 Rottweiler
+n02106662 German shepherd, German shepherd dog, German police dog, alsatian
+n02107142 Doberman, Doberman pinscher
+n02107312 miniature pinscher
+n02107574 Greater Swiss Mountain dog
+n02107683 Bernese mountain dog
+n02107908 Appenzeller
+n02108000 EntleBucher
+n02108089 boxer
+n02108422 bull mastiff
+n02108551 Tibetan mastiff
+n02108915 French bulldog
+n02109047 Great Dane
+n02109525 Saint Bernard, St Bernard
+n02109961 Eskimo dog, husky
+n02110063 malamute, malemute, Alaskan malamute
+n02110185 Siberian husky
+n02110341 dalmatian, coach dog, carriage dog
+n02110627 affenpinscher, monkey pinscher, monkey dog
+n02110806 basenji
+n02110958 pug, pug-dog
+n02111129 Leonberg
+n02111277 Newfoundland, Newfoundland dog
+n02111500 Great Pyrenees
+n02111889 Samoyed, Samoyede
+n02112018 Pomeranian
+n02112137 chow, chow chow
+n02112350 keeshond
+n02112706 Brabancon griffon
+n02113023 Pembroke, Pembroke Welsh corgi
+n02113186 Cardigan, Cardigan Welsh corgi
+n02113624 toy poodle
+n02113712 miniature poodle
+n02113799 standard poodle
+n02113978 Mexican hairless
+n02114367 timber wolf, grey wolf, gray wolf, Canis lupus
+n02114548 white wolf, Arctic wolf, Canis lupus tundrarum
+n02114712 red wolf, maned wolf, Canis rufus, Canis niger
+n02114855 coyote, prairie wolf, brush wolf, Canis latrans
+n02115641 dingo, warrigal, warragal, Canis dingo
+n02115913 dhole, Cuon alpinus
+n02116738 African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus
+n02117135 hyena, hyaena
+n02119022 red fox, Vulpes vulpes
+n02119789 kit fox, Vulpes macrotis
+n02120079 Arctic fox, white fox, Alopex lagopus
+n02120505 grey fox, gray fox, Urocyon cinereoargenteus
+n02123045 tabby, tabby cat
+n02123159 tiger cat
+n02123394 Persian cat
+n02123597 Siamese cat, Siamese
+n02124075 Egyptian cat
+n02125311 cougar, puma, catamount, mountain lion, painter, panther, Felis concolor
+n02127052 lynx, catamount
+n02128385 leopard, Panthera pardus
+n02128757 snow leopard, ounce, Panthera uncia
+n02128925 jaguar, panther, Panthera onca, Felis onca
+n02129165 lion, king of beasts, Panthera leo
+n02129604 tiger, Panthera tigris
+n02130308 cheetah, chetah, Acinonyx jubatus
+n02132136 brown bear, bruin, Ursus arctos
+n02133161 American black bear, black bear, Ursus americanus, Euarctos americanus
+n02134084 ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus
+n02134418 sloth bear, Melursus ursinus, Ursus ursinus
+n02137549 mongoose
+n02138441 meerkat, mierkat
+n02165105 tiger beetle
+n02165456 ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle
+n02167151 ground beetle, carabid beetle
+n02168699 long-horned beetle, longicorn, longicorn beetle
+n02169497 leaf beetle, chrysomelid
+n02172182 dung beetle
+n02174001 rhinoceros beetle
+n02177972 weevil
+n02190166 fly
+n02206856 bee
+n02219486 ant, emmet, pismire
+n02226429 grasshopper, hopper
+n02229544 cricket
+n02231487 walking stick, walkingstick, stick insect
+n02233338 cockroach, roach
+n02236044 mantis, mantid
+n02256656 cicada, cicala
+n02259212 leafhopper
+n02264363 lacewing, lacewing fly
+n02268443 dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk
+n02268853 damselfly
+n02276258 admiral
+n02277742 ringlet, ringlet butterfly
+n02279972 monarch, monarch butterfly, milkweed butterfly, Danaus plexippus
+n02280649 cabbage butterfly
+n02281406 sulphur butterfly, sulfur butterfly
+n02281787 lycaenid, lycaenid butterfly
+n02317335 starfish, sea star
+n02319095 sea urchin
+n02321529 sea cucumber, holothurian
+n02325366 wood rabbit, cottontail, cottontail rabbit
+n02326432 hare
+n02328150 Angora, Angora rabbit
+n02342885 hamster
+n02346627 porcupine, hedgehog
+n02356798 fox squirrel, eastern fox squirrel, Sciurus niger
+n02361337 marmot
+n02363005 beaver
+n02364673 guinea pig, Cavia cobaya
+n02389026 sorrel
+n02391049 zebra
+n02395406 hog, pig, grunter, squealer, Sus scrofa
+n02396427 wild boar, boar, Sus scrofa
+n02397096 warthog
+n02398521 hippopotamus, hippo, river horse, Hippopotamus amphibius
+n02403003 ox
+n02408429 water buffalo, water ox, Asiatic buffalo, Bubalus bubalis
+n02410509 bison
+n02412080 ram, tup
+n02415577 bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis
+n02417914 ibex, Capra ibex
+n02422106 hartebeest
+n02422699 impala, Aepyceros melampus
+n02423022 gazelle
+n02437312 Arabian camel, dromedary, Camelus dromedarius
+n02437616 llama
+n02441942 weasel
+n02442845 mink
+n02443114 polecat, fitch, foulmart, foumart, Mustela putorius
+n02443484 black-footed ferret, ferret, Mustela nigripes
+n02444819 otter
+n02445715 skunk, polecat, wood pussy
+n02447366 badger
+n02454379 armadillo
+n02457408 three-toed sloth, ai, Bradypus tridactylus
+n02480495 orangutan, orang, orangutang, Pongo pygmaeus
+n02480855 gorilla, Gorilla gorilla
+n02481823 chimpanzee, chimp, Pan troglodytes
+n02483362 gibbon, Hylobates lar
+n02483708 siamang, Hylobates syndactylus, Symphalangus syndactylus
+n02484975 guenon, guenon monkey
+n02486261 patas, hussar monkey, Erythrocebus patas
+n02486410 baboon
+n02487347 macaque
+n02488291 langur
+n02488702 colobus, colobus monkey
+n02489166 proboscis monkey, Nasalis larvatus
+n02490219 marmoset
+n02492035 capuchin, ringtail, Cebus capucinus
+n02492660 howler monkey, howler
+n02493509 titi, titi monkey
+n02493793 spider monkey, Ateles geoffroyi
+n02494079 squirrel monkey, Saimiri sciureus
+n02497673 Madagascar cat, ring-tailed lemur, Lemur catta
+n02500267 indri, indris, Indri indri, Indri brevicaudatus
+n02504013 Indian elephant, Elephas maximus
+n02504458 African elephant, Loxodonta africana
+n02509815 lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens
+n02510455 giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca
+n02514041 barracouta, snoek
+n02526121 eel
+n02536864 coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch
+n02606052 rock beauty, Holocanthus tricolor
+n02607072 anemone fish
+n02640242 sturgeon
+n02641379 gar, garfish, garpike, billfish, Lepisosteus osseus
+n02643566 lionfish
+n02655020 puffer, pufferfish, blowfish, globefish
+n02666196 abacus
+n02667093 abaya
+n02669723 academic gown, academic robe, judge's robe
+n02672831 accordion, piano accordion, squeeze box
+n02676566 acoustic guitar
+n02687172 aircraft carrier, carrier, flattop, attack aircraft carrier
+n02690373 airliner
+n02692877 airship, dirigible
+n02699494 altar
+n02701002 ambulance
+n02704792 amphibian, amphibious vehicle
+n02708093 analog clock
+n02727426 apiary, bee house
+n02730930 apron
+n02747177 ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin
+n02749479 assault rifle, assault gun
+n02769748 backpack, back pack, knapsack, packsack, rucksack, haversack
+n02776631 bakery, bakeshop, bakehouse
+n02777292 balance beam, beam
+n02782093 balloon
+n02783161 ballpoint, ballpoint pen, ballpen, Biro
+n02786058 Band Aid
+n02787622 banjo
+n02788148 bannister, banister, balustrade, balusters, handrail
+n02790996 barbell
+n02791124 barber chair
+n02791270 barbershop
+n02793495 barn
+n02794156 barometer
+n02795169 barrel, cask
+n02797295 barrow, garden cart, lawn cart, wheelbarrow
+n02799071 baseball
+n02802426 basketball
+n02804414 bassinet
+n02804610 bassoon
+n02807133 bathing cap, swimming cap
+n02808304 bath towel
+n02808440 bathtub, bathing tub, bath, tub
+n02814533 beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon
+n02814860 beacon, lighthouse, beacon light, pharos
+n02815834 beaker
+n02817516 bearskin, busby, shako
+n02823428 beer bottle
+n02823750 beer glass
+n02825657 bell cote, bell cot
+n02834397 bib
+n02835271 bicycle-built-for-two, tandem bicycle, tandem
+n02837789 bikini, two-piece
+n02840245 binder, ring-binder
+n02841315 binoculars, field glasses, opera glasses
+n02843684 birdhouse
+n02859443 boathouse
+n02860847 bobsled, bobsleigh, bob
+n02865351 bolo tie, bolo, bola tie, bola
+n02869837 bonnet, poke bonnet
+n02870880 bookcase
+n02871525 bookshop, bookstore, bookstall
+n02877765 bottlecap
+n02879718 bow
+n02883205 bow tie, bow-tie, bowtie
+n02892201 brass, memorial tablet, plaque
+n02892767 brassiere, bra, bandeau
+n02894605 breakwater, groin, groyne, mole, bulwark, seawall, jetty
+n02895154 breastplate, aegis, egis
+n02906734 broom
+n02909870 bucket, pail
+n02910353 buckle
+n02916936 bulletproof vest
+n02917067 bullet train, bullet
+n02927161 butcher shop, meat market
+n02930766 cab, hack, taxi, taxicab
+n02939185 caldron, cauldron
+n02948072 candle, taper, wax light
+n02950826 cannon
+n02951358 canoe
+n02951585 can opener, tin opener
+n02963159 cardigan
+n02965783 car mirror
+n02966193 carousel, carrousel, merry-go-round, roundabout, whirligig
+n02966687 carpenter's kit, tool kit
+n02971356 carton
+n02974003 car wheel
+n02977058 cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM
+n02978881 cassette
+n02979186 cassette player
+n02980441 castle
+n02981792 catamaran
+n02988304 CD player
+n02992211 cello, violoncello
+n02992529 cellular telephone, cellular phone, cellphone, cell, mobile phone
+n02999410 chain
+n03000134 chainlink fence
+n03000247 chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour
+n03000684 chain saw, chainsaw
+n03014705 chest
+n03016953 chiffonier, commode
+n03017168 chime, bell, gong
+n03018349 china cabinet, china closet
+n03026506 Christmas stocking
+n03028079 church, church building
+n03032252 cinema, movie theater, movie theatre, movie house, picture palace
+n03041632 cleaver, meat cleaver, chopper
+n03042490 cliff dwelling
+n03045698 cloak
+n03047690 clog, geta, patten, sabot
+n03062245 cocktail shaker
+n03063599 coffee mug
+n03063689 coffeepot
+n03065424 coil, spiral, volute, whorl, helix
+n03075370 combination lock
+n03085013 computer keyboard, keypad
+n03089624 confectionery, confectionary, candy store
+n03095699 container ship, containership, container vessel
+n03100240 convertible
+n03109150 corkscrew, bottle screw
+n03110669 cornet, horn, trumpet, trump
+n03124043 cowboy boot
+n03124170 cowboy hat, ten-gallon hat
+n03125729 cradle
+n03126707 crane
+n03127747 crash helmet
+n03127925 crate
+n03131574 crib, cot
+n03133878 Crock Pot
+n03134739 croquet ball
+n03141823 crutch
+n03146219 cuirass
+n03160309 dam, dike, dyke
+n03179701 desk
+n03180011 desktop computer
+n03187595 dial telephone, dial phone
+n03188531 diaper, nappy, napkin
+n03196217 digital clock
+n03197337 digital watch
+n03201208 dining table, board
+n03207743 dishrag, dishcloth
+n03207941 dishwasher, dish washer, dishwashing machine
+n03208938 disk brake, disc brake
+n03216828 dock, dockage, docking facility
+n03218198 dogsled, dog sled, dog sleigh
+n03220513 dome
+n03223299 doormat, welcome mat
+n03240683 drilling platform, offshore rig
+n03249569 drum, membranophone, tympan
+n03250847 drumstick
+n03255030 dumbbell
+n03259280 Dutch oven
+n03271574 electric fan, blower
+n03272010 electric guitar
+n03272562 electric locomotive
+n03290653 entertainment center
+n03291819 envelope
+n03297495 espresso maker
+n03314780 face powder
+n03325584 feather boa, boa
+n03337140 file, file cabinet, filing cabinet
+n03344393 fireboat
+n03345487 fire engine, fire truck
+n03347037 fire screen, fireguard
+n03355925 flagpole, flagstaff
+n03372029 flute, transverse flute
+n03376595 folding chair
+n03379051 football helmet
+n03384352 forklift
+n03388043 fountain
+n03388183 fountain pen
+n03388549 four-poster
+n03393912 freight car
+n03394916 French horn, horn
+n03400231 frying pan, frypan, skillet
+n03404251 fur coat
+n03417042 garbage truck, dustcart
+n03424325 gasmask, respirator, gas helmet
+n03425413 gas pump, gasoline pump, petrol pump, island dispenser
+n03443371 goblet
+n03444034 go-kart
+n03445777 golf ball
+n03445924 golfcart, golf cart
+n03447447 gondola
+n03447721 gong, tam-tam
+n03450230 gown
+n03452741 grand piano, grand
+n03457902 greenhouse, nursery, glasshouse
+n03459775 grille, radiator grille
+n03461385 grocery store, grocery, food market, market
+n03467068 guillotine
+n03476684 hair slide
+n03476991 hair spray
+n03478589 half track
+n03481172 hammer
+n03482405 hamper
+n03483316 hand blower, blow dryer, blow drier, hair dryer, hair drier
+n03485407 hand-held computer, hand-held microcomputer
+n03485794 handkerchief, hankie, hanky, hankey
+n03492542 hard disc, hard disk, fixed disk
+n03494278 harmonica, mouth organ, harp, mouth harp
+n03495258 harp
+n03496892 harvester, reaper
+n03498962 hatchet
+n03527444 holster
+n03529860 home theater, home theatre
+n03530642 honeycomb
+n03532672 hook, claw
+n03534580 hoopskirt, crinoline
+n03535780 horizontal bar, high bar
+n03538406 horse cart, horse-cart
+n03544143 hourglass
+n03584254 iPod
+n03584829 iron, smoothing iron
+n03590841 jack-o'-lantern
+n03594734 jean, blue jean, denim
+n03594945 jeep, landrover
+n03595614 jersey, T-shirt, tee shirt
+n03598930 jigsaw puzzle
+n03599486 jinrikisha, ricksha, rickshaw
+n03602883 joystick
+n03617480 kimono
+n03623198 knee pad
+n03627232 knot
+n03630383 lab coat, laboratory coat
+n03633091 ladle
+n03637318 lampshade, lamp shade
+n03642806 laptop, laptop computer
+n03649909 lawn mower, mower
+n03657121 lens cap, lens cover
+n03658185 letter opener, paper knife, paperknife
+n03661043 library
+n03662601 lifeboat
+n03666591 lighter, light, igniter, ignitor
+n03670208 limousine, limo
+n03673027 liner, ocean liner
+n03676483 lipstick, lip rouge
+n03680355 Loafer
+n03690938 lotion
+n03691459 loudspeaker, speaker, speaker unit, loudspeaker system, speaker system
+n03692522 loupe, jeweler's loupe
+n03697007 lumbermill, sawmill
+n03706229 magnetic compass
+n03709823 mailbag, postbag
+n03710193 mailbox, letter box
+n03710637 maillot
+n03710721 maillot, tank suit
+n03717622 manhole cover
+n03720891 maraca
+n03721384 marimba, xylophone
+n03724870 mask
+n03729826 matchstick
+n03733131 maypole
+n03733281 maze, labyrinth
+n03733805 measuring cup
+n03742115 medicine chest, medicine cabinet
+n03743016 megalith, megalithic structure
+n03759954 microphone, mike
+n03761084 microwave, microwave oven
+n03763968 military uniform
+n03764736 milk can
+n03769881 minibus
+n03770439 miniskirt, mini
+n03770679 minivan
+n03773504 missile
+n03775071 mitten
+n03775546 mixing bowl
+n03776460 mobile home, manufactured home
+n03777568 Model T
+n03777754 modem
+n03781244 monastery
+n03782006 monitor
+n03785016 moped
+n03786901 mortar
+n03787032 mortarboard
+n03788195 mosque
+n03788365 mosquito net
+n03791053 motor scooter, scooter
+n03792782 mountain bike, all-terrain bike, off-roader
+n03792972 mountain tent
+n03793489 mouse, computer mouse
+n03794056 mousetrap
+n03796401 moving van
+n03803284 muzzle
+n03804744 nail
+n03814639 neck brace
+n03814906 necklace
+n03825788 nipple
+n03832673 notebook, notebook computer
+n03837869 obelisk
+n03838899 oboe, hautboy, hautbois
+n03840681 ocarina, sweet potato
+n03841143 odometer, hodometer, mileometer, milometer
+n03843555 oil filter
+n03854065 organ, pipe organ
+n03857828 oscilloscope, scope, cathode-ray oscilloscope, CRO
+n03866082 overskirt
+n03868242 oxcart
+n03868863 oxygen mask
+n03871628 packet
+n03873416 paddle, boat paddle
+n03874293 paddlewheel, paddle wheel
+n03874599 padlock
+n03876231 paintbrush
+n03877472 pajama, pyjama, pj's, jammies
+n03877845 palace
+n03884397 panpipe, pandean pipe, syrinx
+n03887697 paper towel
+n03888257 parachute, chute
+n03888605 parallel bars, bars
+n03891251 park bench
+n03891332 parking meter
+n03895866 passenger car, coach, carriage
+n03899768 patio, terrace
+n03902125 pay-phone, pay-station
+n03903868 pedestal, plinth, footstall
+n03908618 pencil box, pencil case
+n03908714 pencil sharpener
+n03916031 perfume, essence
+n03920288 Petri dish
+n03924679 photocopier
+n03929660 pick, plectrum, plectron
+n03929855 pickelhaube
+n03930313 picket fence, paling
+n03930630 pickup, pickup truck
+n03933933 pier
+n03935335 piggy bank, penny bank
+n03937543 pill bottle
+n03938244 pillow
+n03942813 ping-pong ball
+n03944341 pinwheel
+n03947888 pirate, pirate ship
+n03950228 pitcher, ewer
+n03954731 plane, carpenter's plane, woodworking plane
+n03956157 planetarium
+n03958227 plastic bag
+n03961711 plate rack
+n03967562 plow, plough
+n03970156 plunger, plumber's helper
+n03976467 Polaroid camera, Polaroid Land camera
+n03976657 pole
+n03977966 police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria
+n03980874 poncho
+n03982430 pool table, billiard table, snooker table
+n03983396 pop bottle, soda bottle
+n03991062 pot, flowerpot
+n03992509 potter's wheel
+n03995372 power drill
+n03998194 prayer rug, prayer mat
+n04004767 printer
+n04005630 prison, prison house
+n04008634 projectile, missile
+n04009552 projector
+n04019541 puck, hockey puck
+n04023962 punching bag, punch bag, punching ball, punchball
+n04026417 purse
+n04033901 quill, quill pen
+n04033995 quilt, comforter, comfort, puff
+n04037443 racer, race car, racing car
+n04039381 racket, racquet
+n04040759 radiator
+n04041544 radio, wireless
+n04044716 radio telescope, radio reflector
+n04049303 rain barrel
+n04065272 recreational vehicle, RV, R.V.
+n04067472 reel
+n04069434 reflex camera
+n04070727 refrigerator, icebox
+n04074963 remote control, remote
+n04081281 restaurant, eating house, eating place, eatery
+n04086273 revolver, six-gun, six-shooter
+n04090263 rifle
+n04099969 rocking chair, rocker
+n04111531 rotisserie
+n04116512 rubber eraser, rubber, pencil eraser
+n04118538 rugby ball
+n04118776 rule, ruler
+n04120489 running shoe
+n04125021 safe
+n04127249 safety pin
+n04131690 saltshaker, salt shaker
+n04133789 sandal
+n04136333 sarong
+n04141076 sax, saxophone
+n04141327 scabbard
+n04141975 scale, weighing machine
+n04146614 school bus
+n04147183 schooner
+n04149813 scoreboard
+n04152593 screen, CRT screen
+n04153751 screw
+n04154565 screwdriver
+n04162706 seat belt, seatbelt
+n04179913 sewing machine
+n04192698 shield, buckler
+n04200800 shoe shop, shoe-shop, shoe store
+n04201297 shoji
+n04204238 shopping basket
+n04204347 shopping cart
+n04208210 shovel
+n04209133 shower cap
+n04209239 shower curtain
+n04228054 ski
+n04229816 ski mask
+n04235860 sleeping bag
+n04238763 slide rule, slipstick
+n04239074 sliding door
+n04243546 slot, one-armed bandit
+n04251144 snorkel
+n04252077 snowmobile
+n04252225 snowplow, snowplough
+n04254120 soap dispenser
+n04254680 soccer ball
+n04254777 sock
+n04258138 solar dish, solar collector, solar furnace
+n04259630 sombrero
+n04263257 soup bowl
+n04264628 space bar
+n04265275 space heater
+n04266014 space shuttle
+n04270147 spatula
+n04273569 speedboat
+n04275548 spider web, spider's web
+n04277352 spindle
+n04285008 sports car, sport car
+n04286575 spotlight, spot
+n04296562 stage
+n04310018 steam locomotive
+n04311004 steel arch bridge
+n04311174 steel drum
+n04317175 stethoscope
+n04325704 stole
+n04326547 stone wall
+n04328186 stopwatch, stop watch
+n04330267 stove
+n04332243 strainer
+n04335435 streetcar, tram, tramcar, trolley, trolley car
+n04336792 stretcher
+n04344873 studio couch, day bed
+n04346328 stupa, tope
+n04347754 submarine, pigboat, sub, U-boat
+n04350905 suit, suit of clothes
+n04355338 sundial
+n04355933 sunglass
+n04356056 sunglasses, dark glasses, shades
+n04357314 sunscreen, sunblock, sun blocker
+n04366367 suspension bridge
+n04367480 swab, swob, mop
+n04370456 sweatshirt
+n04371430 swimming trunks, bathing trunks
+n04371774 swing
+n04372370 switch, electric switch, electrical switch
+n04376876 syringe
+n04380533 table lamp
+n04389033 tank, army tank, armored combat vehicle, armoured combat vehicle
+n04392985 tape player
+n04398044 teapot
+n04399382 teddy, teddy bear
+n04404412 television, television system
+n04409515 tennis ball
+n04417672 thatch, thatched roof
+n04418357 theater curtain, theatre curtain
+n04423845 thimble
+n04428191 thresher, thrasher, threshing machine
+n04429376 throne
+n04435653 tile roof
+n04442312 toaster
+n04443257 tobacco shop, tobacconist shop, tobacconist
+n04447861 toilet seat
+n04456115 torch
+n04458633 totem pole
+n04461696 tow truck, tow car, wrecker
+n04462240 toyshop
+n04465501 tractor
+n04467665 trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi
+n04476259 tray
+n04479046 trench coat
+n04482393 tricycle, trike, velocipede
+n04483307 trimaran
+n04485082 tripod
+n04486054 triumphal arch
+n04487081 trolleybus, trolley coach, trackless trolley
+n04487394 trombone
+n04493381 tub, vat
+n04501370 turnstile
+n04505470 typewriter keyboard
+n04507155 umbrella
+n04509417 unicycle, monocycle
+n04515003 upright, upright piano
+n04517823 vacuum, vacuum cleaner
+n04522168 vase
+n04523525 vault
+n04525038 velvet
+n04525305 vending machine
+n04532106 vestment
+n04532670 viaduct
+n04536866 violin, fiddle
+n04540053 volleyball
+n04542943 waffle iron
+n04548280 wall clock
+n04548362 wallet, billfold, notecase, pocketbook
+n04550184 wardrobe, closet, press
+n04552348 warplane, military plane
+n04553703 washbasin, handbasin, washbowl, lavabo, wash-hand basin
+n04554684 washer, automatic washer, washing machine
+n04557648 water bottle
+n04560804 water jug
+n04562935 water tower
+n04579145 whiskey jug
+n04579432 whistle
+n04584207 wig
+n04589890 window screen
+n04590129 window shade
+n04591157 Windsor tie
+n04591713 wine bottle
+n04592741 wing
+n04596742 wok
+n04597913 wooden spoon
+n04599235 wool, woolen, woollen
+n04604644 worm fence, snake fence, snake-rail fence, Virginia fence
+n04606251 wreck
+n04612504 yawl
+n04613696 yurt
+n06359193 web site, website, internet site, site
+n06596364 comic book
+n06785654 crossword puzzle, crossword
+n06794110 street sign
+n06874185 traffic light, traffic signal, stoplight
+n07248320 book jacket, dust cover, dust jacket, dust wrapper
+n07565083 menu
+n07579787 plate
+n07583066 guacamole
+n07584110 consomme
+n07590611 hot pot, hotpot
+n07613480 trifle
+n07614500 ice cream, icecream
+n07615774 ice lolly, lolly, lollipop, popsicle
+n07684084 French loaf
+n07693725 bagel, beigel
+n07695742 pretzel
+n07697313 cheeseburger
+n07697537 hotdog, hot dog, red hot
+n07711569 mashed potato
+n07714571 head cabbage
+n07714990 broccoli
+n07715103 cauliflower
+n07716358 zucchini, courgette
+n07716906 spaghetti squash
+n07717410 acorn squash
+n07717556 butternut squash
+n07718472 cucumber, cuke
+n07718747 artichoke, globe artichoke
+n07720875 bell pepper
+n07730033 cardoon
+n07734744 mushroom
+n07742313 Granny Smith
+n07745940 strawberry
+n07747607 orange
+n07749582 lemon
+n07753113 fig
+n07753275 pineapple, ananas
+n07753592 banana
+n07754684 jackfruit, jak, jack
+n07760859 custard apple
+n07768694 pomegranate
+n07802026 hay
+n07831146 carbonara
+n07836838 chocolate sauce, chocolate syrup
+n07860988 dough
+n07871810 meat loaf, meatloaf
+n07873807 pizza, pizza pie
+n07875152 potpie
+n07880968 burrito
+n07892512 red wine
+n07920052 espresso
+n07930864 cup
+n07932039 eggnog
+n09193705 alp
+n09229709 bubble
+n09246464 cliff, drop, drop-off
+n09256479 coral reef
+n09288635 geyser
+n09332890 lakeside, lakeshore
+n09399592 promontory, headland, head, foreland
+n09421951 sandbar, sand bar
+n09428293 seashore, coast, seacoast, sea-coast
+n09468604 valley, vale
+n09472597 volcano
+n09835506 ballplayer, baseball player
+n10148035 groom, bridegroom
+n10565667 scuba diver
+n11879895 rapeseed
+n11939491 daisy
+n12057211 yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum
+n12144580 corn
+n12267677 acorn
+n12620546 hip, rose hip, rosehip
+n12768682 buckeye, horse chestnut, conker
+n12985857 coral fungus
+n12998815 agaric
+n13037406 gyromitra
+n13040303 stinkhorn, carrion fungus
+n13044778 earthstar
+n13052670 hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa
+n13054560 bolete
+n13133613 ear, spike, capitulum
+n15075141 toilet tissue, toilet paper, bathroom tissue
diff --git a/examples/yolact.cpp b/examples/yolact.cpp
new file mode 100644
index 0000000..44e24f3
--- /dev/null
+++ b/examples/yolact.cpp
@@ -0,0 +1,549 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdio.h>
+#include <vector>
+
+struct Object // one detection as filled in by detect_yolact
+{
+    cv::Rect_<float> rect;       // bounding box, in pixels of the input image
+    int label;                   // class index; detect_yolact never emits 0 (background)
+    float prob;                  // score of the winning class
+    std::vector<float> maskdata; // copy of this detection's row of the network's mask output
+    cv::Mat mask;                // CV_8UC1, sized like the input image; 255 where the mask is set inside rect
+};
+
+// Area of the overlap between the bounding boxes of a and b.
+static inline float intersection_area(const Object& a, const Object& b)
+{
+    return (a.rect & b.rect).area();
+}
+
+// Sorts objects[left..right] (both ends inclusive) in place by descending prob.
+static void qsort_descent_inplace(std::vector<Object>& objects, int left, int right)
+{
+    int lo = left;
+    int hi = right;
+    // the pivot score is copied because its element may be swapped away below
+    const float pivot = objects[(left + right) / 2].prob;
+
+    while (lo <= hi)
+    {
+        // skip entries that already sit on the correct side of the pivot
+        while (pivot < objects[lo].prob)
+            ++lo;
+        while (pivot > objects[hi].prob)
+            --hi;
+        if (lo > hi)
+            break;
+
+        std::swap(objects[lo], objects[hi]);
+        ++lo;
+        --hi;
+    }
+
+    // [left, hi] and [lo, right] no longer overlap, so each is sorted in its own OpenMP section
+    #pragma omp parallel sections
+    {
+        #pragma omp section
+        {
+            if (left < hi) qsort_descent_inplace(objects, left, hi);
+        }
+        #pragma omp section
+        {
+            if (lo < right) qsort_descent_inplace(objects, lo, right);
+        }
+    }
+}
+
+// Sorts every element of objects in place by descending prob.
+static void qsort_descent_inplace(std::vector<Object>& objects)
+{
+    // size() - 1 would wrap around for an empty vector, so only recurse when there is data
+    if (!objects.empty())
+        qsort_descent_inplace(objects, 0, objects.size() - 1);
+}
+
+// Greedy NMS over boxes that the caller has sorted best-first: picked receives the indices
+// (into faceobjects) of the boxes to keep. A box is dropped when its IoU with an already
+// picked box exceeds nms_threshold; unless agnostic is set only same-label boxes compete.
+static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
+{
+    picked.clear();
+
+    const int n = faceobjects.size();
+    std::vector<float> areas(n);
+    for (int i = 0; i < n; i++)
+        areas[i] = faceobjects[i].rect.area();
+
+    for (int i = 0; i < n; i++)
+    {
+        const Object& a = faceobjects[i];
+        bool keep = true;
+        for (int j = 0; j < (int)picked.size(); j++)
+        {
+            const Object& b = faceobjects[picked[j]];
+            if (!agnostic && a.label != b.label)
+                continue;
+
+            // intersection over union
+            float inter_area = intersection_area(a, b);
+            float union_area = areas[i] + areas[picked[j]] - inter_area;
+            if (inter_area / union_area > nms_threshold)
+            {
+                keep = false;
+                break; // further picked boxes cannot un-suppress a
+            }
+        }
+
+        if (keep)
+            picked.push_back(i);
+    }
+}
+
+// Runs the YOLACT network on bgr and fills objects with the detections that survive
+// per-class NMS, ordered by descending prob and cut to keep_top_k, each with a binary
+// instance mask the size of bgr.
+// Loads "yolact.param" and "yolact.bin" on every call and calls exit(-1) if either load
+// reports failure. Always returns 0.
+static int detect_yolact(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    ncnn::Net yolact;
+    yolact.opt.use_vulkan_compute = true;
+
+    // original model converted from https://github.com/dbolya/yolact
+    // yolact_resnet50_54_800000.pth
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+    if (yolact.load_param("yolact.param"))
+        exit(-1);
+    if (yolact.load_model("yolact.bin"))
+        exit(-1);
+
+    const int target_size = 550; // side length the image is resized to before inference
+    int img_w = bgr.cols;
+    int img_h = bgr.rows;
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, target_size, target_size);
+
+    const float mean_vals[3] = {123.68f, 116.78f, 103.94f};
+    const float norm_vals[3] = {1.0 / 58.40f, 1.0 / 57.12f, 1.0 / 57.38f};
+    in.substract_mean_normalize(mean_vals, norm_vals);
+
+    ncnn::Extractor ex = yolact.create_extractor();
+    ex.input("input.1", in);
+
+    ncnn::Mat maskmaps;
+    ncnn::Mat location;
+    ncnn::Mat mask;
+    ncnn::Mat confidence;
+
+    ex.extract("619", maskmaps); // 138x138 x 32
+    ex.extract("816", location);   // 4 x 19248
+    ex.extract("818", mask);       // maskdim 32 x 19248
+    ex.extract("820", confidence); // 81 x 19248
+
+    int num_class = confidence.w;
+    int num_priors = confidence.h;
+
+    // make priorbox: one (cx, cy, w, h) row per prior; cx, cy are grid fractions, w, h are divided by 550
+    // NOTE(review): always writes (69*69 + 35*35 + 18*18 + 9*9 + 5*5) * 3 = 19248 rows while priorbox has num_priors rows -- confirm they match
+    ncnn::Mat priorbox(4, num_priors);
+    {
+        const int conv_ws[5] = {69, 35, 18, 9, 5};
+        const int conv_hs[5] = {69, 35, 18, 9, 5};
+
+        const float aspect_ratios[3] = {1.f, 0.5f, 2.f};
+        const float scales[5] = {24.f, 48.f, 96.f, 192.f, 384.f};
+
+        float* pb = priorbox; // write cursor, advanced by 4 floats per prior
+
+        for (int p = 0; p < 5; p++)
+        {
+            int conv_w = conv_ws[p];
+            int conv_h = conv_hs[p];
+            float scale = scales[p];
+
+            for (int i = 0; i < conv_h; i++)
+            {
+                for (int j = 0; j < conv_w; j++)
+                {
+                    // +0.5 because priors are in center-size notation
+                    float cx = (j + 0.5f) / conv_w;
+                    float cy = (i + 0.5f) / conv_h;
+
+                    for (int k = 0; k < 3; k++)
+                    {
+                        float ar = aspect_ratios[k];
+                        ar = sqrt(ar);
+
+                        float w = scale * ar / 550;
+                        float h = scale / ar / 550;
+
+                        // This is for backward compatibility with a bug where I made everything square by accident
+                        // cfg.backbone.use_square_anchors:
+                        h = w;
+
+                        pb[0] = cx;
+                        pb[1] = cy;
+                        pb[2] = w;
+                        pb[3] = h;
+
+                        pb += 4;
+                    }
+                }
+            }
+        }
+    }
+
+    const float confidence_thresh = 0.05f; // a prior's best class score must exceed this
+    const float nms_threshold = 0.5f;      // IoU above which a same-class box is suppressed
+    const int keep_top_k = 200;            // upper bound on the number of returned objects
+
+    std::vector<std::vector<Object> > class_candidates; // candidates bucketed by class index
+    class_candidates.resize(num_class);
+
+    for (int i = 0; i < num_priors; i++)
+    {
+        const float* conf = confidence.row(i);
+        const float* loc = location.row(i);
+        const float* pb = priorbox.row(i);
+        const float* maskdata = mask.row(i);
+
+        // find class id with highest score
+        // start from 1 to skip background
+        int label = 0;
+        float score = 0.f;
+        for (int j = 1; j < num_class; j++)
+        {
+            float class_score = conf[j];
+            if (class_score > score)
+            {
+                label = j;
+                score = class_score;
+            }
+        }
+
+        // ignore background or low score
+        if (label == 0 || score <= confidence_thresh)
+            continue;
+
+        // CENTER_SIZE: loc holds offsets relative to the prior box, scaled by the factors in var
+        float var[4] = {0.1f, 0.1f, 0.2f, 0.2f};
+
+        float pb_cx = pb[0];
+        float pb_cy = pb[1];
+        float pb_w = pb[2];
+        float pb_h = pb[3];
+
+        float bbox_cx = var[0] * loc[0] * pb_w + pb_cx;
+        float bbox_cy = var[1] * loc[1] * pb_h + pb_cy;
+        float bbox_w = (float)(exp(var[2] * loc[2]) * pb_w);
+        float bbox_h = (float)(exp(var[3] * loc[3]) * pb_h);
+
+        float obj_x1 = bbox_cx - bbox_w * 0.5f;
+        float obj_y1 = bbox_cy - bbox_h * 0.5f;
+        float obj_x2 = bbox_cx + bbox_w * 0.5f;
+        float obj_y2 = bbox_cy + bbox_h * 0.5f;
+
+        // scale to input-image pixels and clip to [0, cols - 1] x [0, rows - 1]
+        obj_x1 = std::max(std::min(obj_x1 * bgr.cols, (float)(bgr.cols - 1)), 0.f);
+        obj_y1 = std::max(std::min(obj_y1 * bgr.rows, (float)(bgr.rows - 1)), 0.f);
+        obj_x2 = std::max(std::min(obj_x2 * bgr.cols, (float)(bgr.cols - 1)), 0.f);
+        obj_y2 = std::max(std::min(obj_y2 * bgr.rows, (float)(bgr.rows - 1)), 0.f);
+
+        // append object
+        Object obj;
+        obj.rect = cv::Rect_<float>(obj_x1, obj_y1, obj_x2 - obj_x1 + 1, obj_y2 - obj_y1 + 1); // width/height count both end pixels (+ 1)
+        obj.label = label;
+        obj.prob = score;
+        obj.maskdata = std::vector<float>(maskdata, maskdata + mask.w);
+
+        class_candidates[label].push_back(obj);
+    }
+
+    // per class: sort candidates by descending score, run NMS, keep the survivors
+    objects.clear();
+    for (int i = 0; i < (int)class_candidates.size(); i++)
+    {
+        std::vector<Object>& candidates = class_candidates[i];
+
+        qsort_descent_inplace(candidates);
+        std::vector<int> picked;
+        nms_sorted_bboxes(candidates, picked, nms_threshold);
+
+        for (int j = 0; j < (int)picked.size(); j++)
+        {
+            int z = picked[j];
+            objects.push_back(candidates[z]);
+        }
+    }
+
+    qsort_descent_inplace(objects); // order the merged per-class survivors by descending prob
+
+    // keep_top_k
+    if (keep_top_k < (int)objects.size())
+    {
+        objects.resize(keep_top_k);
+    }
+
+    // generate mask: per object, the maskmaps channels summed with its maskdata coefficients as weights
+    for (int i = 0; i < (int)objects.size(); i++)
+    {
+        Object& obj = objects[i];
+
+        cv::Mat mask(maskmaps.h, maskmaps.w, CV_32FC1); // local accumulator; shadows the ncnn::Mat mask above
+        {
+            mask = cv::Scalar(0.f);
+
+            for (int p = 0; p < maskmaps.c; p++)
+            {
+                const float* maskmap = maskmaps.channel(p);
+                float coeff = obj.maskdata[p]; // assumes mask.w >= maskmaps.c -- TODO confirm
+                float* mp = (float*)mask.data;
+
+                // mask += m * coeff
+                for (int j = 0; j < maskmaps.w * maskmaps.h; j++)
+                {
+                    mp[j] += maskmap[j] * coeff;
+                }
+            }
+        }
+
+        cv::Mat mask2; // mask resized to the input image size
+        cv::resize(mask, mask2, cv::Size(img_w, img_h));
+
+        // crop obj box and binarize: 255 where the resized value is above 0.5 inside rect, 0 everywhere else
+        obj.mask = cv::Mat(img_h, img_w, CV_8UC1);
+        {
+            obj.mask = cv::Scalar(0);
+
+            for (int y = 0; y < img_h; y++)
+            {
+                if (y < obj.rect.y || y > obj.rect.y + obj.rect.height)
+                    continue;
+
+                const float* mp2 = mask2.ptr<const float>(y);
+                uchar* bmp = obj.mask.ptr<uchar>(y);
+
+                for (int x = 0; x < img_w; x++)
+                {
+                    if (x < obj.rect.x || x > obj.rect.x + obj.rect.width)
+                        continue;
+
+                    bmp[x] = mp2[x] > 0.5f ? 255 : 0;
+                }
+            }
+        }
+    }
+
+    return 0;
+}
+
+// Draws the objects onto a copy of bgr, skipping those with prob < 0.15: box outline,
+// "<class> <percent>%" caption and a half-transparent mask overlay, with colors taken in turn
+// from a fixed palette. Logs each drawn object to stderr, writes the picture to "result.png",
+// shows it in a window named "image" and blocks in cv::waitKey(0).
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    static const char* class_names[] = {"background",
+                                        "person", "bicycle", "car", "motorcycle", "airplane", "bus",
+                                        "train", "truck", "boat", "traffic light", "fire hydrant",
+                                        "stop sign", "parking meter", "bench", "bird", "cat", "dog",
+                                        "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
+                                        "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
+                                        "skis", "snowboard", "sports ball", "kite", "baseball bat",
+                                        "baseball glove", "skateboard", "surfboard", "tennis racket",
+                                        "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
+                                        "banana", "apple", "sandwich", "orange", "broccoli", "carrot",
+                                        "hot dog", "pizza", "donut", "cake", "chair", "couch",
+                                        "potted plant", "bed", "dining table", "toilet", "tv", "laptop",
+                                        "mouse", "remote", "keyboard", "cell phone", "microwave", "oven",
+                                        "toaster", "sink", "refrigerator", "book", "clock", "vase",
+                                        "scissors", "teddy bear", "hair drier", "toothbrush"
+                                       };
+
+    static const unsigned char colors[81][3] = {
+        {56, 0, 255},
+        {226, 255, 0},
+        {0, 94, 255},
+        {0, 37, 255},
+        {0, 255, 94},
+        {255, 226, 0},
+        {0, 18, 255},
+        {255, 151, 0},
+        {170, 0, 255},
+        {0, 255, 56},
+        {255, 0, 75},
+        {0, 75, 255},
+        {0, 255, 169},
+        {255, 0, 207},
+        {75, 255, 0},
+        {207, 0, 255},
+        {37, 0, 255},
+        {0, 207, 255},
+        {94, 0, 255},
+        {0, 255, 113},
+        {255, 18, 0},
+        {255, 0, 56},
+        {18, 0, 255},
+        {0, 255, 226},
+        {170, 255, 0},
+        {255, 0, 245},
+        {151, 255, 0},
+        {132, 255, 0},
+        {75, 0, 255},
+        {151, 0, 255},
+        {0, 151, 255},
+        {132, 0, 255},
+        {0, 255, 245},
+        {255, 132, 0},
+        {226, 0, 255},
+        {255, 37, 0},
+        {207, 255, 0},
+        {0, 255, 207},
+        {94, 255, 0},
+        {0, 226, 255},
+        {56, 255, 0},
+        {255, 94, 0},
+        {255, 113, 0},
+        {0, 132, 255},
+        {255, 0, 132},
+        {255, 170, 0},
+        {255, 0, 188},
+        {113, 255, 0},
+        {245, 0, 255},
+        {113, 0, 255},
+        {255, 188, 0},
+        {0, 113, 255},
+        {255, 0, 0},
+        {0, 56, 255},
+        {255, 0, 113},
+        {0, 255, 188},
+        {255, 0, 94},
+        {255, 0, 18},
+        {18, 255, 0},
+        {0, 255, 132},
+        {0, 188, 255},
+        {0, 245, 255},
+        {0, 169, 255},
+        {37, 255, 0},
+        {255, 0, 151},
+        {188, 0, 255},
+        {0, 255, 37},
+        {0, 255, 0},
+        {255, 0, 170},
+        {255, 0, 37},
+        {255, 75, 0},
+        {0, 0, 255},
+        {255, 207, 0},
+        {255, 0, 226},
+        {255, 245, 0},
+        {188, 255, 0},
+        {0, 255, 18},
+        {0, 255, 75},
+        {0, 255, 151},
+        {255, 56, 0},
+        {245, 255, 0}
+    };
+
+    cv::Mat canvas = bgr.clone();
+    int color_index = 0;
+
+    for (size_t idx = 0; idx < objects.size(); idx++)
+    {
+        const Object& obj = objects[idx];
+        if (obj.prob < 0.15)
+            continue;
+
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+
+        // palette entries are handed out in drawing order, not by class
+        const unsigned char* color = colors[color_index++ % 81];
+        cv::rectangle(canvas, obj.rect, cv::Scalar(color[0], color[1], color[2]));
+
+        char text[256];
+        sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100);
+
+        int baseLine = 0;
+        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+        // caption sits above the box, pushed back inside the image at the top and right edges
+        int x = obj.rect.x;
+        int y = obj.rect.y - label_size.height - baseLine;
+        if (y < 0)
+            y = 0;
+        if (x + label_size.width > canvas.cols)
+            x = canvas.cols - label_size.width;
+
+        cv::rectangle(canvas, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                      cv::Scalar(255, 255, 255), -1);
+
+        cv::putText(canvas, text, cv::Point(x, y + label_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+
+        // draw mask: blend 50/50 with the object's color wherever the mask is set
+        for (int row = 0; row < canvas.rows; row++)
+        {
+            const uchar* mask_row = obj.mask.ptr(row);
+            uchar* pixel = canvas.ptr(row);
+            for (int col = 0; col < canvas.cols; col++, pixel += 3)
+            {
+                if (mask_row[col] != 255)
+                    continue;
+
+                for (int c = 0; c < 3; c++)
+                    pixel[c] = cv::saturate_cast<uchar>(pixel[c] * 0.5 + color[c] * 0.5);
+            }
+        }
+    }
+
+    cv::imwrite("result.png", canvas);
+    cv::imshow("image", canvas);
+    cv::waitKey(0);
+}
+
+// Usage: <program> imagepath
+// Loads the image, runs detect_yolact on it and passes the detections to draw_objects.
+// Returns -1 on a wrong argument count or an unreadable image, 0 otherwise.
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* const image_file = argv[1];
+    cv::Mat input = cv::imread(image_file, 1);
+    if (input.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", image_file);
+        return -1;
+    }
+
+    std::vector<Object> detections;
+    detect_yolact(input, detections);
+    draw_objects(input, detections);
+    return 0;
+}
diff --git a/examples/yolov2.cpp b/examples/yolov2.cpp
new file mode 100644
index 0000000..111040f
--- /dev/null
+++ b/examples/yolov2.cpp
@@ -0,0 +1,158 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdio.h>
+#include <vector>
+
+struct Object // one detection row as decoded by detect_yolov2
+{
+    cv::Rect_<float> rect; // box in pixels of the input image (x, y, width, height)
+    int label;             // class index reported by the network (values[0])
+    float prob;            // score reported by the network (values[1])
+};
+
+// Runs the MobileNet-YOLOv2 network on bgr and replaces the contents of objects with one
+// entry per row of the "detection_out" blob, boxes scaled to the pixel size of bgr.
+// Loads "mobilenet_yolo.param" and "mobilenet_yolo.bin" on every call and calls exit(-1)
+// if either load reports failure. Always returns 0.
+static int detect_yolov2(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    ncnn::Net yolov2;
+    yolov2.opt.use_vulkan_compute = true;
+
+    // original pretrained model from https://github.com/eric612/MobileNet-YOLO
+    // https://github.com/eric612/MobileNet-YOLO/blob/master/models/yolov2/mobilenet_yolo_deploy.prototxt
+    // https://github.com/eric612/MobileNet-YOLO/blob/master/models/yolov2/mobilenet_yolo_deploy_iter_80000.caffemodel
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+    if (yolov2.load_param("mobilenet_yolo.param") || yolov2.load_model("mobilenet_yolo.bin"))
+        exit(-1);
+
+    const int target_size = 416;
+    const int img_w = bgr.cols;
+    const int img_h = bgr.rows;
+
+    // stretch to target_size x target_size; the pixel layout is passed through as BGR
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, img_w, img_h, target_size, target_size);
+
+    // the Caffe-YOLOv2-Windows style
+    // X' = X * scale - mean, applied as two passes: scale only, then mean subtraction only
+    const float norm_vals[3] = {0.007843f, 0.007843f, 0.007843f};
+    const float mean_vals[3] = {1.0f, 1.0f, 1.0f};
+    in.substract_mean_normalize(0, norm_vals);
+    in.substract_mean_normalize(mean_vals, 0);
+
+    ncnn::Extractor ex = yolov2.create_extractor();
+    ex.input("data", in);
+
+    ncnn::Mat out;
+    ex.extract("detection_out", out);
+
+    objects.clear();
+    for (int r = 0; r < out.h; r++)
+    {
+        // row layout as read here: label, prob, x1, y1, x2, y2; the coordinates are multiplied
+        // by the image size, so presumably they arrive normalized to [0, 1]
+        const float* values = out.row(r);
+        const float x1 = values[2] * img_w;
+        const float y1 = values[3] * img_h;
+
+        Object object;
+        object.label = values[0];
+        object.prob = values[1];
+        object.rect = cv::Rect_<float>(x1, y1, values[4] * img_w - x1, values[5] * img_h - y1);
+
+        objects.push_back(object);
+    }
+
+    return 0;
+}
+
+// Draws each object on a copy of bgr -- box outline in cv::Scalar(255, 0, 0) and a
+// "<class> <percent>%" caption on a white patch -- logging it to stderr as it goes, then
+// shows the picture in a window named "image" and blocks in cv::waitKey(0).
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    static const char* class_names[] = {"background",
+                                        "aeroplane", "bicycle", "bird", "boat",
+                                        "bottle", "bus", "car", "cat", "chair",
+                                        "cow", "diningtable", "dog", "horse",
+                                        "motorbike", "person", "pottedplant",
+                                        "sheep", "sofa", "train", "tvmonitor"
+                                       };
+
+    cv::Mat canvas = bgr.clone();
+    for (size_t n = 0; n < objects.size(); n++)
+    {
+        const Object& obj = objects[n];
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+
+        cv::rectangle(canvas, obj.rect, cv::Scalar(255, 0, 0));
+        char text[256];
+        sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100);
+
+        int baseLine = 0;
+        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+        // caption sits above the box, pushed back inside the image at the top and right edges
+        int y = obj.rect.y - label_size.height - baseLine;
+        if (y < 0)
+            y = 0;
+        int x = obj.rect.x;
+        if (x + label_size.width > canvas.cols)
+            x = canvas.cols - label_size.width;
+
+        cv::rectangle(canvas, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                      cv::Scalar(255, 255, 255), -1);
+        cv::putText(canvas, text, cv::Point(x, y + label_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", canvas);
+    cv::waitKey(0);
+}
+
+// Usage: <program> imagepath
+// Loads the image, runs detect_yolov2 on it and passes the detections to draw_objects.
+// Returns -1 on a wrong argument count or an unreadable image, 0 otherwise.
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* const image_file = argv[1];
+    cv::Mat input = cv::imread(image_file, 1);
+    if (input.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", image_file);
+        return -1;
+    }
+
+    std::vector<Object> detections;
+    detect_yolov2(input, detections);
+    draw_objects(input, detections);
+    return 0;
+}
diff --git a/examples/yolov3.cpp b/examples/yolov3.cpp
new file mode 100644
index 0000000..0417c05
--- /dev/null
+++ b/examples/yolov3.cpp
@@ -0,0 +1,155 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdio.h>
+#include <vector>
+
+struct Object // one detection produced by detect_yolov3
+{
+    cv::Rect_<float> rect; // box as x, y, width, height, scaled to the pixel size of the source image
+    int label; // class id read from the network output; draw_objects uses it to index class_names
+    float prob; // detection score; shown multiplied by 100 as a percentage
+};
+
+// Loads the MobileNetV2-YOLOv3 model (relative file names), runs it on `bgr` and replaces
+// the contents of `objects` with one entry per row of the "detection_out" blob.
+// Exits the process if either model file fails to load; otherwise returns 0.
+static int detect_yolov3(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    // original pretrained model from https://github.com/eric612/MobileNet-YOLO
+    // param : https://drive.google.com/open?id=1V9oKHP6G6XvXZqhZbzNKL6FI_clRWdC-
+    // bin : https://drive.google.com/open?id=1DBcuFCr-856z3FRQznWL_S5h-Aj3RawA
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+    ncnn::Net net;
+    net.opt.use_vulkan_compute = true;
+
+    if (net.load_param("mobilenetv2_yolov3.param") || net.load_model("mobilenetv2_yolov3.bin"))
+        exit(-1);
+
+    const int target_size = 352;
+    const int img_w = bgr.cols;
+    const int img_h = bgr.rows;
+
+    // resize to the square network input, then normalise with mean 127.5 and scale 0.007843
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, img_w, img_h, target_size, target_size);
+    const float mean_vals[3] = {127.5f, 127.5f, 127.5f};
+    const float norm_vals[3] = {0.007843f, 0.007843f, 0.007843f};
+    in.substract_mean_normalize(mean_vals, norm_vals);
+
+    ncnn::Extractor ex = net.create_extractor();
+    ex.input("data", in);
+
+    ncnn::Mat out;
+    ex.extract("detection_out", out);
+
+    // a row is read as: label, score, x0, y0, x1, y1 with the corners multiplied by the image size
+    objects.clear();
+    for (int row = 0; row < out.h; ++row)
+    {
+        const float* v = out.row(row);
+        const float left = v[2] * img_w;
+        const float top = v[3] * img_h;
+
+        Object det;
+        det.label = v[0];
+        det.prob = v[1];
+        det.rect.x = left;
+        det.rect.y = top;
+        det.rect.width = v[4] * img_w - left;
+        det.rect.height = v[5] * img_h - top;
+
+        objects.push_back(det);
+    }
+
+    return 0;
+}
+
+// Logs each detection to stderr, draws its box and "<class> <score>%" caption on a copy of bgr, then shows it.
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    static const char* class_names[] = {"background",
+                                        "aeroplane", "bicycle", "bird", "boat",
+                                        "bottle", "bus", "car", "cat", "chair",
+                                        "cow", "diningtable", "dog", "horse",
+                                        "motorbike", "person", "pottedplant",
+                                        "sheep", "sofa", "train", "tvmonitor"
+                                       };
+    const int num_classes = (int)(sizeof(class_names) / sizeof(class_names[0]));
+
+    cv::Mat image = bgr.clone();
+    for (size_t i = 0; i < objects.size(); i++)
+    {
+        const Object& obj = objects[i];
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));
+
+        // a label outside the table would index past class_names, so fall back to a placeholder
+        const char* name = (obj.label >= 0 && obj.label < num_classes) ? class_names[obj.label] : "unknown";
+        char text[256];
+        sprintf(text, "%s %.1f%%", name, obj.prob * 100);
+        int baseLine = 0;
+        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+        // the caption goes just above the box, clamped to the top and right image edges
+        int x = obj.rect.x;
+        int y = obj.rect.y - label_size.height - baseLine;
+        if (y < 0)
+            y = 0;
+        if (x + label_size.width > image.cols)
+            x = image.cols - label_size.width;
+
+        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                      cv::Scalar(255, 255, 255), -1);
+        cv::putText(image, text, cv::Point(x, y + label_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+int main(int argc, char** argv) // expects exactly one argument: the image path
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    // load the image to analyse
+    const char* const image_path = argv[1];
+    cv::Mat picture = cv::imread(image_path, 1);
+    const bool load_failed = picture.empty();
+    if (load_failed)
+    {
+        fprintf(stderr, "cv::imread %s failed\n", image_path);
+        return -1;
+    }
+
+    // detect, then draw; the status returned by detect_yolov3 is not inspected
+    std::vector<Object> found;
+    detect_yolov3(picture, found);
+    draw_objects(picture, found);
+    return 0;
+}
diff --git a/examples/yolov4.cpp b/examples/yolov4.cpp
new file mode 100644
index 0000000..764ce70
--- /dev/null
+++ b/examples/yolov4.cpp
@@ -0,0 +1,304 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "net.h"
+
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+
+#if CV_MAJOR_VERSION >= 3
+#include <opencv2/videoio/videoio.hpp>
+#endif
+
+#include <stdio.h>
+#include <string.h>
+#include <vector>
+
+#define NCNN_PROFILING
+#define YOLOV4_TINY //Using yolov4_tiny, if undef, using original yolov4
+
+#ifdef NCNN_PROFILING
+#include "benchmark.h"
+#endif
+
+struct Object // one detection produced by detect_yolov4
+{
+    cv::Rect_<float> rect; // box as x, y, width, height, scaled to the pixel size of the source frame
+    int label; // class id read from the network output; draw_objects uses it to index class_names
+    float prob; // detection score; shown multiplied by 100 as a percentage
+};
+
+// Applies the inference options below to `yolov4`, then loads the model chosen at compile time
+// (the tiny variant when YOLOV4_TINY is defined) and stores its input size in *target_size.
+// Returns 0 on success, otherwise the non-zero status reported by load_param/load_model.
+static int init_yolov4(ncnn::Net* yolov4, int* target_size)
+{
+    /* --> Set the params you need for the ncnn inference <-- */
+
+    yolov4->opt.num_threads = 4; //You need to compile with libgomp for multi thread support
+    yolov4->opt.use_vulkan_compute = true; //You need to compile with libvulkan for gpu support
+    yolov4->opt.use_winograd_convolution = true;
+    yolov4->opt.use_sgemm_convolution = true;
+    yolov4->opt.use_fp16_packed = true;
+    yolov4->opt.use_fp16_storage = true;
+    yolov4->opt.use_fp16_arithmetic = true;
+    yolov4->opt.use_packing_layout = true;
+    yolov4->opt.use_shader_pack8 = false;
+    yolov4->opt.use_image_storage = false;
+
+    /* --> End of setting params <-- */
+
+    // original pretrained model from https://github.com/AlexeyAB/darknet
+    // the ncnn model https://drive.google.com/drive/folders/1YzILvh0SKQPS_lrb33dmGNq7aVTKPWS0?usp=sharing
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+#ifdef YOLOV4_TINY
+    const char* yolov4_param = "yolov4-tiny-opt.param";
+    const char* yolov4_model = "yolov4-tiny-opt.bin";
+    *target_size = 416;
+#else
+    const char* yolov4_param = "yolov4-opt.param";
+    const char* yolov4_model = "yolov4-opt.bin";
+    *target_size = 608;
+#endif
+
+    // hand load failures back to the caller (main reports them) instead of exiting here
+    const int ret = yolov4->load_param(yolov4_param);
+    if (ret != 0)
+        return ret;
+
+    return yolov4->load_model(yolov4_model);
+}
+
+// Runs the initialised net on bgr and replaces `objects` with one entry per "output" row; always returns 0.
+static int detect_yolov4(const cv::Mat& bgr, std::vector<Object>& objects, int target_size, ncnn::Net* yolov4)
+{
+    const int img_w = bgr.cols;
+    const int img_h = bgr.rows;
+
+    // resize to target_size x target_size with BGR->RGB conversion, then scale every value by 1/255
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, target_size, target_size);
+    const float mean_vals[3] = {0, 0, 0};
+    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
+    in.substract_mean_normalize(mean_vals, norm_vals);
+
+    ncnn::Extractor ex = yolov4->create_extractor();
+    ex.input("data", in);
+    ncnn::Mat out;
+    ex.extract("output", out);
+
+    // a row is read as: label, score, x0, y0, x1, y1 with the corners multiplied by the image size
+    objects.clear();
+    for (int row = 0; row < out.h; ++row)
+    {
+        const float* v = out.row(row);
+        const float left = v[2] * img_w;
+        const float top = v[3] * img_h;
+        Object det;
+        det.label = v[0];
+        det.prob = v[1];
+        det.rect.x = left;
+        det.rect.y = top;
+        det.rect.width = v[4] * img_w - left;
+        det.rect.height = v[5] * img_h - top;
+        objects.push_back(det);
+    }
+
+    return 0;
+}
+
+// Logs each detection to stderr and draws its box and "<class> <score>%" caption on a copy of bgr.
+// The result is shown with a 1 ms key wait when is_streaming is non-zero, else until a key is pressed. Returns 0.
+static int draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects, int is_streaming)
+{
+    static const char* class_names[] = {"background", "person", "bicycle",
+                                        "car", "motorbike", "aeroplane", "bus", "train", "truck",
+                                        "boat", "traffic light", "fire hydrant", "stop sign",
+                                        "parking meter", "bench", "bird", "cat", "dog", "horse",
+                                        "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
+                                        "backpack", "umbrella", "handbag", "tie", "suitcase",
+                                        "frisbee", "skis", "snowboard", "sports ball", "kite",
+                                        "baseball bat", "baseball glove", "skateboard", "surfboard",
+                                        "tennis racket", "bottle", "wine glass", "cup", "fork",
+                                        "knife", "spoon", "bowl", "banana", "apple", "sandwich",
+                                        "orange", "broccoli", "carrot", "hot dog", "pizza", "donut",
+                                        "cake", "chair", "sofa", "pottedplant", "bed", "diningtable",
+                                        "toilet", "tvmonitor", "laptop", "mouse", "remote", "keyboard",
+                                        "cell phone", "microwave", "oven", "toaster", "sink",
+                                        "refrigerator", "book", "clock", "vase", "scissors",
+                                        "teddy bear", "hair drier", "toothbrush"
+                                       };
+    const int num_classes = (int)(sizeof(class_names) / sizeof(class_names[0]));
+
+    cv::Mat image = bgr.clone();
+
+    for (size_t i = 0; i < objects.size(); i++)
+    {
+        const Object& obj = objects[i];
+
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+
+        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));
+
+        // a label outside the table would index past class_names, so fall back to a placeholder
+        const char* name = (obj.label >= 0 && obj.label < num_classes) ? class_names[obj.label] : "unknown";
+        char text[256];
+        sprintf(text, "%s %.1f%%", name, obj.prob * 100);
+
+        int baseLine = 0;
+        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+        // the caption goes just above the box, clamped to the top and right image edges
+        int x = obj.rect.x;
+        int y = obj.rect.y - label_size.height - baseLine;
+        if (y < 0)
+            y = 0;
+        if (x + label_size.width > image.cols)
+            x = image.cols - label_size.width;
+
+        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                      cv::Scalar(255, 255, 255), -1);
+
+        cv::putText(image, text, cv::Point(x, y + label_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", image);
+
+    // streaming: short wait so the next frame can be processed; still image: wait for a key press
+    cv::waitKey(is_streaming ? 1 : 0);
+
+    return 0;
+}
+
+// Runs YOLOv4 once on an image file or, when the argument contains "/dev/video", on every
+// frame of that capture device. Returns 0 for a processed still image and -1 on any failure.
+int main(int argc, char** argv)
+{
+    cv::Mat frame;
+    std::vector<Object> objects;
+
+    cv::VideoCapture cap;
+    ncnn::Net yolov4;
+    const char* devicepath;
+
+    int target_size = 0;
+    int is_streaming = 0;
+
+    if (argc < 2)
+    {
+        fprintf(stderr, "Usage: %s [v4l input device or image]\n", argv[0]);
+        return -1;
+    }
+
+    devicepath = argv[1];
+
+#ifdef NCNN_PROFILING
+    double t_load_start = ncnn::get_current_time();
+#endif
+
+    int ret = init_yolov4(&yolov4, &target_size); //We load model and param first!
+    if (ret != 0)
+    {
+        fprintf(stderr, "Failed to load model or param, error %d\n", ret);
+        return -1;
+    }
+
+#ifdef NCNN_PROFILING
+    double t_load_end = ncnn::get_current_time();
+    fprintf(stdout, "NCNN Init time %.02lfms\n", t_load_end - t_load_start);
+#endif
+
+    // an argument without "/dev/video" in it is treated as an image file
+    if (strstr(devicepath, "/dev/video") == NULL)
+    {
+        frame = cv::imread(argv[1], 1);
+        if (frame.empty())
+        {
+            fprintf(stderr, "Failed to read image %s.\n", argv[1]);
+            return -1;
+        }
+    }
+    else
+    {
+        cap.open(devicepath);
+
+        if (!cap.isOpened())
+        {
+            fprintf(stderr, "Failed to open %s\n", devicepath);
+            return -1;
+        }
+
+        // grab one frame up front to check that the device delivers data
+        cap >> frame;
+
+        if (frame.empty())
+        {
+            fprintf(stderr, "Failed to read from device %s.\n", devicepath);
+            return -1;
+        }
+
+        is_streaming = 1;
+    }
+
+    while (1)
+    {
+        if (is_streaming)
+        {
+#ifdef NCNN_PROFILING
+            double t_capture_start = ncnn::get_current_time();
+#endif
+
+            cap >> frame;
+
+#ifdef NCNN_PROFILING
+            double t_capture_end = ncnn::get_current_time();
+            fprintf(stdout, "NCNN OpenCV capture time %.02lfms\n", t_capture_end - t_capture_start);
+#endif
+            if (frame.empty())
+            {
+                fprintf(stderr, "OpenCV Failed to Capture from device %s\n", devicepath);
+                return -1;
+            }
+        }
+
+#ifdef NCNN_PROFILING
+        double t_detect_start = ncnn::get_current_time();
+#endif
+
+        detect_yolov4(frame, objects, target_size, &yolov4); //Create an extractor and run detection
+
+#ifdef NCNN_PROFILING
+        double t_detect_end = ncnn::get_current_time();
+        fprintf(stdout, "NCNN detection time %.02lfms\n", t_detect_end - t_detect_start);
+#endif
+
+#ifdef NCNN_PROFILING
+        double t_draw_start = ncnn::get_current_time();
+#endif
+
+        draw_objects(frame, objects, is_streaming); //Draw detection results on opencv image
+
+#ifdef NCNN_PROFILING
+        double t_draw_end = ncnn::get_current_time();
+        fprintf(stdout, "NCNN OpenCV draw result time %.02lfms\n", t_draw_end - t_draw_start);
+#endif
+
+        if (!is_streaming)
+        {   //If it is a still image, exit!
+            return 0;
+        }
+    }
+
+    return 0;
+}
diff --git a/examples/yolov5.cpp b/examples/yolov5.cpp
new file mode 100644
index 0000000..88f6db2
--- /dev/null
+++ b/examples/yolov5.cpp
@@ -0,0 +1,521 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "layer.h"
+#include "net.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <float.h>
+#include <stdio.h>
+#include <vector>
+
+//#define YOLOV5_V60 1 //YOLOv5 v6.0
+#define YOLOV5_V62 1 //YOLOv5 v6.2 export  onnx model method https://github.com/shaoshengsong/yolov5_62_export_ncnn
+
+#if YOLOV5_V60 || YOLOV5_V62
+#define MAX_STRIDE 64
+#else
+#define MAX_STRIDE 32
+// Custom layer that moves each 2x2 spatial neighbourhood into channels: the output blob has
+// half the width and height of the input and four times as many channels.
+class YoloV5Focus : public ncnn::Layer
+{
+public:
+    YoloV5Focus()
+    {
+        one_blob_only = true;
+    }
+
+    // Returns 0 on success, -100 when the output blob could not be created.
+    virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob, const ncnn::Option& opt) const
+    {
+        const int w = bottom_blob.w;
+        const int channels = bottom_blob.c;
+
+        const int outw = w / 2;
+        const int outh = bottom_blob.h / 2;
+        const int outc = channels * 4;
+
+        top_blob.create(outw, outh, outc, 4u, 1, opt.blob_allocator);
+        if (top_blob.empty())
+            return -100;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int p = 0; p < outc; p++)
+        {
+            // output channel p reads input channel (p % channels), starting at row (group % 2)
+            // and column (group / 2), where group = p / channels
+            const int group = p / channels;
+            const float* src = bottom_blob.channel(p % channels).row(group % 2) + (group / 2);
+            float* dst = top_blob.channel(p);
+
+            for (int i = 0; i < outh; i++)
+            {
+                // copy every second element, then advance by w more elements to skip a row
+                for (int j = 0; j < outw; j++, src += 2)
+                    *dst++ = *src;
+                src += w;
+            }
+        }
+
+        return 0;
+    }
+};
+
+DEFINE_LAYER_CREATOR(YoloV5Focus)
+#endif //YOLOV5_V60    YOLOV5_V62
+
+struct Object // one candidate or final detection
+{
+    cv::Rect_<float> rect; // box as x, y, width, height (padded-input pixels in proposals, source-image pixels after detect_yolov5)
+    int label; // index of the best-scoring class; draw_objects uses it to index class_names
+    float prob; // objectness multiplied by the best class probability
+};
+
+static inline float intersection_area(const Object& a, const Object& b)
+{
+    // operator& on the two rectangles gives their overlap; its area is the result
+    return (a.rect & b.rect).area();
+}
+
+// Quicksort of faceobjects[left..right] (both inclusive) into descending prob order, pivoting on the middle element.
+static void qsort_descent_inplace(std::vector<Object>& faceobjects, int left, int right)
+{
+    int lo = left;
+    int hi = right;
+    const float pivot = faceobjects[(left + right) / 2].prob;
+
+    while (lo <= hi)
+    {
+        // skip entries that are already on the correct side of the pivot
+        while (faceobjects[lo].prob > pivot)
+            ++lo;
+        while (faceobjects[hi].prob < pivot)
+            --hi;
+
+        if (lo > hi)
+            break;
+
+        std::swap(faceobjects[lo], faceobjects[hi]);
+        ++lo;
+        --hi;
+    }
+
+    // the two partitions do not overlap, so each is sorted in its own OpenMP section
+    #pragma omp parallel sections
+    {
+        #pragma omp section
+        {
+            if (left < hi) qsort_descent_inplace(faceobjects, left, hi);
+        }
+        #pragma omp section
+        {
+            if (lo < right) qsort_descent_inplace(faceobjects, lo, right);
+        }
+    }
+}
+
+// Convenience overload: sorts the whole vector by descending prob; an empty vector is left untouched.
+static void qsort_descent_inplace(std::vector<Object>& faceobjects)
+{
+    const bool has_elements = !faceobjects.empty();
+    if (has_elements)
+        qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1);
+}
+
+// Greedy non-maximum suppression over boxes visited in index order (callers sort by descending prob first).
+// Index i is appended to `picked` unless its box overlaps an already picked box of the same label
+// (or of any label when `agnostic` is true) with intersection-over-union above nms_threshold.
+static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
+{
+    picked.clear();
+
+    const int n = faceobjects.size();
+    std::vector<float> areas(n);
+    for (int i = 0; i < n; i++)
+        areas[i] = faceobjects[i].rect.area();
+
+    for (int i = 0; i < n; i++)
+    {
+        const Object& a = faceobjects[i];
+        bool keep = true;
+        for (int j = 0; j < (int)picked.size(); j++)
+        {
+            const Object& b = faceobjects[picked[j]];
+            if (!agnostic && a.label != b.label)
+                continue;
+
+            // intersection over union
+            float inter_area = intersection_area(a, b);
+            float union_area = areas[i] + areas[picked[j]] - inter_area;
+            if (inter_area / union_area > nms_threshold)
+            {
+                keep = false;
+                break; // one suppressing box is enough; later comparisons cannot change the outcome
+            }
+        }
+
+        if (keep)
+            picked.push_back(i);
+    }
+}
+
+static inline float sigmoid(float x) // logistic function: 1 / (1 + e^-x)
+{
+    return static_cast<float>(1.f / (1.f + exp(-x))); // cast narrows the result back to float
+}
+
+// Decodes one detection head: for every anchor and grid cell whose objectness and combined score both
+// reach prob_threshold, appends a box (in in_pad pixel coordinates) to `objects`. `anchors` stores (w, h)
+// pairs; each feat_blob row is read as x, y, w, h, objectness followed by the per-class scores.
+static void generate_proposals(const ncnn::Mat& anchors, int stride, const ncnn::Mat& in_pad, const ncnn::Mat& feat_blob, float prob_threshold, std::vector<Object>& objects)
+{
+    const int num_grid = feat_blob.h;
+
+    // the grid extent along the longer padded side comes from the stride; the other follows from num_grid
+    int num_grid_x;
+    int num_grid_y;
+    if (in_pad.w > in_pad.h)
+    {
+        num_grid_x = in_pad.w / stride;
+        num_grid_y = num_grid / num_grid_x;
+    }
+    else
+    {
+        num_grid_y = in_pad.h / stride;
+        num_grid_x = num_grid / num_grid_y;
+    }
+
+    const int num_class = feat_blob.w - 5;
+    const int num_anchors = anchors.w / 2;
+
+    for (int q = 0; q < num_anchors; q++)
+    {
+        const float anchor_w = anchors[q * 2];
+        const float anchor_h = anchors[q * 2 + 1];
+        const ncnn::Mat feat = feat_blob.channel(q);
+
+        for (int gy = 0; gy < num_grid_y; gy++)
+        {
+            for (int gx = 0; gx < num_grid_x; gx++)
+            {
+                const float* cell = feat.row(gy * num_grid_x + gx);
+                // reject on objectness alone before scanning the class scores
+                const float box_confidence = sigmoid(cell[4]);
+                if (!(box_confidence >= prob_threshold))
+                    continue;
+
+                // find class index with max class score
+                int class_index = 0;
+                float class_score = -FLT_MAX;
+                for (int k = 0; k < num_class; k++)
+                {
+                    const float score = cell[5 + k];
+                    if (score > class_score)
+                    {
+                        class_index = k;
+                        class_score = score;
+                    }
+                }
+
+                const float confidence = box_confidence * sigmoid(class_score);
+                if (!(confidence >= prob_threshold))
+                    continue;
+
+                // yolov5/models/yolo.py Detect forward
+                // y = x[i].sigmoid()
+                // y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i].to(x[i].device)) * self.stride[i]  # xy
+                // y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
+                const float dx = sigmoid(cell[0]);
+                const float dy = sigmoid(cell[1]);
+                const float dw = sigmoid(cell[2]);
+                const float dh = sigmoid(cell[3]);
+
+                const float pb_cx = (dx * 2.f - 0.5f + gx) * stride;
+                const float pb_cy = (dy * 2.f - 0.5f + gy) * stride;
+                const float pb_w = pow(dw * 2.f, 2) * anchor_w;
+                const float pb_h = pow(dh * 2.f, 2) * anchor_h;
+
+                const float x0 = pb_cx - pb_w * 0.5f;
+                const float y0 = pb_cy - pb_h * 0.5f;
+                const float x1 = pb_cx + pb_w * 0.5f;
+                const float y1 = pb_cy + pb_h * 0.5f;
+                Object obj;
+                obj.rect.x = x0;
+                obj.rect.y = y0;
+                obj.rect.width = x1 - x0;
+                obj.rect.height = y1 - y0;
+                obj.label = class_index;
+                obj.prob = confidence;
+                objects.push_back(obj);
+            }
+        }
+    }
+}
+
+static int detect_yolov5(const cv::Mat& bgr, std::vector<Object>& objects) // resize + pad, infer, decode 3 heads, NMS, map boxes back to bgr; returns 0
+{
+    ncnn::Net yolov5;
+    // NOTE: the model is loaded (relative file names) on every call; a load failure exits the process
+    yolov5.opt.use_vulkan_compute = true;
+    // yolov5.opt.use_bf16_storage = true;
+
+    // original pretrained model from https://github.com/ultralytics/yolov5
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+#if YOLOV5_V62
+    if (yolov5.load_param("yolov5s_6.2.param"))
+        exit(-1);
+    if (yolov5.load_model("yolov5s_6.2.bin"))
+        exit(-1);
+#elif YOLOV5_V60
+    if (yolov5.load_param("yolov5s_6.0.param"))
+        exit(-1);
+    if (yolov5.load_model("yolov5s_6.0.bin"))
+        exit(-1);
+#else
+    yolov5.register_custom_layer("YoloV5Focus", YoloV5Focus_layer_creator);
+
+    if (yolov5.load_param("yolov5s.param"))
+        exit(-1);
+    if (yolov5.load_model("yolov5s.bin"))
+        exit(-1);
+#endif
+
+    const int target_size = 640;
+    const float prob_threshold = 0.25f;
+    const float nms_threshold = 0.45f;
+
+    int img_w = bgr.cols;
+    int img_h = bgr.rows;
+
+    // scale the longer side to target_size and the other side by the same factor (the padding step follows below)
+    int w = img_w;
+    int h = img_h;
+    float scale = 1.f;
+    if (w > h)
+    {
+        scale = (float)target_size / w;
+        w = target_size;
+        h = h * scale;
+    }
+    else
+    {
+        scale = (float)target_size / h;
+        h = target_size;
+        w = w * scale;
+    }
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h);
+
+    // pad width and height up to the next multiple of MAX_STRIDE, split across both sides, using the constant 114
+    // yolov5/utils/datasets.py letterbox
+    int wpad = (w + MAX_STRIDE - 1) / MAX_STRIDE * MAX_STRIDE - w;
+    int hpad = (h + MAX_STRIDE - 1) / MAX_STRIDE * MAX_STRIDE - h;
+    ncnn::Mat in_pad;
+    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);
+
+    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
+    in_pad.substract_mean_normalize(0, norm_vals);
+
+    ncnn::Extractor ex = yolov5.create_extractor();
+
+    ex.input("images", in_pad);
+
+    std::vector<Object> proposals;
+
+    // anchor setting from yolov5/models/yolov5s.yaml
+    // each block below extracts one output blob and decodes it with that stride's three (w, h) anchors
+    // stride 8
+    {
+        ncnn::Mat out;
+        ex.extract("output", out);
+
+        ncnn::Mat anchors(6);
+        anchors[0] = 10.f;
+        anchors[1] = 13.f;
+        anchors[2] = 16.f;
+        anchors[3] = 30.f;
+        anchors[4] = 33.f;
+        anchors[5] = 23.f;
+
+        std::vector<Object> objects8;
+        generate_proposals(anchors, 8, in_pad, out, prob_threshold, objects8);
+
+        proposals.insert(proposals.end(), objects8.begin(), objects8.end());
+    }
+
+    // stride 16
+    {
+        ncnn::Mat out;
+
+#if YOLOV5_V62
+        ex.extract("353", out);
+#elif YOLOV5_V60
+        ex.extract("376", out);
+#else
+        ex.extract("781", out);
+#endif
+
+        ncnn::Mat anchors(6);
+        anchors[0] = 30.f;
+        anchors[1] = 61.f;
+        anchors[2] = 62.f;
+        anchors[3] = 45.f;
+        anchors[4] = 59.f;
+        anchors[5] = 119.f;
+
+        std::vector<Object> objects16;
+        generate_proposals(anchors, 16, in_pad, out, prob_threshold, objects16);
+
+        proposals.insert(proposals.end(), objects16.begin(), objects16.end());
+    }
+
+    // stride 32
+    {
+        ncnn::Mat out;
+#if YOLOV5_V62
+        ex.extract("367", out);
+#elif YOLOV5_V60
+        ex.extract("401", out);
+#else
+        ex.extract("801", out);
+#endif
+        ncnn::Mat anchors(6);
+        anchors[0] = 116.f;
+        anchors[1] = 90.f;
+        anchors[2] = 156.f;
+        anchors[3] = 198.f;
+        anchors[4] = 373.f;
+        anchors[5] = 326.f;
+
+        std::vector<Object> objects32;
+        generate_proposals(anchors, 32, in_pad, out, prob_threshold, objects32);
+
+        proposals.insert(proposals.end(), objects32.begin(), objects32.end());
+    }
+
+    // sort all proposals by score from highest to lowest
+    qsort_descent_inplace(proposals);
+
+    // apply nms with nms_threshold
+    std::vector<int> picked;
+    nms_sorted_bboxes(proposals, picked, nms_threshold);
+
+    int count = picked.size();
+    // copy the surviving proposals out and convert them from padded-input to source-image coordinates
+    objects.resize(count);
+    for (int i = 0; i < count; i++)
+    {
+        objects[i] = proposals[picked[i]];
+
+        // adjust offset to original unpadded
+        float x0 = (objects[i].rect.x - (wpad / 2)) / scale;
+        float y0 = (objects[i].rect.y - (hpad / 2)) / scale;
+        float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale;
+        float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale;
+
+        // clip
+        x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
+        y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
+        x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
+        y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);
+
+        objects[i].rect.x = x0;
+        objects[i].rect.y = y0;
+        objects[i].rect.width = x1 - x0;
+        objects[i].rect.height = y1 - y0;
+    }
+
+    return 0;
+}
+
+// Logs each detection to stderr, draws its box and "<class> <score>%" caption on a copy of bgr, then shows it.
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    static const char* class_names[] = {
+        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
+        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
+        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
+        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
+        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
+        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
+        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
+        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
+        "hair drier", "toothbrush"
+    };
+    const int num_classes = (int)(sizeof(class_names) / sizeof(class_names[0]));
+
+    cv::Mat image = bgr.clone();
+    for (size_t i = 0; i < objects.size(); i++)
+    {
+        const Object& obj = objects[i];
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));
+
+        // a model with more classes than this table would index past class_names, so fall back to a placeholder
+        const char* name = (obj.label >= 0 && obj.label < num_classes) ? class_names[obj.label] : "unknown";
+        char text[256];
+        sprintf(text, "%s %.1f%%", name, obj.prob * 100);
+        int baseLine = 0;
+        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+        // the caption goes just above the box, clamped to the top and right image edges
+        int x = obj.rect.x;
+        int y = obj.rect.y - label_size.height - baseLine;
+        if (y < 0)
+            y = 0;
+        if (x + label_size.width > image.cols)
+            x = image.cols - label_size.width;
+
+        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                      cv::Scalar(255, 255, 255), -1);
+        cv::putText(image, text, cv::Point(x, y + label_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+int main(int argc, char** argv) // takes one argument: the path of the image to analyse
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    // load the requested image
+    const char* const image_path = argv[1];
+    cv::Mat picture = cv::imread(image_path, 1);
+    const bool load_failed = picture.empty();
+    if (load_failed)
+    {
+        fprintf(stderr, "cv::imread %s failed\n", image_path);
+        return -1;
+    }
+
+    // detect, then draw; the status returned by detect_yolov5 is not inspected
+    std::vector<Object> found;
+    detect_yolov5(picture, found);
+    draw_objects(picture, found);
+    return 0;
+}
diff --git a/examples/yolov5_pnnx.cpp b/examples/yolov5_pnnx.cpp
new file mode 100644
index 0000000..5d01903
--- /dev/null
+++ b/examples/yolov5_pnnx.cpp
@@ -0,0 +1,429 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "layer.h"
+#include "net.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <float.h>
+#include <stdio.h>
+#include <vector>
+
+// One detection candidate / result.
+struct Object
+{
+    // bounding box as x, y, width, height; generate_proposals fills it in
+    // padded-input pixels and detect_yolov5 later maps it back to the source image
+    cv::Rect_<float> rect;
+    // index of the best-scoring class; draw_objects uses it to index class_names
+    int label;
+    // sigmoid(objectness) * sigmoid(class score); used for sorting and thresholding
+    float prob;
+};
+
+// Area of the overlap between the bounding boxes of a and b, taken from the
+// rectangle produced by applying operator& to the two rects.
+static inline float intersection_area(const Object& a, const Object& b)
+{
+    return (a.rect & b.rect).area();
+}
+
+// Sorts faceobjects[left..right] (both bounds inclusive) in place so that prob
+// is in descending order.
+// The range is partitioned around the prob of its middle element; the two
+// resulting sub-ranges are then sorted recursively, each inside its own
+// OpenMP section.
+static void qsort_descent_inplace(std::vector<Object>& faceobjects, int left, int right)
+{
+    const float pivot = faceobjects[(left + right) / 2].prob;
+    int lo = left;
+    int hi = right;
+
+    while (lo <= hi)
+    {
+        // advance past elements that already sit on the correct side of the pivot
+        for (; faceobjects[lo].prob > pivot; lo++)
+        {
+        }
+        for (; faceobjects[hi].prob < pivot; hi--)
+        {
+        }
+
+        if (lo > hi)
+            break;
+
+        std::swap(faceobjects[lo], faceobjects[hi]);
+        lo++;
+        hi--;
+    }
+
+    #pragma omp parallel sections
+    {
+        #pragma omp section
+        {
+            // left part: elements with prob >= pivot
+            if (left < hi) qsort_descent_inplace(faceobjects, left, hi);
+        }
+        #pragma omp section
+        {
+            // right part: elements with prob <= pivot
+            if (lo < right) qsort_descent_inplace(faceobjects, lo, right);
+        }
+    }
+}
+
+// Sorts the whole vector by descending prob. An empty vector is left untouched.
+static void qsort_descent_inplace(std::vector<Object>& faceobjects)
+{
+    if (!faceobjects.empty())
+    {
+        const int last_index = faceobjects.size() - 1;
+        qsort_descent_inplace(faceobjects, 0, last_index);
+    }
+}
+
+// Greedy non-maximum suppression.
+// Boxes are visited in index order, so earlier entries take priority; the caller
+// in this file sorts them by descending prob first.
+//   faceobjects   - candidate boxes
+//   picked        - output: indices into faceobjects of the boxes that are kept
+//                   (cleared on entry)
+//   nms_threshold - a candidate is dropped when its IoU with an already kept box
+//                   is greater than this value
+//   agnostic      - when false (default) only boxes with the same label can
+//                   suppress each other; when true labels are ignored
+static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
+{
+    picked.clear();
+
+    const int n = faceobjects.size();
+
+    // box areas are needed for every IoU, so compute them once
+    std::vector<float> areas(n);
+    for (int i = 0; i < n; i++)
+    {
+        areas[i] = faceobjects[i].rect.area();
+    }
+
+    for (int i = 0; i < n; i++)
+    {
+        const Object& a = faceobjects[i];
+
+        bool keep = true;
+        for (int j = 0; j < (int)picked.size(); j++)
+        {
+            const int kept_index = picked[j];
+            const Object& b = faceobjects[kept_index];
+
+            if (!agnostic && a.label != b.label)
+                continue;
+
+            // intersection over union
+            float inter_area = intersection_area(a, b);
+            float union_area = areas[i] + areas[kept_index] - inter_area;
+            // float IoU = inter_area / union_area
+            if (inter_area / union_area > nms_threshold)
+            {
+                // one overlapping kept box is enough to reject the candidate,
+                // so the remaining kept boxes do not need to be checked
+                keep = false;
+                break;
+            }
+        }
+
+        if (keep)
+            picked.push_back(i);
+    }
+}
+
+// Logistic function 1 / (1 + e^-x); the result is narrowed to float.
+static inline float sigmoid(float x)
+{
+    return static_cast<float>(1.f / (exp(-x) + 1.f));
+}
+
+// Decodes one detection head into candidate boxes and appends them to objects.
+//   anchors        - flat list of (width, height) pairs; anchors.w / 2 anchors are read
+//   stride         - multiplier that turns grid-cell coordinates into input pixels
+//   in_pad         - not read by this function
+//   feat_blob      - head output of size w x h; the channels are grouped per
+//                    anchor as [x, y, w, h, objectness, class scores...]
+//   prob_threshold - minimum sigmoid(objectness) * sigmoid(best class score)
+//   objects        - output, appended to
+static void generate_proposals(const ncnn::Mat& anchors, int stride, const ncnn::Mat& in_pad, const ncnn::Mat& feat_blob, float prob_threshold, std::vector<Object>& objects)
+{
+    const int grid_w = feat_blob.w;
+    const int grid_h = feat_blob.h;
+
+    const int num_anchors = anchors.w / 2;
+    const int num_class = feat_blob.c / num_anchors - 5;
+
+    // number of channels that belong to one anchor
+    const int channels_per_anchor = num_class + 5;
+
+    for (int q = 0; q < num_anchors; q++)
+    {
+        const int base = q * channels_per_anchor;
+
+        const float anchor_w = anchors[q * 2];
+        const float anchor_h = anchors[q * 2 + 1];
+
+        for (int gy = 0; gy < grid_h; gy++)
+        {
+            for (int gx = 0; gx < grid_w; gx++)
+            {
+                // pick the class with the highest raw score
+                int best_class = 0;
+                float best_score = -FLT_MAX;
+                for (int k = 0; k < num_class; k++)
+                {
+                    const float score = feat_blob.channel(base + 5 + k).row(gy)[gx];
+                    if (score > best_score)
+                    {
+                        best_score = score;
+                        best_class = k;
+                    }
+                }
+
+                const float box_score = feat_blob.channel(base + 4).row(gy)[gx];
+                const float confidence = sigmoid(box_score) * sigmoid(best_score);
+
+                if (!(confidence >= prob_threshold))
+                    continue;
+
+                // yolov5/models/yolo.py Detect forward
+                // y = x[i].sigmoid()
+                // y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i].to(x[i].device)) * self.stride[i]  # xy
+                // y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
+                float dx = sigmoid(feat_blob.channel(base + 0).row(gy)[gx]);
+                float dy = sigmoid(feat_blob.channel(base + 1).row(gy)[gx]);
+                float dw = sigmoid(feat_blob.channel(base + 2).row(gy)[gx]);
+                float dh = sigmoid(feat_blob.channel(base + 3).row(gy)[gx]);
+
+                // box centre and size in input pixels
+                float pb_cx = (dx * 2.f - 0.5f + gx) * stride;
+                float pb_cy = (dy * 2.f - 0.5f + gy) * stride;
+                float pb_w = pow(dw * 2.f, 2) * anchor_w;
+                float pb_h = pow(dh * 2.f, 2) * anchor_h;
+
+                // corners of the box
+                float x0 = pb_cx - pb_w * 0.5f;
+                float y0 = pb_cy - pb_h * 0.5f;
+                float x1 = pb_cx + pb_w * 0.5f;
+                float y1 = pb_cy + pb_h * 0.5f;
+
+                Object candidate;
+                candidate.rect.x = x0;
+                candidate.rect.y = y0;
+                candidate.rect.width = x1 - x0;
+                candidate.rect.height = y1 - y0;
+                candidate.label = best_class;
+                candidate.prob = confidence;
+
+                objects.push_back(candidate);
+            }
+        }
+    }
+}
+
+// Runs YOLOv5s detection on a BGR image.
+//   bgr     - input image; its data pointer, cols and rows are read
+//   objects - output; resized to the number of boxes that survive NMS, with the
+//             rects mapped back to the coordinates of bgr and clipped to it
+// Always returns 0. If the param or model file fails to load, the whole process
+// is terminated with exit(-1) instead of returning an error.
+// The network is constructed and loaded from disk on every call.
+static int detect_yolov5(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    ncnn::Net yolov5;
+
+    yolov5.opt.use_vulkan_compute = true;
+    // yolov5.opt.use_bf16_storage = true;
+
+    // original pretrained model from https://github.com/ultralytics/yolov5
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+    // a non-zero result from either load call is treated as failure
+    if (yolov5.load_param("yolov5s.ncnn.param"))
+        exit(-1);
+    if (yolov5.load_model("yolov5s.ncnn.bin"))
+        exit(-1);
+
+    const int target_size = 640;
+    const float prob_threshold = 0.25f;
+    const float nms_threshold = 0.45f;
+
+    int img_w = bgr.cols;
+    int img_h = bgr.rows;
+
+    // yolov5/models/common.py DetectMultiBackend
+    const int max_stride = 64;
+
+    // resize so that the longer side becomes target_size while keeping the
+    // aspect ratio; the shorter side is truncated to int by the assignment
+    int w = img_w;
+    int h = img_h;
+    float scale = 1.f;
+    if (w > h)
+    {
+        scale = (float)target_size / w;
+        w = target_size;
+        h = h * scale;
+    }
+    else
+    {
+        scale = (float)target_size / h;
+        h = target_size;
+        w = w * scale;
+    }
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h);
+
+    // letterbox: pad width and height up to the next multiple of max_stride,
+    // half of the padding before and the rest after, filled with 114
+    // yolov5/utils/datasets.py letterbox
+    int wpad = (w + max_stride - 1) / max_stride * max_stride - w;
+    int hpad = (h + max_stride - 1) / max_stride * max_stride - h;
+    ncnn::Mat in_pad;
+    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);
+
+    // no mean subtraction (0), only scaling of every channel by 1/255
+    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
+    in_pad.substract_mean_normalize(0, norm_vals);
+
+    ncnn::Extractor ex = yolov5.create_extractor();
+
+    ex.input("in0", in_pad);
+
+    // candidates from all three heads are collected here before sorting and NMS
+    std::vector<Object> proposals;
+
+    // anchor setting from yolov5/models/yolov5s.yaml
+    // each anchors Mat holds three (width, height) pairs
+    // NOTE(review): the return values of ex.extract are not checked
+
+    // stride 8
+    {
+        ncnn::Mat out;
+        ex.extract("out0", out);
+
+        ncnn::Mat anchors(6);
+        anchors[0] = 10.f;
+        anchors[1] = 13.f;
+        anchors[2] = 16.f;
+        anchors[3] = 30.f;
+        anchors[4] = 33.f;
+        anchors[5] = 23.f;
+
+        std::vector<Object> objects8;
+        generate_proposals(anchors, 8, in_pad, out, prob_threshold, objects8);
+
+        proposals.insert(proposals.end(), objects8.begin(), objects8.end());
+    }
+
+    // stride 16
+    {
+        ncnn::Mat out;
+        ex.extract("out1", out);
+
+        ncnn::Mat anchors(6);
+        anchors[0] = 30.f;
+        anchors[1] = 61.f;
+        anchors[2] = 62.f;
+        anchors[3] = 45.f;
+        anchors[4] = 59.f;
+        anchors[5] = 119.f;
+
+        std::vector<Object> objects16;
+        generate_proposals(anchors, 16, in_pad, out, prob_threshold, objects16);
+
+        proposals.insert(proposals.end(), objects16.begin(), objects16.end());
+    }
+
+    // stride 32
+    {
+        ncnn::Mat out;
+        ex.extract("out2", out);
+
+        ncnn::Mat anchors(6);
+        anchors[0] = 116.f;
+        anchors[1] = 90.f;
+        anchors[2] = 156.f;
+        anchors[3] = 198.f;
+        anchors[4] = 373.f;
+        anchors[5] = 326.f;
+
+        std::vector<Object> objects32;
+        generate_proposals(anchors, 32, in_pad, out, prob_threshold, objects32);
+
+        proposals.insert(proposals.end(), objects32.begin(), objects32.end());
+    }
+
+    // sort all proposals by score from highest to lowest
+    qsort_descent_inplace(proposals);
+
+    // apply nms with nms_threshold
+    // picked receives indices into proposals
+    std::vector<int> picked;
+    nms_sorted_bboxes(proposals, picked, nms_threshold);
+
+    int count = picked.size();
+
+    objects.resize(count);
+    for (int i = 0; i < count; i++)
+    {
+        objects[i] = proposals[picked[i]];
+
+        // adjust offset to original unpadded
+        // remove the leading padding, then undo the resize scale
+        float x0 = (objects[i].rect.x - (wpad / 2)) / scale;
+        float y0 = (objects[i].rect.y - (hpad / 2)) / scale;
+        float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale;
+        float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale;
+
+        // clip
+        // both corners are limited to [0, img_w - 1] x [0, img_h - 1]
+        x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
+        y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
+        x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
+        y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);
+
+        objects[i].rect.x = x0;
+        objects[i].rect.y = y0;
+        objects[i].rect.width = x1 - x0;
+        objects[i].rect.height = y1 - y0;
+    }
+
+    return 0;
+}
+
+// Reports and visualises detections.
+// For every entry of objects this prints label, prob and box to stderr,
+// outlines the box on a copy of bgr and draws a "<class name> <prob>%" caption
+// above it; the caption is moved back inside the image when it would stick out
+// at the top or at the right edge.
+// The annotated copy is shown in a window named "image" and the call then
+// blocks in cv::waitKey(0).
+// obj.label indexes class_names without a range check.
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    static const char* class_names[] = {
+        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
+        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
+        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
+        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
+        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
+        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
+        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
+        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
+        "hair drier", "toothbrush"
+    };
+
+    cv::Mat image = bgr.clone();
+
+    for (size_t idx = 0; idx < objects.size(); idx++)
+    {
+        const Object& obj = objects[idx];
+
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+
+        // box outline
+        cv::rectangle(image, obj.rect, cv::Scalar(255, 0, 0));
+
+        char caption[256];
+        sprintf(caption, "%s %.1f%%", class_names[obj.label], obj.prob * 100);
+
+        int baseline = 0;
+        const cv::Size caption_size = cv::getTextSize(caption, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseline);
+
+        // the float box coordinates are truncated to int here
+        int caption_x = obj.rect.x;
+        int caption_y = obj.rect.y - caption_size.height - baseline;
+        caption_y = (caption_y < 0) ? 0 : caption_y;
+        if (caption_x + caption_size.width > image.cols)
+            caption_x = image.cols - caption_size.width;
+
+        // filled white background behind the caption
+        const cv::Rect caption_box(cv::Point(caption_x, caption_y), cv::Size(caption_size.width, caption_size.height + baseline));
+        cv::rectangle(image, caption_box, cv::Scalar(255, 255, 255), -1);
+
+        cv::putText(image, caption, cv::Point(caption_x, caption_y + caption_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+// Program entry point.
+// Expects exactly one argument, the path of the image to analyse. The image is
+// read with cv::imread, passed to detect_yolov5 and the detections are handed
+// to draw_objects.
+// Returns 0 on success and -1 on a usage error, when the image cannot be read
+// or when detect_yolov5 reports a non-zero status.
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* imagepath = argv[1];
+
+    cv::Mat m = cv::imread(imagepath, 1);
+    if (m.empty())
+    {
+        // imread signals failure through an empty matrix
+        fprintf(stderr, "cv::imread %s failed\n", imagepath);
+        return -1;
+    }
+
+    std::vector<Object> objects;
+    // do not display results when detection itself reported a failure
+    if (detect_yolov5(m, objects) != 0)
+    {
+        fprintf(stderr, "detect_yolov5 failed\n");
+        return -1;
+    }
+
+    draw_objects(m, objects);
+
+    return 0;
+}
diff --git a/examples/yolov7.cpp b/examples/yolov7.cpp
new file mode 100644
index 0000000..7898185
--- /dev/null
+++ b/examples/yolov7.cpp
@@ -0,0 +1,461 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "layer.h"
+#include "net.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <float.h>
+#include <stdio.h>
+#include <vector>
+
+#define MAX_STRIDE 32
+
+// One detection candidate / result.
+struct Object
+{
+    // bounding box as x, y, width, height; generate_proposals fills it in
+    // padded-input pixels and detect_yolov7 later maps it back to the source image
+    cv::Rect_<float> rect;
+    // index of the best-scoring class; draw_objects uses it to index class_names
+    int label;
+    // sigmoid(objectness) * sigmoid(class score); used for sorting and thresholding
+    float prob;
+};
+
+// Area of the overlap between the bounding boxes of a and b, taken from the
+// rectangle produced by applying operator& to the two rects.
+static inline float intersection_area(const Object& a, const Object& b)
+{
+    return (a.rect & b.rect).area();
+}
+
+// Sorts objects[left..right] (both bounds inclusive) in place so that prob is
+// in descending order.
+// The range is partitioned around the prob of its middle element; the two
+// resulting sub-ranges are then sorted recursively, each inside its own
+// OpenMP section.
+static void qsort_descent_inplace(std::vector<Object>& objects, int left, int right)
+{
+    const float pivot = objects[(left + right) / 2].prob;
+    int lo = left;
+    int hi = right;
+
+    while (lo <= hi)
+    {
+        // advance past elements that already sit on the correct side of the pivot
+        for (; objects[lo].prob > pivot; lo++)
+        {
+        }
+        for (; objects[hi].prob < pivot; hi--)
+        {
+        }
+
+        if (lo > hi)
+            break;
+
+        std::swap(objects[lo], objects[hi]);
+        lo++;
+        hi--;
+    }
+
+    #pragma omp parallel sections
+    {
+        #pragma omp section
+        {
+            // left part: elements with prob >= pivot
+            if (left < hi) qsort_descent_inplace(objects, left, hi);
+        }
+        #pragma omp section
+        {
+            // right part: elements with prob <= pivot
+            if (lo < right) qsort_descent_inplace(objects, lo, right);
+        }
+    }
+}
+
+// Sorts the whole vector by descending prob. An empty vector is left untouched.
+static void qsort_descent_inplace(std::vector<Object>& objects)
+{
+    if (!objects.empty())
+    {
+        const int last_index = objects.size() - 1;
+        qsort_descent_inplace(objects, 0, last_index);
+    }
+}
+
+// Greedy non-maximum suppression.
+// Boxes are visited in index order, so earlier entries take priority; the caller
+// in this file sorts them by descending prob first.
+//   faceobjects   - candidate boxes
+//   picked        - output: indices into faceobjects of the boxes that are kept
+//                   (cleared on entry)
+//   nms_threshold - a candidate is dropped when its IoU with an already kept box
+//                   is greater than this value
+//   agnostic      - when false (default) only boxes with the same label can
+//                   suppress each other; when true labels are ignored
+static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
+{
+    picked.clear();
+
+    const int n = faceobjects.size();
+
+    // box areas are needed for every IoU, so compute them once
+    std::vector<float> areas(n);
+    for (int i = 0; i < n; i++)
+    {
+        areas[i] = faceobjects[i].rect.area();
+    }
+
+    for (int i = 0; i < n; i++)
+    {
+        const Object& a = faceobjects[i];
+
+        bool keep = true;
+        for (int j = 0; j < (int)picked.size(); j++)
+        {
+            const int kept_index = picked[j];
+            const Object& b = faceobjects[kept_index];
+
+            if (!agnostic && a.label != b.label)
+                continue;
+
+            // intersection over union
+            float inter_area = intersection_area(a, b);
+            float union_area = areas[i] + areas[kept_index] - inter_area;
+            // float IoU = inter_area / union_area
+            if (inter_area / union_area > nms_threshold)
+            {
+                // one overlapping kept box is enough to reject the candidate,
+                // so the remaining kept boxes do not need to be checked
+                keep = false;
+                break;
+            }
+        }
+
+        if (keep)
+            picked.push_back(i);
+    }
+}
+
+// Logistic function 1 / (1 + e^-x); the result is narrowed to float.
+static inline float sigmoid(float x)
+{
+    return static_cast<float>(1.f / (exp(-x) + 1.f));
+}
+
+// Decodes one detection head into candidate boxes and appends them to objects.
+//   anchors        - flat list of (width, height) pairs; anchors.w / 2 anchors are read
+//   stride         - size of one grid cell in pixels of in_pad
+//   in_pad         - padded network input; only its w and h are read, to recover
+//                    the grid shape from the flattened cell count
+//   feat_blob      - one channel per anchor, one row per grid cell; every row is
+//                    [x, y, w, h, objectness, class scores...]
+//   prob_threshold - lower bound for sigmoid(objectness) and for
+//                    sigmoid(objectness) * sigmoid(best class score)
+//   objects        - output, appended to
+static void generate_proposals(const ncnn::Mat& anchors, int stride, const ncnn::Mat& in_pad, const ncnn::Mat& feat_blob, float prob_threshold, std::vector<Object>& objects)
+{
+    // feat_blob.h is the total number of grid cells (rows * columns)
+    const int cell_count = feat_blob.h;
+
+    // derive the longer grid side from the input size, the other from the cell count
+    int grid_w;
+    int grid_h;
+    const bool wider_than_tall = in_pad.w > in_pad.h;
+    if (wider_than_tall)
+    {
+        grid_w = in_pad.w / stride;
+        grid_h = cell_count / grid_w;
+    }
+    else
+    {
+        grid_h = in_pad.h / stride;
+        grid_w = cell_count / grid_h;
+    }
+
+    const int num_class = feat_blob.w - 5;
+    const int num_anchors = anchors.w / 2;
+
+    for (int q = 0; q < num_anchors; q++)
+    {
+        const float anchor_w = anchors[q * 2];
+        const float anchor_h = anchors[q * 2 + 1];
+
+        const ncnn::Mat feat = feat_blob.channel(q);
+
+        for (int gy = 0; gy < grid_h; gy++)
+        {
+            for (int gx = 0; gx < grid_w; gx++)
+            {
+                const float* cell = feat.row(gy * grid_w + gx);
+
+                // cheap rejection on objectness alone before scanning the classes
+                const float box_confidence = sigmoid(cell[4]);
+                if (!(box_confidence >= prob_threshold))
+                    continue;
+
+                // pick the class with the highest raw score
+                int best_class = 0;
+                float best_score = -FLT_MAX;
+                for (int k = 0; k < num_class; k++)
+                {
+                    const float score = cell[5 + k];
+                    if (score > best_score)
+                    {
+                        best_score = score;
+                        best_class = k;
+                    }
+                }
+
+                const float confidence = box_confidence * sigmoid(best_score);
+                if (!(confidence >= prob_threshold))
+                    continue;
+
+                float dx = sigmoid(cell[0]);
+                float dy = sigmoid(cell[1]);
+                float dw = sigmoid(cell[2]);
+                float dh = sigmoid(cell[3]);
+
+                // box centre and size in pixels of in_pad
+                float pb_cx = (dx * 2.f - 0.5f + gx) * stride;
+                float pb_cy = (dy * 2.f - 0.5f + gy) * stride;
+                float pb_w = pow(dw * 2.f, 2) * anchor_w;
+                float pb_h = pow(dh * 2.f, 2) * anchor_h;
+
+                // corners of the box
+                float x0 = pb_cx - pb_w * 0.5f;
+                float y0 = pb_cy - pb_h * 0.5f;
+                float x1 = pb_cx + pb_w * 0.5f;
+                float y1 = pb_cy + pb_h * 0.5f;
+
+                Object candidate;
+                candidate.rect.x = x0;
+                candidate.rect.y = y0;
+                candidate.rect.width = x1 - x0;
+                candidate.rect.height = y1 - y0;
+                candidate.label = best_class;
+                candidate.prob = confidence;
+
+                objects.push_back(candidate);
+            }
+        }
+    }
+}
+
+// Runs YOLOv7-tiny detection on a BGR image.
+//   bgr     - input image; its data pointer, cols and rows are read
+//   objects - output; resized to the number of boxes that survive NMS, with the
+//             rects mapped back to the coordinates of bgr and clipped to it
+// Returns 0 on success and -1 when the param or model file cannot be loaded
+// (objects is left untouched in that case).
+// The network is constructed and loaded from disk on every call.
+static int detect_yolov7(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    ncnn::Net yolov7;
+
+    yolov7.opt.use_vulkan_compute = true;
+    // yolov7.opt.use_bf16_storage = true;
+
+    // original pretrained model from https://github.com/WongKinYiu/yolov7
+    // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
+    // a non-zero result from either load call means the model is unusable;
+    // stop here instead of running inference on a net that was never loaded
+    if (yolov7.load_param("yolov7-tiny.param"))
+        return -1;
+    if (yolov7.load_model("yolov7-tiny.bin"))
+        return -1;
+
+    const int target_size = 640;
+    const float prob_threshold = 0.25f;
+    const float nms_threshold = 0.45f;
+
+    int img_w = bgr.cols;
+    int img_h = bgr.rows;
+
+    // resize so that the longer side becomes target_size while keeping the
+    // aspect ratio; the shorter side is truncated to int by the assignment
+    int w = img_w;
+    int h = img_h;
+    float scale = 1.f;
+    if (w > h)
+    {
+        scale = (float)target_size / w;
+        w = target_size;
+        h = h * scale;
+    }
+    else
+    {
+        scale = (float)target_size / h;
+        h = target_size;
+        w = w * scale;
+    }
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h);
+
+    // letterbox: pad width and height up to the next multiple of MAX_STRIDE,
+    // half of the padding before and the rest after, filled with 114
+    int wpad = (w + MAX_STRIDE - 1) / MAX_STRIDE * MAX_STRIDE - w;
+    int hpad = (h + MAX_STRIDE - 1) / MAX_STRIDE * MAX_STRIDE - h;
+    ncnn::Mat in_pad;
+    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);
+
+    // no mean subtraction (0), only scaling of every channel by 1/255
+    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
+    in_pad.substract_mean_normalize(0, norm_vals);
+
+    ncnn::Extractor ex = yolov7.create_extractor();
+
+    ex.input("images", in_pad);
+
+    // candidates from all three heads are collected here before sorting and NMS
+    std::vector<Object> proposals;
+
+    // each anchors Mat below holds three (width, height) pairs
+
+    // stride 8
+    {
+        ncnn::Mat out;
+        ex.extract("output", out);
+
+        ncnn::Mat anchors(6);
+        anchors[0] = 12.f;
+        anchors[1] = 16.f;
+        anchors[2] = 19.f;
+        anchors[3] = 36.f;
+        anchors[4] = 40.f;
+        anchors[5] = 28.f;
+
+        std::vector<Object> objects8;
+        generate_proposals(anchors, 8, in_pad, out, prob_threshold, objects8);
+
+        proposals.insert(proposals.end(), objects8.begin(), objects8.end());
+    }
+
+    // stride 16
+    {
+        ncnn::Mat out;
+
+        ex.extract("288", out);
+
+        ncnn::Mat anchors(6);
+        anchors[0] = 36.f;
+        anchors[1] = 75.f;
+        anchors[2] = 76.f;
+        anchors[3] = 55.f;
+        anchors[4] = 72.f;
+        anchors[5] = 146.f;
+
+        std::vector<Object> objects16;
+        generate_proposals(anchors, 16, in_pad, out, prob_threshold, objects16);
+
+        proposals.insert(proposals.end(), objects16.begin(), objects16.end());
+    }
+
+    // stride 32
+    {
+        ncnn::Mat out;
+
+        ex.extract("302", out);
+
+        ncnn::Mat anchors(6);
+        anchors[0] = 142.f;
+        anchors[1] = 110.f;
+        anchors[2] = 192.f;
+        anchors[3] = 243.f;
+        anchors[4] = 459.f;
+        anchors[5] = 401.f;
+
+        std::vector<Object> objects32;
+        generate_proposals(anchors, 32, in_pad, out, prob_threshold, objects32);
+
+        proposals.insert(proposals.end(), objects32.begin(), objects32.end());
+    }
+
+    // sort all proposals by score from highest to lowest
+    qsort_descent_inplace(proposals);
+
+    // apply nms with nms_threshold; picked receives indices into proposals
+    std::vector<int> picked;
+    nms_sorted_bboxes(proposals, picked, nms_threshold);
+
+    int count = picked.size();
+
+    objects.resize(count);
+    for (int i = 0; i < count; i++)
+    {
+        objects[i] = proposals[picked[i]];
+
+        // adjust offset to original unpadded:
+        // remove the leading padding, then undo the resize scale
+        float x0 = (objects[i].rect.x - (wpad / 2)) / scale;
+        float y0 = (objects[i].rect.y - (hpad / 2)) / scale;
+        float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale;
+        float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale;
+
+        // clip both corners to [0, img_w - 1] x [0, img_h - 1]
+        x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
+        y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
+        x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
+        y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);
+
+        objects[i].rect.x = x0;
+        objects[i].rect.y = y0;
+        objects[i].rect.width = x1 - x0;
+        objects[i].rect.height = y1 - y0;
+    }
+
+    return 0;
+}
+
+// Reports and visualises detections.
+// For every entry of objects this prints label, prob and box to stderr,
+// outlines the box on a copy of bgr and draws a "<class name> <prob>%" caption
+// above it; the caption is moved back inside the image when it would stick out
+// at the top or at the right edge. Box and caption background share one colour
+// that cycles through the 19-entry colors table by object position.
+// The annotated copy is shown in a window named "image" and the call then
+// blocks in cv::waitKey(0).
+// obj.label indexes class_names without a range check.
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    static const char* class_names[] = {
+        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
+        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
+        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
+        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
+        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
+        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
+        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
+        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
+        "hair drier", "toothbrush"
+    };
+
+    static const unsigned char colors[19][3] = {
+        {54, 67, 244},
+        {99, 30, 233},
+        {176, 39, 156},
+        {183, 58, 103},
+        {181, 81, 63},
+        {243, 150, 33},
+        {244, 169, 3},
+        {212, 188, 0},
+        {136, 150, 0},
+        {80, 175, 76},
+        {74, 195, 139},
+        {57, 220, 205},
+        {59, 235, 255},
+        {7, 193, 255},
+        {0, 152, 255},
+        {34, 87, 255},
+        {72, 85, 121},
+        {158, 158, 158},
+        {139, 125, 96}
+    };
+
+    cv::Mat image = bgr.clone();
+
+    for (size_t idx = 0; idx < objects.size(); idx++)
+    {
+        const Object& obj = objects[idx];
+
+        // the n-th object gets palette entry n modulo 19
+        const unsigned char* palette_entry = colors[idx % 19];
+        cv::Scalar cc(palette_entry[0], palette_entry[1], palette_entry[2]);
+
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+
+        // box outline, 2 pixels thick
+        cv::rectangle(image, obj.rect, cc, 2);
+
+        char caption[256];
+        sprintf(caption, "%s %.1f%%", class_names[obj.label], obj.prob * 100);
+
+        int baseline = 0;
+        const cv::Size caption_size = cv::getTextSize(caption, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseline);
+
+        // the float box coordinates are truncated to int here
+        int caption_x = obj.rect.x;
+        int caption_y = obj.rect.y - caption_size.height - baseline;
+        caption_y = (caption_y < 0) ? 0 : caption_y;
+        if (caption_x + caption_size.width > image.cols)
+            caption_x = image.cols - caption_size.width;
+
+        // filled background behind the caption, in the box colour
+        const cv::Rect caption_box(cv::Point(caption_x, caption_y), cv::Size(caption_size.width, caption_size.height + baseline));
+        cv::rectangle(image, caption_box, cc, -1);
+
+        cv::putText(image, caption, cv::Point(caption_x, caption_y + caption_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(255, 255, 255));
+    }
+
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+// Program entry point.
+// Expects exactly one argument, the path of the image to analyse. The image is
+// read with cv::imread, passed to detect_yolov7 and the detections are handed
+// to draw_objects.
+// Returns 0 on success and -1 on a usage error, when the image cannot be read
+// or when detect_yolov7 reports a non-zero status.
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* imagepath = argv[1];
+
+    cv::Mat m = cv::imread(imagepath, 1);
+    if (m.empty())
+    {
+        // imread signals failure through an empty matrix
+        fprintf(stderr, "cv::imread %s failed\n", imagepath);
+        return -1;
+    }
+
+    std::vector<Object> objects;
+    // do not display results when detection itself reported a failure
+    if (detect_yolov7(m, objects) != 0)
+    {
+        fprintf(stderr, "detect_yolov7 failed\n");
+        return -1;
+    }
+
+    draw_objects(m, objects);
+
+    return 0;
+}
diff --git a/examples/yolov7_pnnx.cpp b/examples/yolov7_pnnx.cpp
new file mode 100644
index 0000000..3dc7b41
--- /dev/null
+++ b/examples/yolov7_pnnx.cpp
@@ -0,0 +1,428 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "layer.h"
+#include "net.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <float.h>
+#include <stdio.h>
+#include <vector>
+
+// One detection produced by the example.
+struct Object
+{
+    cv::Rect_<float> rect; // bounding box as x, y, width, height
+    int label;             // class index; draw_objects uses it to index class_names
+    float prob;            // confidence score; proposals are sorted and filtered on it
+};
+
+// Area of the overlap between the boxes of two detections, i.e. the area of the
+// rectangle produced by applying operator& to the two rects.
+static inline float intersection_area(const Object& a, const Object& b)
+{
+    return (a.rect & b.rect).area();
+}
+
+// Sorts faceobjects[left..right] (both bounds inclusive) in place by descending prob.
+// The range is partitioned around the prob of its middle element, then the two
+// remaining sub-ranges are sorted recursively, each inside its own OpenMP section.
+static void qsort_descent_inplace(std::vector<Object>& faceobjects, int left, int right)
+{
+    const float pivot = faceobjects[(left + right) / 2].prob;
+
+    int lo = left;
+    int hi = right;
+    while (lo <= hi)
+    {
+        // advance past elements that are already on the correct side of the pivot
+        for (; faceobjects[lo].prob > pivot; lo++)
+        {
+        }
+        for (; faceobjects[hi].prob < pivot; hi--)
+        {
+        }
+
+        // the cursors crossed: partitioning is finished
+        if (lo > hi)
+            break;
+
+        std::swap(faceobjects[lo], faceobjects[hi]);
+        lo++;
+        hi--;
+    }
+
+    #pragma omp parallel sections
+    {
+        #pragma omp section
+        {
+            if (left < hi) qsort_descent_inplace(faceobjects, left, hi);
+        }
+        #pragma omp section
+        {
+            if (lo < right) qsort_descent_inplace(faceobjects, lo, right);
+        }
+    }
+}
+
+// Sorts the whole vector by descending prob; does nothing for an empty vector.
+static void qsort_descent_inplace(std::vector<Object>& faceobjects)
+{
+    if (!faceobjects.empty())
+        qsort_descent_inplace(faceobjects, 0, faceobjects.size() - 1);
+}
+
+// Greedy non-maximum suppression over boxes that are already sorted by descending prob.
+//
+// faceobjects    candidates, highest score first
+// picked         output: indices into faceobjects of the boxes that survive (cleared first)
+// nms_threshold  a candidate is dropped when its IoU with an already kept box exceeds this
+// agnostic       when false, only boxes carrying the same label can suppress each other
+static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
+{
+    picked.clear();
+
+    const int n = faceobjects.size();
+
+    // box areas are needed for every IoU test, so compute them once
+    std::vector<float> areas(n);
+    for (int i = 0; i < n; i++)
+    {
+        areas[i] = faceobjects[i].rect.area();
+    }
+
+    for (int i = 0; i < n; i++)
+    {
+        const Object& a = faceobjects[i];
+
+        bool keep = true;
+        for (int j = 0; j < (int)picked.size(); j++)
+        {
+            const int kept_index = picked[j];
+            const Object& b = faceobjects[kept_index];
+
+            if (!agnostic && a.label != b.label)
+                continue;
+
+            // intersection over union
+            float inter_area = intersection_area(a, b);
+            float union_area = areas[i] + areas[kept_index] - inter_area;
+            if (inter_area / union_area > nms_threshold)
+            {
+                // one overlapping kept box is enough to reject the candidate,
+                // so the remaining kept boxes do not need to be tested
+                keep = false;
+                break;
+            }
+        }
+
+        if (keep)
+            picked.push_back(i);
+    }
+}
+
+// Logistic function 1 / (1 + e^-x), returned as float.
+static inline float sigmoid(float x)
+{
+    return static_cast<float>(1.f / (exp(-x) + 1.f));
+}
+
+// Decodes the raw output of one detection head into box proposals.
+//
+// anchors         flat list of (width, height) pairs, one pair per anchor of this head
+// stride          factor that converts grid cell coordinates of this head to pixels
+// in_pad          not read by this function; kept so existing callers stay valid
+// feat_blob       head output: every anchor owns (5 + num_class) consecutive channels
+//                 ordered x, y, w, h, objectness, class scores, one value per grid cell
+// prob_threshold  minimum sigmoid(objectness) * sigmoid(best class score) to keep a cell
+// objects         output: accepted proposals are appended, existing entries are kept
+static void generate_proposals(const ncnn::Mat& anchors, int stride, const ncnn::Mat& in_pad, const ncnn::Mat& feat_blob, float prob_threshold, std::vector<Object>& objects)
+{
+    // referenced only to silence the unused-parameter warning
+    (void)in_pad;
+
+    const int num_grid_x = feat_blob.w;
+    const int num_grid_y = feat_blob.h;
+
+    const int num_anchors = anchors.w / 2;
+
+    const int num_class = 80;
+
+    // channels owned by one anchor: 4 box terms + objectness + num_class class scores
+    const int channels_per_anchor = 5 + num_class;
+
+    // a blob with fewer channels than this layout needs (for example an empty blob
+    // after a failed extraction) cannot be decoded; return without proposals
+    // instead of reading past its last channel
+    if (feat_blob.c < num_anchors * channels_per_anchor)
+        return;
+
+    for (int q = 0; q < num_anchors; q++)
+    {
+        const float anchor_w = anchors[q * 2];
+        const float anchor_h = anchors[q * 2 + 1];
+
+        // index of the first channel that belongs to anchor q
+        const int base = q * channels_per_anchor;
+
+        for (int i = 0; i < num_grid_y; i++)
+        {
+            for (int j = 0; j < num_grid_x; j++)
+            {
+                // find class index with max class score
+                int class_index = 0;
+                float class_score = -FLT_MAX;
+                for (int k = 0; k < num_class; k++)
+                {
+                    float score = feat_blob.channel(base + 5 + k).row(i)[j];
+                    if (score > class_score)
+                    {
+                        class_index = k;
+                        class_score = score;
+                    }
+                }
+
+                float box_score = feat_blob.channel(base + 4).row(i)[j];
+
+                float confidence = sigmoid(box_score) * sigmoid(class_score);
+
+                if (confidence >= prob_threshold)
+                {
+                    // yolov5/models/yolo.py Detect forward
+                    // y = x[i].sigmoid()
+                    // y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i].to(x[i].device)) * self.stride[i]  # xy
+                    // y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh
+
+                    float dx = sigmoid(feat_blob.channel(base + 0).row(i)[j]);
+                    float dy = sigmoid(feat_blob.channel(base + 1).row(i)[j]);
+                    float dw = sigmoid(feat_blob.channel(base + 2).row(i)[j]);
+                    float dh = sigmoid(feat_blob.channel(base + 3).row(i)[j]);
+
+                    float pb_cx = (dx * 2.f - 0.5f + j) * stride;
+                    float pb_cy = (dy * 2.f - 0.5f + i) * stride;
+
+                    // (2 * t)^2 written as a plain product instead of a pow() call
+                    const float tw = dw * 2.f;
+                    const float th = dh * 2.f;
+                    float pb_w = tw * tw * anchor_w;
+                    float pb_h = th * th * anchor_h;
+
+                    // centre/size -> corner coordinates
+                    float x0 = pb_cx - pb_w * 0.5f;
+                    float y0 = pb_cy - pb_h * 0.5f;
+                    float x1 = pb_cx + pb_w * 0.5f;
+                    float y1 = pb_cy + pb_h * 0.5f;
+
+                    Object obj;
+                    obj.rect.x = x0;
+                    obj.rect.y = y0;
+                    obj.rect.width = x1 - x0;
+                    obj.rect.height = y1 - y0;
+                    obj.label = class_index;
+                    obj.prob = confidence;
+
+                    objects.push_back(obj);
+                }
+            }
+        }
+    }
+}
+
+// Runs the yolov7 model on an image and returns the detections in image coordinates.
+//
+// bgr      input image; its pixels are converted with PIXEL_BGR2RGB before inference
+// objects  output: replaced by the detections that survive NMS, with boxes mapped
+//          back to the original image and clipped to its bounds
+//
+// Returns 0 on success and -1 (with objects left empty) when yolov7.param or
+// yolov7.bin cannot be loaded.
+static int detect_yolov7(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    ncnn::Net yolov7;
+
+    yolov7.opt.use_vulkan_compute = true;
+    // yolov7.opt.use_bf16_storage = true;
+
+    // git clone https://github.com/WongKinYiu/yolov7
+    // cd yolov7
+    // wget https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7.pt
+    // python models/export.py --weights yolov7.pt
+    // pnnx yolov7.torchscript.pt inputshape=[1,3,640,640] inputshape=[1,3,320,320]
+    //
+    // a non-zero result from either loader means there is no usable model;
+    // report it and hand back an empty result instead of running inference
+    if (yolov7.load_param("yolov7.param") != 0)
+    {
+        fprintf(stderr, "load_param yolov7.param failed\n");
+        objects.clear();
+        return -1;
+    }
+    if (yolov7.load_model("yolov7.bin") != 0)
+    {
+        fprintf(stderr, "load_model yolov7.bin failed\n");
+        objects.clear();
+        return -1;
+    }
+
+    const int target_size = 640;
+    const float prob_threshold = 0.25f;
+    const float nms_threshold = 0.45f;
+
+    int img_w = bgr.cols;
+    int img_h = bgr.rows;
+
+    // yolov5/models/common.py DetectMultiBackend
+    const int max_stride = 64;
+
+    // resize so that the longer side equals target_size, keeping the aspect ratio
+    int w = img_w;
+    int h = img_h;
+    float scale = 1.f;
+    if (w > h)
+    {
+        scale = (float)target_size / w;
+        w = target_size;
+        h = h * scale;
+    }
+    else
+    {
+        scale = (float)target_size / h;
+        h = target_size;
+        w = w * scale;
+    }
+
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, w, h);
+
+    // letterbox: pad width and height up to the next multiple of max_stride,
+    // splitting the padding between the two opposite sides
+    // yolov5/utils/datasets.py letterbox
+    int wpad = (w + max_stride - 1) / max_stride * max_stride - w;
+    int hpad = (h + max_stride - 1) / max_stride * max_stride - h;
+    ncnn::Mat in_pad;
+    ncnn::copy_make_border(in, in_pad, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, ncnn::BORDER_CONSTANT, 114.f);
+
+    // scale pixel values by 1/255, no mean subtraction
+    const float norm_vals[3] = {1 / 255.f, 1 / 255.f, 1 / 255.f};
+    in_pad.substract_mean_normalize(0, norm_vals);
+
+    ncnn::Extractor ex = yolov7.create_extractor();
+
+    ex.input("in0", in_pad);
+
+    std::vector<Object> proposals;
+
+    // one entry per detection head: output blob name, stride and three anchor
+    // (width, height) pairs
+    // NOTE(review): the original comment attributes the anchors to
+    // yolov5/models/yolov5s.yaml - confirm against the yolov7 model config
+    static const char* const head_names[3] = {"out0", "out1", "out2"};
+    static const int head_strides[3] = {8, 16, 32};
+    static const float head_anchors[3][6] = {
+        {12.f, 16.f, 19.f, 36.f, 40.f, 28.f},
+        {36.f, 75.f, 76.f, 55.f, 72.f, 146.f},
+        {142.f, 110.f, 192.f, 243.f, 459.f, 401.f}
+    };
+
+    for (int head = 0; head < 3; head++)
+    {
+        ncnn::Mat out;
+        ex.extract(head_names[head], out);
+
+        ncnn::Mat anchors(6);
+        for (int k = 0; k < 6; k++)
+            anchors[k] = head_anchors[head][k];
+
+        // proposals are appended head after head, in the order stride 8, 16, 32
+        generate_proposals(anchors, head_strides[head], in_pad, out, prob_threshold, proposals);
+    }
+
+    // sort all proposals by score from highest to lowest
+    qsort_descent_inplace(proposals);
+
+    // apply nms with nms_threshold
+    std::vector<int> picked;
+    nms_sorted_bboxes(proposals, picked, nms_threshold);
+
+    int count = picked.size();
+
+    objects.resize(count);
+    for (int i = 0; i < count; i++)
+    {
+        objects[i] = proposals[picked[i]];
+
+        // undo the letterbox: remove the left/top padding, then undo the resize
+        float x0 = (objects[i].rect.x - (wpad / 2)) / scale;
+        float y0 = (objects[i].rect.y - (hpad / 2)) / scale;
+        float x1 = (objects[i].rect.x + objects[i].rect.width - (wpad / 2)) / scale;
+        float y1 = (objects[i].rect.y + objects[i].rect.height - (hpad / 2)) / scale;
+
+        // clip to the original image
+        x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
+        y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
+        x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
+        y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);
+
+        objects[i].rect.x = x0;
+        objects[i].rect.y = y0;
+        objects[i].rect.width = x1 - x0;
+        objects[i].rect.height = y1 - y0;
+    }
+
+    return 0;
+}
+
+// Prints every detection to stderr, draws its box and a "<class name> <percent>%"
+// caption on a copy of bgr, then shows the copy in a window named "image" and waits
+// for a key press. obj.label is used as an index into the 80-entry class_names table.
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    static const char* class_names[] = {
+        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
+        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
+        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
+        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
+        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
+        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
+        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
+        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
+        "hair drier", "toothbrush"
+    };
+
+    cv::Mat canvas = bgr.clone();
+
+    const size_t total = objects.size();
+    for (size_t idx = 0; idx < total; idx++)
+    {
+        const Object& det = objects[idx];
+
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", det.label, det.prob,
+                det.rect.x, det.rect.y, det.rect.width, det.rect.height);
+
+        cv::rectangle(canvas, det.rect, cv::Scalar(255, 0, 0));
+
+        char caption[256];
+        sprintf(caption, "%s %.1f%%", class_names[det.label], det.prob * 100);
+
+        int base_line = 0;
+        cv::Size caption_size = cv::getTextSize(caption, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &base_line);
+
+        // the caption sits above the box; keep it from starting above the image
+        int cap_y = det.rect.y - caption_size.height - base_line;
+        if (cap_y < 0)
+            cap_y = 0;
+
+        // and pull it left when it would run past the right edge
+        int cap_x = det.rect.x;
+        if (cap_x + caption_size.width > canvas.cols)
+            cap_x = canvas.cols - caption_size.width;
+
+        // white background rectangle first, black text on top of it
+        cv::rectangle(canvas, cv::Rect(cv::Point(cap_x, cap_y), cv::Size(caption_size.width, caption_size.height + base_line)),
+                      cv::Scalar(255, 255, 255), -1);
+
+        cv::putText(canvas, caption, cv::Point(cap_x, cap_y + caption_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", canvas);
+    cv::waitKey(0);
+}
+
+// Command-line entry point: takes exactly one argument, the path of the image to process.
+// Reads the image, runs detect_yolov7 on it and passes the detections to draw_objects.
+// Returns 0 after drawing, -1 on a usage error, an unreadable image or a failed detection.
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* imagepath = argv[1];
+
+    cv::Mat m = cv::imread(imagepath, 1);
+    if (m.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", imagepath);
+        return -1;
+    }
+
+    std::vector<Object> objects;
+
+    // detect_yolov7 reports its status through the return value; do not draw
+    // anything when it signals a failure
+    if (detect_yolov7(m, objects) != 0)
+    {
+        fprintf(stderr, "detect_yolov7 failed\n");
+        return -1;
+    }
+
+    draw_objects(m, objects);
+
+    return 0;
+}
diff --git a/examples/yolox.cpp b/examples/yolox.cpp
new file mode 100644
index 0000000..65e40e2
--- /dev/null
+++ b/examples/yolox.cpp
@@ -0,0 +1,424 @@
+// This file is written based on the following file:
+// https://github.com/Tencent/ncnn/blob/master/examples/yolov5.cpp
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+// ------------------------------------------------------------------------------
+// Copyright (C) 2020-2021, Megvii Inc. All rights reserved.
+
+#include "layer.h"
+#include "net.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <float.h>
+#include <stdio.h>
+#include <vector>
+
+#define YOLOX_NMS_THRESH  0.45 // IoU threshold handed to nms_sorted_bboxes
+#define YOLOX_CONF_THRESH 0.25 // minimum objectness * class score for a proposal to be kept
+#define YOLOX_TARGET_SIZE 640  // length the longer image side is resized to, might use 416 for small model
+
+// YOLOX use the same focus in yolov5
+// Custom ncnn layer implementing the "Focus" rearrangement (YOLOX uses the same
+// focus as yolov5): a w x h x c blob becomes a (w/2) x (h/2) x (4c) blob.
+class YoloV5Focus : public ncnn::Layer
+{
+public:
+    YoloV5Focus()
+    {
+        one_blob_only = true;
+    }
+
+    // Fills top_blob from bottom_blob. Output channel p reads input channel
+    // p % c, starting at row (p / c) % 2 and column (p / c) / 2, and takes every
+    // second value along a row. Returns 0 on success, -100 when top_blob could
+    // not be created.
+    virtual int forward(const ncnn::Mat& bottom_blob, ncnn::Mat& top_blob, const ncnn::Option& opt) const
+    {
+        const int in_w = bottom_blob.w;
+        const int in_c = bottom_blob.c;
+
+        const int out_w = in_w / 2;
+        const int out_h = bottom_blob.h / 2;
+        const int out_c = in_c * 4;
+
+        top_blob.create(out_w, out_h, out_c, 4u, 1, opt.blob_allocator);
+        if (top_blob.empty())
+            return -100;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int p = 0; p < out_c; p++)
+        {
+            // which of the four sub-grids this output channel samples
+            const int phase = p / in_c;
+
+            const float* src = bottom_blob.channel(p % in_c).row(phase % 2) + (phase / 2);
+            float* dst = top_blob.channel(p);
+
+            for (int y = 0; y < out_h; y++)
+            {
+                for (int x = 0; x < out_w; x++)
+                {
+                    dst[x] = src[x * 2];
+                }
+
+                // step past the values just read plus one more input row
+                dst += out_w;
+                src += out_w * 2 + in_w;
+            }
+        }
+
+        return 0;
+    }
+};
+
+DEFINE_LAYER_CREATOR(YoloV5Focus)
+
+// One detection produced by the example.
+struct Object
+{
+    cv::Rect_<float> rect; // bounding box as x, y, width, height
+    int label;             // class index; draw_objects uses it to index class_names
+    float prob;            // score (objectness * class score); used for sorting and filtering
+};
+
+// Position of one grid cell together with the stride of the level it belongs to.
+// generate_grids_and_stride emits one entry per cell of every level.
+struct GridAndStride
+{
+    int grid0;  // cell index along the width axis
+    int grid1;  // cell index along the height axis
+    int stride; // stride of the level this cell belongs to
+};
+
+// Returns the area shared by the boxes of a and b: the two rects are combined
+// with operator& and the area of the resulting rectangle is reported.
+static inline float intersection_area(const Object& a, const Object& b)
+{
+    cv::Rect_<float> overlap = a.rect & b.rect;
+    const float overlap_area = overlap.area();
+    return overlap_area;
+}
+
+// In-place quicksort of faceobjects[left..right] (inclusive bounds), highest prob first.
+// The prob of the middle element is the pivot; after partitioning, the two
+// sub-ranges are sorted recursively, each inside its own OpenMP section.
+static void qsort_descent_inplace(std::vector<Object>& faceobjects, int left, int right)
+{
+    int up = left;
+    int down = right;
+    const float mid_prob = faceobjects[(left + right) / 2].prob;
+
+    for (; up <= down;)
+    {
+        // find the next pair of elements that sit on the wrong side of the pivot
+        while (faceobjects[up].prob > mid_prob)
+            ++up;
+
+        while (faceobjects[down].prob < mid_prob)
+            --down;
+
+        if (up <= down)
+        {
+            std::swap(faceobjects[up], faceobjects[down]);
+            ++up;
+            --down;
+        }
+    }
+
+    #pragma omp parallel sections
+    {
+        #pragma omp section
+        {
+            if (left < down) qsort_descent_inplace(faceobjects, left, down);
+        }
+        #pragma omp section
+        {
+            if (up < right) qsort_descent_inplace(faceobjects, up, right);
+        }
+    }
+}
+
+// Sorts all of objects by descending prob; an empty vector is left untouched.
+static void qsort_descent_inplace(std::vector<Object>& objects)
+{
+    if (!objects.empty())
+        qsort_descent_inplace(objects, 0, objects.size() - 1);
+}
+
+// Greedy non-maximum suppression over boxes that are already sorted by descending prob.
+//
+// faceobjects    candidates, highest score first
+// picked         output: indices into faceobjects of the boxes that survive (cleared first)
+// nms_threshold  a candidate is dropped when its IoU with an already kept box exceeds this
+// agnostic       when false, only boxes carrying the same label can suppress each other
+static void nms_sorted_bboxes(const std::vector<Object>& faceobjects, std::vector<int>& picked, float nms_threshold, bool agnostic = false)
+{
+    picked.clear();
+
+    const int n = faceobjects.size();
+
+    // box areas are reused by every IoU test, so compute them up front
+    std::vector<float> areas(n);
+    for (int i = 0; i < n; i++)
+    {
+        areas[i] = faceobjects[i].rect.area();
+    }
+
+    for (int i = 0; i < n; i++)
+    {
+        const Object& a = faceobjects[i];
+
+        bool keep = true;
+        for (int j = 0; j < (int)picked.size(); j++)
+        {
+            const int kept_index = picked[j];
+            const Object& b = faceobjects[kept_index];
+
+            if (!agnostic && a.label != b.label)
+                continue;
+
+            // intersection over union
+            float inter_area = intersection_area(a, b);
+            float union_area = areas[i] + areas[kept_index] - inter_area;
+            if (inter_area / union_area > nms_threshold)
+            {
+                // the candidate is already rejected; testing the remaining
+                // kept boxes cannot change the outcome
+                keep = false;
+                break;
+            }
+        }
+
+        if (keep)
+            picked.push_back(i);
+    }
+}
+
+// Appends one GridAndStride entry per grid cell to grid_strides.
+// For each stride, in the order given, the grid has target_w / stride columns and
+// target_h / stride rows; cells are emitted row by row, all columns of a row first.
+static void generate_grids_and_stride(const int target_w, const int target_h, std::vector<int>& strides, std::vector<GridAndStride>& grid_strides)
+{
+    const int level_count = (int)strides.size();
+    for (int level = 0; level < level_count; level++)
+    {
+        const int step = strides[level];
+        const int cols = target_w / step;
+        const int rows = target_h / step;
+
+        for (int row = 0; row < rows; row++)
+        {
+            for (int col = 0; col < cols; col++)
+            {
+                GridAndStride cell;
+                cell.grid0 = col;
+                cell.grid1 = row;
+                cell.stride = step;
+                grid_strides.push_back(cell);
+            }
+        }
+    }
+}
+
+// Decodes the YOLOX output blob into box proposals.
+//
+// grid_strides    one entry per blob row, in the same order, giving the grid cell
+//                 and stride that row belongs to
+// feat_blob       blob with one row per grid cell; each row holds
+//                 x, y, w, h, objectness followed by one score per class
+// prob_threshold  a (cell, class) pair is kept when objectness * class score exceeds it
+// objects         output: accepted proposals are appended, existing entries are kept
+static void generate_yolox_proposals(const std::vector<GridAndStride>& grid_strides, const ncnn::Mat& feat_blob, float prob_threshold, std::vector<Object>& objects)
+{
+    const int num_grid = feat_blob.h;
+    const int num_class = feat_blob.w - 5;
+
+    // an empty blob (for example after a failed extraction) or a row too narrow
+    // to hold any class score yields no proposals
+    if (feat_blob.empty() || num_class <= 0)
+        return;
+
+    // rows and grid cells pair up one-to-one; stop at the shorter of the two so
+    // a mismatch can never read past the end of the blob
+    int num_anchors = grid_strides.size();
+    if (num_anchors > num_grid)
+        num_anchors = num_grid;
+
+    const float* feat_ptr = feat_blob.channel(0);
+    for (int anchor_idx = 0; anchor_idx < num_anchors; anchor_idx++)
+    {
+        const int grid0 = grid_strides[anchor_idx].grid0;
+        const int grid1 = grid_strides[anchor_idx].grid1;
+        const int stride = grid_strides[anchor_idx].stride;
+
+        // yolox/models/yolo_head.py decode logic
+        //  outputs[..., :2] = (outputs[..., :2] + grids) * strides
+        //  outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides
+        float x_center = (feat_ptr[0] + grid0) * stride;
+        float y_center = (feat_ptr[1] + grid1) * stride;
+        float w = exp(feat_ptr[2]) * stride;
+        float h = exp(feat_ptr[3]) * stride;
+        float x0 = x_center - w * 0.5f;
+        float y0 = y_center - h * 0.5f;
+
+        float box_objectness = feat_ptr[4];
+        for (int class_idx = 0; class_idx < num_class; class_idx++)
+        {
+            float box_cls_score = feat_ptr[5 + class_idx];
+            float box_prob = box_objectness * box_cls_score;
+            if (box_prob > prob_threshold)
+            {
+                // the same cell may emit several proposals, one per class above the threshold
+                Object obj;
+                obj.rect.x = x0;
+                obj.rect.y = y0;
+                obj.rect.width = w;
+                obj.rect.height = h;
+                obj.label = class_idx;
+                obj.prob = box_prob;
+
+                objects.push_back(obj);
+            }
+
+        } // class loop
+
+        // advance to the next row of the blob
+        feat_ptr += feat_blob.w;
+
+    } // point anchor loop
+}
+
+// Runs the YOLOX model on an image and returns the detections in image coordinates.
+//
+// bgr      input image; its pixels are passed to the network as PIXEL_BGR
+// objects  output: replaced by the detections that survive NMS, with boxes scaled
+//          back to the original image and clipped to its bounds
+//
+// Returns 0. The process exits with -1 when yolox.param or yolox.bin fails to load.
+static int detect_yolox(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    ncnn::Net yolox;
+
+    yolox.opt.use_vulkan_compute = true;
+    // yolox.opt.use_bf16_storage = true;
+
+    // Focus in yolov5
+    yolox.register_custom_layer("YoloV5Focus", YoloV5Focus_layer_creator);
+
+    // original pretrained model from https://github.com/Megvii-BaseDetection/YOLOX
+    // ncnn model param: https://github.com/Megvii-BaseDetection/YOLOX/releases/download/0.1.1rc0/yolox_s_ncnn.tar.gz
+    // NOTE: the newest YOLOX version removed the normalization of the model input
+    // (subtract mean, then divide by std); a mismatch can make the model outputs
+    // meaningless, so check carefully.
+    // a non-zero result from either loader terminates the process
+    if (yolox.load_param("yolox.param"))
+        exit(-1);
+    if (yolox.load_model("yolox.bin"))
+        exit(-1);
+
+    int img_w = bgr.cols;
+    int img_h = bgr.rows;
+
+    // resize so that the longer side equals YOLOX_TARGET_SIZE, keeping the aspect ratio
+    int w = img_w;
+    int h = img_h;
+    float scale = 1.f;
+    if (w > h)
+    {
+        scale = (float)YOLOX_TARGET_SIZE / w;
+        w = YOLOX_TARGET_SIZE;
+        h = h * scale;
+    }
+    else
+    {
+        scale = (float)YOLOX_TARGET_SIZE / h;
+        h = YOLOX_TARGET_SIZE;
+        w = w * scale;
+    }
+    ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR, img_w, img_h, w, h);
+
+    // pad width and height up to the next multiple of 32
+    int wpad = (w + 31) / 32 * 32 - w;
+    int hpad = (h + 31) / 32 * 32 - h;
+    ncnn::Mat in_pad;
+    // different from yolov5, yolox only pads on the bottom and right side,
+    // which means no padding offset is needed to decode the box coordinates.
+    ncnn::copy_make_border(in, in_pad, 0, hpad, 0, wpad, ncnn::BORDER_CONSTANT, 114.f);
+
+    ncnn::Extractor ex = yolox.create_extractor();
+
+    ex.input("images", in_pad);
+
+    std::vector<Object> proposals;
+
+    {
+        ncnn::Mat out;
+        ex.extract("output", out);
+
+        static const int stride_arr[] = {8, 16, 32}; // might have stride=64 in YOLOX
+        std::vector<int> strides(stride_arr, stride_arr + sizeof(stride_arr) / sizeof(stride_arr[0]));
+        std::vector<GridAndStride> grid_strides;
+        // the grid is built from the padded input size, one entry per cell and stride
+        generate_grids_and_stride(in_pad.w, in_pad.h, strides, grid_strides);
+        generate_yolox_proposals(grid_strides, out, YOLOX_CONF_THRESH, proposals);
+    }
+
+    // sort all proposals by score from highest to lowest
+    qsort_descent_inplace(proposals);
+
+    // apply nms with nms_threshold
+    std::vector<int> picked;
+    nms_sorted_bboxes(proposals, picked, YOLOX_NMS_THRESH);
+
+    int count = picked.size();
+
+    objects.resize(count);
+    for (int i = 0; i < count; i++)
+    {
+        objects[i] = proposals[picked[i]];
+
+        // undo the resize; no offset is subtracted because padding was added
+        // on the bottom and right only
+        float x0 = (objects[i].rect.x) / scale;
+        float y0 = (objects[i].rect.y) / scale;
+        float x1 = (objects[i].rect.x + objects[i].rect.width) / scale;
+        float y1 = (objects[i].rect.y + objects[i].rect.height) / scale;
+
+        // clip to the original image
+        x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
+        y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
+        x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
+        y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);
+
+        objects[i].rect.x = x0;
+        objects[i].rect.y = y0;
+        objects[i].rect.width = x1 - x0;
+        objects[i].rect.height = y1 - y0;
+    }
+
+    return 0;
+}
+
+// Logs each detection to stderr and renders it on a copy of bgr: the box outline
+// plus a "<class name> <percent>%" tag on a white background. The annotated copy is
+// shown in a window named "image" until a key is pressed. obj.label indexes the
+// 80-entry class_names table.
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    static const char* class_names[] = {
+        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
+        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
+        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
+        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
+        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
+        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
+        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
+        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
+        "hair drier", "toothbrush"
+    };
+
+    cv::Mat annotated = bgr.clone();
+
+    for (size_t n = 0; n < objects.size(); n++)
+    {
+        const Object& hit = objects[n];
+
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", hit.label, hit.prob,
+                hit.rect.x, hit.rect.y, hit.rect.width, hit.rect.height);
+
+        cv::rectangle(annotated, hit.rect, cv::Scalar(255, 0, 0));
+
+        char tag[256];
+        sprintf(tag, "%s %.1f%%", class_names[hit.label], hit.prob * 100);
+
+        int tag_baseline = 0;
+        cv::Size tag_size = cv::getTextSize(tag, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &tag_baseline);
+
+        // the tag goes above the box, clamped to the top edge of the image
+        int tag_x = hit.rect.x;
+        int tag_y = hit.rect.y - tag_size.height - tag_baseline;
+        tag_y = (tag_y < 0) ? 0 : tag_y;
+
+        // shift the tag left when it would extend past the right edge
+        const bool past_right_edge = tag_x + tag_size.width > annotated.cols;
+        if (past_right_edge)
+            tag_x = annotated.cols - tag_size.width;
+
+        cv::rectangle(annotated, cv::Rect(cv::Point(tag_x, tag_y), cv::Size(tag_size.width, tag_size.height + tag_baseline)),
+                      cv::Scalar(255, 255, 255), -1);
+
+        cv::putText(annotated, tag, cv::Point(tag_x, tag_y + tag_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", annotated);
+    cv::waitKey(0);
+}
+
+// Command-line entry point: takes exactly one argument, the path of the image to process.
+// Reads the image, runs detect_yolox on it and passes the detections to draw_objects.
+// Returns 0 after drawing, -1 on a usage error, an unreadable image or a failed detection.
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* imagepath = argv[1];
+
+    cv::Mat m = cv::imread(imagepath, 1);
+    if (m.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", imagepath);
+        return -1;
+    }
+
+    std::vector<Object> objects;
+
+    // detect_yolox reports its status through the return value; do not draw
+    // anything when it signals a failure
+    if (detect_yolox(m, objects) != 0)
+    {
+        fprintf(stderr, "detect_yolox failed\n");
+        return -1;
+    }
+
+    draw_objects(m, objects);
+
+    return 0;
+}