Tutorial to Use GPU ORB Extractor Feature#
This tutorial shows how to use GPU orb-extractor feature library API.
The GPU orb-extractor feature library offers thread-safe support for both single and multiple cameras.
This tutorial illustrates GPU orb-extractor feature library usage with OpenCV cv::Mat and cv::KeyPoint.
It explains employing multiple CPU threads with multiple ORB extractor objects, as well as using a single orb-extractor feature object to handle multiple camera inputs.
The multithread feature provides more flexibility for visual SLAM to call multiple objects of the orb-extractor feature library.
Note
This tutorial can be run both inside and outside a Docker* image. We assume that the liborb-lze-dev Deb package has been installed, and that the user has copied the tutorial directory from /opt/intel/orb_lze/samples/ to a user-writable directory.
Prepare the environment:
sudo apt install liborb-lze-dev
cp -r /opt/intel/orb_lze/samples/ ~/orb_lze_samples
cd ~/orb_lze_samples/
main.cpp
should be in the directory with following content:1// SPDX-License-Identifier: Apache-2.0 2// Copyright (C) 2025 Intel Corporation 3#include "orb_extractor.h" 4#include "cmd_parser.h" 5#include <opencv2/opencv.hpp> 6#include <opencv2/features2d.hpp> 7#include <opencv2/highgui/highgui.hpp> 8#include <fstream> 9#include <chrono> 10#include <memory> 11#include <thread> 12 13using namespace std; 14 15constexpr uint32_t max_num_keypts_ = 2000; 16constexpr int num_levels_ = 8; 17constexpr int ini_fast_thr_ = 20; 18constexpr int min_fast_thr_ = 7; 19constexpr float scale_factor_ = 1.2f; 20 21struct All_Images 22{ 23 std::string image_title; 24 cv::Mat img; 25}; 26 27std::vector<All_Images> gl_images; 28 29inline double getTimeStamp() 30{ 31 std::chrono::system_clock::duration d = std::chrono::system_clock::now().time_since_epoch(); 32 std::chrono::seconds s = std::chrono::duration_cast<std::chrono::seconds>(d); 33 return s.count() + (std::chrono::duration_cast<std::chrono::microseconds>(d - s).count()) / 1e6; 34} 35 36void extract(int num_cam, const std::string& image_path, const std::string& thread_name, int iterations) 37{ 38 int num_of_cameras = num_cam; 39 std::vector<cv::Mat> all_images; 40 all_images.resize(num_of_cameras); 41 for(int i = 0; i < num_of_cameras; i++) 42 { 43 all_images[i] = cv::imread(image_path, cv::IMREAD_GRAYSCALE); 44 } 45 46 std::vector<std::vector<KeyType>> keypts(num_of_cameras); 47 std::vector<MatType> all_descriptors(num_of_cameras); 48 49#ifdef OPENCV_FREE 50 Mat2d *images = new Mat2d[num_of_cameras]; 51 std::vector<MatType> in_image_array; 52 for( int i = 0; i < num_of_cameras; i++) 53 { 54 images[i] = Mat2d(all_images[i].rows, all_images[i].cols, all_images[i].data); 55 in_image_array.push_back(images[i]); 56 } 57 std::vector<MatType> in_image_mask_array; 58 std::vector<MatType> descriptor_array; 59#else 60 const cv::_InputArray in_image_array(all_images); 61 const cv::_InputArray in_image_mask_array; 62 const cv::_OutputArray 
descriptor_array(all_descriptors); 63#endif 64 65 std::vector<std::vector<float>> mask_rect; 66 67 std::string thread_id = thread_name; 68 69 try 70 { 71 auto extractor = std::make_shared<orb_extractor>(max_num_keypts_, scale_factor_, num_levels_, ini_fast_thr_, min_fast_thr_, num_of_cameras, mask_rect); 72 extractor->set_gpu_kernel_path(ORBLZE_KERNEL_PATH_STRING); 73 74 double total_host_time = 0.0; 75 76 for (int i = 0; i < iterations; i++) 77 { 78 std::cout << "iteration " << i+1 <<"/" << iterations << "\r"; 79 std::cout.flush(); 80 double host_start = getTimeStamp(); 81 extractor->extract(in_image_array, in_image_mask_array, keypts, descriptor_array); 82 double host_end = getTimeStamp(); 83 double host_time_diff = (host_end - host_start)/(float)iterations; 84 total_host_time += host_time_diff; 85 } 86 87 std::cout << "\n" << thread_id << ": gpu host time=" << total_host_time*1000.0 << std::endl; 88 } 89 catch(const std::exception& e) 90 { 91 std::cout << "\n Exception caught:" << e.what(); 92 exit(1); 93 } 94 std::vector<std::vector<cv::KeyPoint>> all_keypts(num_of_cameras); 95 96#ifdef OPENCV_FREE 97 for(int i=0; i < num_of_cameras; i++) 98 { 99 auto& gpu_keypts = keypts.at(i); 100 for (int pt=0; pt < gpu_keypts.size(); pt++) 101 { 102 all_keypts[i].emplace_back(cv::KeyPoint(gpu_keypts[pt].x, gpu_keypts[pt].y, 103 gpu_keypts[pt].size, gpu_keypts[pt].angle, gpu_keypts[pt].response, 104 gpu_keypts[pt].octave, -1)); 105 } 106 } 107#else 108 for(int i=0; i < num_of_cameras; i++) 109 { 110 all_keypts.at(i) = keypts.at(i); 111 } 112#endif 113 114 std::vector<cv::Mat> out; 115 out.resize(num_of_cameras); 116 117 thread_id = thread_id + "_and_"; 118 119 for( int i = 0; i < num_of_cameras; i++) 120 { 121 out.at(i).create(all_images.at(i).rows, all_images.at(i).cols, CV_8U); 122 cv::drawKeypoints(all_images.at(i), all_keypts[i], out[i], cv::Scalar(255,0,0)); 123 char no[20]; 124 sprintf(no,"Img:%d",i+1); 125 All_Images obj; 126 obj.image_title = thread_id + no; 127 
obj.img = out[i]; 128 gl_images.push_back(obj); 129 } 130} 131 132int main(int argc, char** argv) 133{ 134 if(!ParseCommandLine(argc, argv)) 135 { 136 return 0; 137 } 138 139 const int num_images = FLAGS_images; 140 const int num_of_threads = FLAGS_threads; 141 const int num_of_iter = FLAGS_iterations; 142 std::string image_path = FLAGS_image_path; 143 144 std::vector<std::thread> threads; 145 146 for (int i = 0; i < num_of_threads; ++i) 147 { 148 std::string thread_name = "Thread:" + std::to_string(i + 1); 149 threads.emplace_back(extract, num_images, image_path.c_str(), thread_name, num_of_iter); 150 } 151 for (auto& thread : threads) 152 thread.join(); 153 154 //show the images 155 for (int i = 0; i < (num_images * num_of_threads); i++) 156 { 157 cv::imshow(gl_images[i].image_title, gl_images[i].img); 158 } 159 cv::waitKey(0); 160 161 return 0; 162}
Build the code:
mkdir build && cd build
cmake ../
make -j
Run the binary:
./feature_extract -h
Following are the command line arguments:
Usage: ./feature_extract --images=<> --image_path=<> --threads=<>
--images <integer> : Number of images or number of cameras. Default value: 1
--image_path <string> : Path to input image files. Default value: image.jpg
--threads <integer> : Number of threads to run. Default value: 1
--iterations <integer> : Number of iterations to run. Default value: 10
The following command runs four threads, each thread taking a two-camera image input.
./feature_extract --images=2 --threads=4
Expected results example:
./feature_extract --images=2 --threads=4 iteration 10/10 Thread:2: gpu host time=21.4233 iteration 10/10 Thread:1: gpu host time=21.133 iteration 10/10 Thread:4: gpu host time=20.9086 iteration 10/10 Thread:3: gpu host time=20.6155
After executing, the input image will display keypoints in blue color dots.
Note
Here, you can specify the number of images per thread and the number of threads to be executed. You have the option to process multiple image inputs within a single thread's extract API call, or to process one or more image inputs using multiple threads, each making its own extract API call.
Code Explanation#
Configuration for the ORB extractor:
using namespace std;
constexpr uint32_t max_num_keypts_ = 2000;
constexpr int num_levels_ = 8;
constexpr int ini_fast_thr_ = 20;
Initialize the input and output parameters:
{
int num_of_cameras = num_cam;
std::vector<cv::Mat> all_images;
all_images.resize(num_of_cameras);
for(int i = 0; i < num_of_cameras; i++)
{
all_images[i] = cv::imread(image_path, cv::IMREAD_GRAYSCALE);
}
std::vector<std::vector<KeyType>> keypts(num_of_cameras);
std::vector<MatType> all_descriptors(num_of_cameras);
#ifdef OPENCV_FREE
Mat2d *images = new Mat2d[num_of_cameras];
std::vector<MatType> in_image_array;
for( int i = 0; i < num_of_cameras; i++)
{
images[i] = Mat2d(all_images[i].rows, all_images[i].cols, all_images[i].data);
in_image_array.push_back(images[i]);
}
std::vector<MatType> in_image_mask_array;
std::vector<MatType> descriptor_array;
#else
const cv::_InputArray in_image_array(all_images);
const cv::_InputArray in_image_mask_array;
Create the orb_extractor object:
try
Set gpu kernel path: Specify the path to GPU binaries such as gaussian_genx.bin, resize_genx.bin.
{
Note
The macro ORBLZE_KERNEL_PATH_STRING is defined as “/usr/lib/x86_64-linux-gnu” in the header file config.h
.
This header file is installed by the Deb package liborb-lze-dev
at /usr/include/config.h.
Call the extract function to output the keypoints and descriptors for all camera input images. Depending on the number of camera inputs, the orb-extractor feature library returns the number of the keypoints vector and the descriptors vector.
std::cout.flush();
Draw the keypoints on the image. Keypoints are drawn on each image and stored in the corresponding cv::Mat vector.
#endif
std::vector<cv::Mat> out;
out.resize(num_of_cameras);
thread_id = thread_id + "_and_";
for( int i = 0; i < num_of_cameras; i++)
{
out.at(i).create(all_images.at(i).rows, all_images.at(i).cols, CV_8U);
cv::drawKeypoints(all_images.at(i), all_keypts[i], out[i], cv::Scalar(255,0,0));
char no[20];
sprintf(no,"Img:%d",i+1);
All_Images obj;
obj.image_title = thread_id + no;
obj.img = out[i];
Create multiple threads. Each thread will create one orb-extractor feature object.
std::string image_path = FLAGS_image_path;
std::vector<std::thread> threads;
for (int i = 0; i < num_of_threads; ++i)
{
std::string thread_name = "Thread:" + std::to_string(i + 1);
threads.emplace_back(extract, num_images, image_path.c_str(), thread_name, num_of_iter);
}
Display images:
thread.join();
//show the images
for (int i = 0; i < (num_images * num_of_threads); i++)
{
cv::imshow(gl_images[i].image_title, gl_images[i].img);