Skip to main content
Bumped by Community user
Spelling and grammar
Source Link
Toby Speight
  • 88.7k
  • 14
  • 104
  • 327

I have found that a bottleneck of the OpenCV application I use is the bilinear interpolation, so I have tried to optimize it. The bilinear interpolation is in 8D space, so each "color" is an 8 dimensional vector in [0;255][0,255].

I have also written a small benchmarking program. After long harbor, finally I achieved it! It

My code is even faster sometimes betweenbetween 0% and 50% of improvementfaster than the original one. I would know if there is others improvements possible too, and also I would like to know if there is any bugs. And if hopefelly, if the time improvements is not by pure chance on my computer architecture.

I would like to know of any other improvements I could make and also I would like to know if I missed any bugs. And is the performance increase general, and not just specific to my computer architecture?

#include <opencv2/opencv.hpp>
#include <cstdlib>
#include <iostream>
#include <array>
#include <time.h>
     
using std::cout;
using std::endl;

float random()
{
    return static_cast<float>(rand()) / RAND_MAX * 10.0f;
}

template <int cn>
cv::Vec<uchar, cn> bilinearInterpolation_original( cv::Vec<uchar,cn> q11, cv::Vec<uchar,cn> q12, cv::Vec<uchar,cn> q21, cv::Vec<uchar,cn> q22, const float x1, const float x2, const float y1, const float y2, const float x, const float y)
{
    cv::Vec<uchar, cn> interpolateColor;
    float x2x1, y2y1, x2x, y2y, yy1, xx1;
    x2x1 = x2 - x1;
    y2y1 = y2 - y1;
    x2x = x2 - x;
    y2y = y2 - y;
    yy1 = y - y1;
    xx1 = x - x1;
    
    const float k = 1.f / (x2x1 * y2y1);

    //calculate the interpolation for all the chanes
    for (int i =0; i < cn; i++)
    {
        float interpolation = k * (
            q11[i] * x2x * y2y +
            q21[i] * xx1 * y2y +
            q12[i] * x2x * yy1 +
            q22[i] * xx1 * yy1
            );
        interpolateColor[i] = cvRound(interpolation);
    }
    return interpolateColor;
}

#include <immintrin.h>
#include <xmmintrin.h>

/**
    * Convert 8x32bits values into 8x8bits values. The last 64 bits are zero-ed.
    * If values are greather than 255, only low bytes are kept.
**/
__m128i _mm256_convert_epi32_epi8(__m256i _a)
{
    // Convert from 'int32' to 'int8'
    // Digit = Sample value with one byte length:
    // FROM: 1000|2000|3000|4000||||5000|6000|7000|8000     (BIG ENDIAN)
    // TO:   1234|0000|0000|0000||||5678|0000|0000|0000
    static const __m256i _mask = []() {
        std::array<uchar, 32> mask;
        std::fill(mask.begin(), mask.end(), -1);
        for (int i = 0; i < 4; i++) {
            mask[   i] = 4 * i;
            mask[16+i] = 4 * i;
        }

        return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(mask.data()));
    }();

    const __m256i _a_shuffled = _mm256_shuffle_epi8(_a, _mask);

    const __m128i _low = _mm256_extractf128_si256(_a_shuffled, 0);
    __m128i _high = _mm256_extractf128_si256(_a_shuffled, 1);
    _high = _mm_slli_epi64(_high, 32);
    const __m128i _or = _mm_or_si128(_low, _high);

    return _or;
}

template <int cn>
cv::Vec<uchar, cn> bilinearInterpolation_mat( cv::Vec<uchar,cn> q11, cv::Vec<uchar,cn> q12, cv::Vec<uchar,cn> q21, cv::Vec<uchar,cn> q22, const float x1, const float x2, const float y1, const float y2, const float x, const float y)
{
    alignas(16) std::array<float, 8> C;

    const __m256 A_vec = _mm256_set_ps(0, 0, x, y, y2, x2, y2, x2); // Arguments are in reverse
    const __m256 B_vec = _mm256_set_ps(0, 0, x1, y1, y, x, y1, x1);
    __m256 Sub = _mm256_sub_ps(A_vec, B_vec);
    _mm256_store_ps(C.data(), Sub);
    
    const float& x2x1 = C[0];
    const float& y2y1 = C[1];
    const float& x2x = C[2];
    const float& y2y = C[3];
    const float& yy1 = C[4];
    const float& xx1 = C[5];
   
    // Load each register as an array of 8x the same float32 value which is ks[i]
    const __m256 _k11 = _mm256_set1_ps(x2x * y2y);
    const __m256 _k21 = _mm256_set1_ps(xx1 * y2y);
    const __m256 _k12 = _mm256_set1_ps(x2x * yy1);
    const __m256 _k22 = _mm256_set1_ps(xx1 * yy1);
    const __m256 _k   = _mm256_set1_ps(1.0f / (x2x1 * y2y1)); // Global multiplier coefficient

    // Load 8 x 'uint8' (8 * 8 = 64bits) into the 64 low-bits of the register
    __m128i _q11_uint8 = _mm_loadu_si64(q11.val);

    // Convert from 'uint8' to 'int32'
    // Because conversions from integral types to floating types only work with 'int32'
    __m256i _q11_int32 = _mm256_cvtepu8_epi32(_q11_uint8);

    // Convert from 'int32' to 'float'
    __m256 _q11_float32 = _mm256_cvtepi32_ps(_q11_int32);

    // Same with q21, q12, and q22
    // ######
    __m128i _q21_uint8 = _mm_loadu_si64(q21.val);
    __m256i _q21_int32 = _mm256_cvtepu8_epi32(_q21_uint8);
    __m256 _q21_float32 = _mm256_cvtepi32_ps(_q21_int32);

    __m128i _q12_uint8 = _mm_loadu_si64(q12.val);
    __m256i _q12_int32 = _mm256_cvtepu8_epi32(_q12_uint8);
    __m256 _q12_float32 = _mm256_cvtepi32_ps(_q12_int32);

    __m128i _q22_uint8 = _mm_loadu_si64(q22.val);
    __m256i _q22_int32 = _mm256_cvtepu8_epi32(_q22_uint8);
    __m256 _q22_float32 = _mm256_cvtepi32_ps(_q22_int32);
    // ######

    __m256 _res = {};
    _res = _mm256_fmadd_ps(_k11, _q11_float32, _res);
    _res = _mm256_fmadd_ps(_k12, _q12_float32, _res);
    _res = _mm256_fmadd_ps(_k21, _q21_float32, _res);
    _res = _mm256_fmadd_ps(_k22, _q22_float32, _res);
    _res = _mm256_mul_ps(_res, _k);

    // Round to nearest int, suppress exceptions
    _res = _mm256_round_ps(_res, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));

    // Convert back from 'float32' to 'int32'
    __m256i _resi = _mm256_cvtps_epi32(_res);

    // Convert back from 'int32' to 'uint8'
    __m128i _trunc = _mm256_convert_epi32_epi8(_resi);

    // Convert the packed 'uint8' to cv::Vec
    cv::Vec<uchar, cn> interpolateColor;
    _mm_storel_epi64(reinterpret_cast<__m128i*>(interpolateColor.val), _trunc);
    return interpolateColor;
}

struct TestData
{
    cv::Vec<uchar, 8> q11;
    cv::Vec<uchar, 8> q12;
    cv::Vec<uchar, 8> q21;
    cv::Vec<uchar, 8> q22;
    float x1, x2, y1, y2, x, y;
};

const int N = 10'000'000;
std::vector<TestData> testData;

template<typename T>
void test(const char* title, T func)
{
    clock_t start, end;
    double cpu_time_used;
    start = clock();

    cv::Vec<uchar, 8> res;
    int aa(0);

    for(int i = 0; i < N; i++)
    {
        auto& d = testData[i];
        res = func(d.q11, d.q12, d.q21, d.q22, d.x1, d.x2, d.y1, d.y2, d.x, d.y);

#if PRINT
        for (int i = 0; i < 8; i++)
        {
            std::cout << (int)res[i] << ", ";
        }
        std::cout << std::endl;
#endif

        for(int i = 0; i < 8; i++) aa += res[i];
    }

    end = clock();
    cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;

    std::cout << aa << std::endl;
    std::cout << title << ": " << cpu_time_used << "s" << std::endl;
}


void test()
{
}

int  main()
{
    test();

    testData.reserve(N);
    for(int i = 0; i < N; i++)
    {
        TestData data;
        cv::randu(data.q11, 0, 255);
        cv::randu(data.q12, 0, 255);
        cv::randu(data.q21, 0, 255);
        cv::randu(data.q22, 0, 255);
        data.x1 = random();
        data.x2 = random();
        data.y1 = random();
        data.y2 = random();
        data.x = random();
        data.y = random();
        testData.push_back(data);
    }

    test("original", bilinearInterpolation_original<8>);
    test("mat", bilinearInterpolation_mat<8>);

    return EXIT_SUCCESS;
}

```

I have found that a bottleneck of the OpenCV application I use is the bilinear interpolation, so I have tried to optimize it. The bilinear interpolation is in 8D space, so each "color" is an 8 dimensional vector in [0;255].

I have also written a small benchmarking program. After long harbor, finally I achieved it! It is even faster sometimes between 0% and 50% of improvement than the original one. I would know if there is others improvements possible too, and also I would like to know if there is any bugs. And if hopefelly, if the time improvements is not by pure chance on my computer architecture.

#include <opencv2/opencv.hpp>
#include <cstdlib>
#include <iostream>
#include <array>
#include <time.h>
     
using std::cout;
using std::endl;

float random()
{
    return static_cast<float>(rand()) / RAND_MAX * 10.0f;
}

template <int cn>
cv::Vec<uchar, cn> bilinearInterpolation_original( cv::Vec<uchar,cn> q11, cv::Vec<uchar,cn> q12, cv::Vec<uchar,cn> q21, cv::Vec<uchar,cn> q22, const float x1, const float x2, const float y1, const float y2, const float x, const float y)
{
    cv::Vec<uchar, cn> interpolateColor;
    float x2x1, y2y1, x2x, y2y, yy1, xx1;
    x2x1 = x2 - x1;
    y2y1 = y2 - y1;
    x2x = x2 - x;
    y2y = y2 - y;
    yy1 = y - y1;
    xx1 = x - x1;
    
    const float k = 1.f / (x2x1 * y2y1);

    //calculate the interpolation for all the chanes
    for (int i =0; i < cn; i++)
    {
        float interpolation = k * (
            q11[i] * x2x * y2y +
            q21[i] * xx1 * y2y +
            q12[i] * x2x * yy1 +
            q22[i] * xx1 * yy1
            );
        interpolateColor[i] = cvRound(interpolation);
    }
    return interpolateColor;
}

#include <immintrin.h>
#include <xmmintrin.h>

/**
    * Convert 8x32bits values into 8x8bits values. The last 64 bits are zero-ed.
    * If values are greather than 255, only low bytes are kept.
**/
__m128i _mm256_convert_epi32_epi8(__m256i _a)
{
    // Convert from 'int32' to 'int8'
    // Digit = Sample value with one byte length:
    // FROM: 1000|2000|3000|4000||||5000|6000|7000|8000     (BIG ENDIAN)
    // TO:   1234|0000|0000|0000||||5678|0000|0000|0000
    static const __m256i _mask = []() {
        std::array<uchar, 32> mask;
        std::fill(mask.begin(), mask.end(), -1);
        for (int i = 0; i < 4; i++) {
            mask[   i] = 4 * i;
            mask[16+i] = 4 * i;
        }

        return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(mask.data()));
    }();

    const __m256i _a_shuffled = _mm256_shuffle_epi8(_a, _mask);

    const __m128i _low = _mm256_extractf128_si256(_a_shuffled, 0);
    __m128i _high = _mm256_extractf128_si256(_a_shuffled, 1);
    _high = _mm_slli_epi64(_high, 32);
    const __m128i _or = _mm_or_si128(_low, _high);

    return _or;
}

template <int cn>
cv::Vec<uchar, cn> bilinearInterpolation_mat( cv::Vec<uchar,cn> q11, cv::Vec<uchar,cn> q12, cv::Vec<uchar,cn> q21, cv::Vec<uchar,cn> q22, const float x1, const float x2, const float y1, const float y2, const float x, const float y)
{
    alignas(16) std::array<float, 8> C;

    const __m256 A_vec = _mm256_set_ps(0, 0, x, y, y2, x2, y2, x2); // Arguments are in reverse
    const __m256 B_vec = _mm256_set_ps(0, 0, x1, y1, y, x, y1, x1);
    __m256 Sub = _mm256_sub_ps(A_vec, B_vec);
    _mm256_store_ps(C.data(), Sub);
    
    const float& x2x1 = C[0];
    const float& y2y1 = C[1];
    const float& x2x = C[2];
    const float& y2y = C[3];
    const float& yy1 = C[4];
    const float& xx1 = C[5];
   
    // Load each register as an array of 8x the same float32 value which is ks[i]
    const __m256 _k11 = _mm256_set1_ps(x2x * y2y);
    const __m256 _k21 = _mm256_set1_ps(xx1 * y2y);
    const __m256 _k12 = _mm256_set1_ps(x2x * yy1);
    const __m256 _k22 = _mm256_set1_ps(xx1 * yy1);
    const __m256 _k   = _mm256_set1_ps(1.0f / (x2x1 * y2y1)); // Global multiplier coefficient

    // Load 8 x 'uint8' (8 * 8 = 64bits) into the 64 low-bits of the register
    __m128i _q11_uint8 = _mm_loadu_si64(q11.val);

    // Convert from 'uint8' to 'int32'
    // Because conversions from integral types to floating types only work with 'int32'
    __m256i _q11_int32 = _mm256_cvtepu8_epi32(_q11_uint8);

    // Convert from 'int32' to 'float'
    __m256 _q11_float32 = _mm256_cvtepi32_ps(_q11_int32);

    // Same with q21, q12, and q22
    // ######
    __m128i _q21_uint8 = _mm_loadu_si64(q21.val);
    __m256i _q21_int32 = _mm256_cvtepu8_epi32(_q21_uint8);
    __m256 _q21_float32 = _mm256_cvtepi32_ps(_q21_int32);

    __m128i _q12_uint8 = _mm_loadu_si64(q12.val);
    __m256i _q12_int32 = _mm256_cvtepu8_epi32(_q12_uint8);
    __m256 _q12_float32 = _mm256_cvtepi32_ps(_q12_int32);

    __m128i _q22_uint8 = _mm_loadu_si64(q22.val);
    __m256i _q22_int32 = _mm256_cvtepu8_epi32(_q22_uint8);
    __m256 _q22_float32 = _mm256_cvtepi32_ps(_q22_int32);
    // ######

    __m256 _res = {};
    _res = _mm256_fmadd_ps(_k11, _q11_float32, _res);
    _res = _mm256_fmadd_ps(_k12, _q12_float32, _res);
    _res = _mm256_fmadd_ps(_k21, _q21_float32, _res);
    _res = _mm256_fmadd_ps(_k22, _q22_float32, _res);
    _res = _mm256_mul_ps(_res, _k);

    // Round to nearest int, suppress exceptions
    _res = _mm256_round_ps(_res, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));

    // Convert back from 'float32' to 'int32'
    __m256i _resi = _mm256_cvtps_epi32(_res);

    // Convert back from 'int32' to 'uint8'
    __m128i _trunc = _mm256_convert_epi32_epi8(_resi);

    // Convert the packed 'uint8' to cv::Vec
    cv::Vec<uchar, cn> interpolateColor;
    _mm_storel_epi64(reinterpret_cast<__m128i*>(interpolateColor.val), _trunc);
    return interpolateColor;
}

struct TestData
{
    cv::Vec<uchar, 8> q11;
    cv::Vec<uchar, 8> q12;
    cv::Vec<uchar, 8> q21;
    cv::Vec<uchar, 8> q22;
    float x1, x2, y1, y2, x, y;
};

const int N = 10'000'000;
std::vector<TestData> testData;

template<typename T>
void test(const char* title, T func)
{
    clock_t start, end;
    double cpu_time_used;
    start = clock();

    cv::Vec<uchar, 8> res;
    int aa(0);

    for(int i = 0; i < N; i++)
    {
        auto& d = testData[i];
        res = func(d.q11, d.q12, d.q21, d.q22, d.x1, d.x2, d.y1, d.y2, d.x, d.y);

#if PRINT
        for (int i = 0; i < 8; i++)
        {
            std::cout << (int)res[i] << ", ";
        }
        std::cout << std::endl;
#endif

        for(int i = 0; i < 8; i++) aa += res[i];
    }

    end = clock();
    cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;

    std::cout << aa << std::endl;
    std::cout << title << ": " << cpu_time_used << "s" << std::endl;
}


void test()
{
}

int  main()
{
    test();

    testData.reserve(N);
    for(int i = 0; i < N; i++)
    {
        TestData data;
        cv::randu(data.q11, 0, 255);
        cv::randu(data.q12, 0, 255);
        cv::randu(data.q21, 0, 255);
        cv::randu(data.q22, 0, 255);
        data.x1 = random();
        data.x2 = random();
        data.y1 = random();
        data.y2 = random();
        data.x = random();
        data.y = random();
        testData.push_back(data);
    }

    test("original", bilinearInterpolation_original<8>);
    test("mat", bilinearInterpolation_mat<8>);

    return EXIT_SUCCESS;
}

```

I have found that a bottleneck of the OpenCV application I use is the bilinear interpolation, so I have tried to optimize it. The bilinear interpolation is in 8D space, so each "color" is an 8 dimensional vector in [0,255].

I have also written a small benchmarking program.

My code is between 0% and 50% faster than the original one.

I would like to know of any other improvements I could make and also I would like to know if I missed any bugs. And is the performance increase general, and not just specific to my computer architecture?

#include <opencv2/opencv.hpp>
#include <cstdlib>
#include <iostream>
#include <array>
#include <time.h>
     
using std::cout;
using std::endl;

float random()
{
    return static_cast<float>(rand()) / RAND_MAX * 10.0f;
}

template <int cn>
cv::Vec<uchar, cn> bilinearInterpolation_original( cv::Vec<uchar,cn> q11, cv::Vec<uchar,cn> q12, cv::Vec<uchar,cn> q21, cv::Vec<uchar,cn> q22, const float x1, const float x2, const float y1, const float y2, const float x, const float y)
{
    cv::Vec<uchar, cn> interpolateColor;
    float x2x1, y2y1, x2x, y2y, yy1, xx1;
    x2x1 = x2 - x1;
    y2y1 = y2 - y1;
    x2x = x2 - x;
    y2y = y2 - y;
    yy1 = y - y1;
    xx1 = x - x1;
    
    const float k = 1.f / (x2x1 * y2y1);

    //calculate the interpolation for all the chanes
    for (int i =0; i < cn; i++)
    {
        float interpolation = k * (
            q11[i] * x2x * y2y +
            q21[i] * xx1 * y2y +
            q12[i] * x2x * yy1 +
            q22[i] * xx1 * yy1
            );
        interpolateColor[i] = cvRound(interpolation);
    }
    return interpolateColor;
}

#include <immintrin.h>
#include <xmmintrin.h>

/**
    * Convert 8x32bits values into 8x8bits values. The last 64 bits are zero-ed.
    * If values are greather than 255, only low bytes are kept.
**/
__m128i _mm256_convert_epi32_epi8(__m256i _a)
{
    // Convert from 'int32' to 'int8'
    // Digit = Sample value with one byte length:
    // FROM: 1000|2000|3000|4000||||5000|6000|7000|8000     (BIG ENDIAN)
    // TO:   1234|0000|0000|0000||||5678|0000|0000|0000
    static const __m256i _mask = []() {
        std::array<uchar, 32> mask;
        std::fill(mask.begin(), mask.end(), -1);
        for (int i = 0; i < 4; i++) {
            mask[   i] = 4 * i;
            mask[16+i] = 4 * i;
        }

        return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(mask.data()));
    }();

    const __m256i _a_shuffled = _mm256_shuffle_epi8(_a, _mask);

    const __m128i _low = _mm256_extractf128_si256(_a_shuffled, 0);
    __m128i _high = _mm256_extractf128_si256(_a_shuffled, 1);
    _high = _mm_slli_epi64(_high, 32);
    const __m128i _or = _mm_or_si128(_low, _high);

    return _or;
}

template <int cn>
cv::Vec<uchar, cn> bilinearInterpolation_mat( cv::Vec<uchar,cn> q11, cv::Vec<uchar,cn> q12, cv::Vec<uchar,cn> q21, cv::Vec<uchar,cn> q22, const float x1, const float x2, const float y1, const float y2, const float x, const float y)
{
    alignas(16) std::array<float, 8> C;

    const __m256 A_vec = _mm256_set_ps(0, 0, x, y, y2, x2, y2, x2); // Arguments are in reverse
    const __m256 B_vec = _mm256_set_ps(0, 0, x1, y1, y, x, y1, x1);
    __m256 Sub = _mm256_sub_ps(A_vec, B_vec);
    _mm256_store_ps(C.data(), Sub);
    
    const float& x2x1 = C[0];
    const float& y2y1 = C[1];
    const float& x2x = C[2];
    const float& y2y = C[3];
    const float& yy1 = C[4];
    const float& xx1 = C[5];
   
    // Load each register as an array of 8x the same float32 value which is ks[i]
    const __m256 _k11 = _mm256_set1_ps(x2x * y2y);
    const __m256 _k21 = _mm256_set1_ps(xx1 * y2y);
    const __m256 _k12 = _mm256_set1_ps(x2x * yy1);
    const __m256 _k22 = _mm256_set1_ps(xx1 * yy1);
    const __m256 _k   = _mm256_set1_ps(1.0f / (x2x1 * y2y1)); // Global multiplier coefficient

    // Load 8 x 'uint8' (8 * 8 = 64bits) into the 64 low-bits of the register
    __m128i _q11_uint8 = _mm_loadu_si64(q11.val);

    // Convert from 'uint8' to 'int32'
    // Because conversions from integral types to floating types only work with 'int32'
    __m256i _q11_int32 = _mm256_cvtepu8_epi32(_q11_uint8);

    // Convert from 'int32' to 'float'
    __m256 _q11_float32 = _mm256_cvtepi32_ps(_q11_int32);

    // Same with q21, q12, and q22
    // ######
    __m128i _q21_uint8 = _mm_loadu_si64(q21.val);
    __m256i _q21_int32 = _mm256_cvtepu8_epi32(_q21_uint8);
    __m256 _q21_float32 = _mm256_cvtepi32_ps(_q21_int32);

    __m128i _q12_uint8 = _mm_loadu_si64(q12.val);
    __m256i _q12_int32 = _mm256_cvtepu8_epi32(_q12_uint8);
    __m256 _q12_float32 = _mm256_cvtepi32_ps(_q12_int32);

    __m128i _q22_uint8 = _mm_loadu_si64(q22.val);
    __m256i _q22_int32 = _mm256_cvtepu8_epi32(_q22_uint8);
    __m256 _q22_float32 = _mm256_cvtepi32_ps(_q22_int32);
    // ######

    __m256 _res = {};
    _res = _mm256_fmadd_ps(_k11, _q11_float32, _res);
    _res = _mm256_fmadd_ps(_k12, _q12_float32, _res);
    _res = _mm256_fmadd_ps(_k21, _q21_float32, _res);
    _res = _mm256_fmadd_ps(_k22, _q22_float32, _res);
    _res = _mm256_mul_ps(_res, _k);

    // Round to nearest int, suppress exceptions
    _res = _mm256_round_ps(_res, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));

    // Convert back from 'float32' to 'int32'
    __m256i _resi = _mm256_cvtps_epi32(_res);

    // Convert back from 'int32' to 'uint8'
    __m128i _trunc = _mm256_convert_epi32_epi8(_resi);

    // Convert the packed 'uint8' to cv::Vec
    cv::Vec<uchar, cn> interpolateColor;
    _mm_storel_epi64(reinterpret_cast<__m128i*>(interpolateColor.val), _trunc);
    return interpolateColor;
}

struct TestData
{
    cv::Vec<uchar, 8> q11;
    cv::Vec<uchar, 8> q12;
    cv::Vec<uchar, 8> q21;
    cv::Vec<uchar, 8> q22;
    float x1, x2, y1, y2, x, y;
};

const int N = 10'000'000;
std::vector<TestData> testData;

template<typename T>
void test(const char* title, T func)
{
    clock_t start, end;
    double cpu_time_used;
    start = clock();

    cv::Vec<uchar, 8> res;
    int aa(0);

    for(int i = 0; i < N; i++)
    {
        auto& d = testData[i];
        res = func(d.q11, d.q12, d.q21, d.q22, d.x1, d.x2, d.y1, d.y2, d.x, d.y);

#if PRINT
        for (int i = 0; i < 8; i++)
        {
            std::cout << (int)res[i] << ", ";
        }
        std::cout << std::endl;
#endif

        for(int i = 0; i < 8; i++) aa += res[i];
    }

    end = clock();
    cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;

    std::cout << aa << std::endl;
    std::cout << title << ": " << cpu_time_used << "s" << std::endl;
}


void test()
{
}

int  main()
{
    test();

    testData.reserve(N);
    for(int i = 0; i < N; i++)
    {
        TestData data;
        cv::randu(data.q11, 0, 255);
        cv::randu(data.q12, 0, 255);
        cv::randu(data.q21, 0, 255);
        cv::randu(data.q22, 0, 255);
        data.x1 = random();
        data.x2 = random();
        data.y1 = random();
        data.y2 = random();
        data.x = random();
        data.y = random();
        testData.push_back(data);
    }

    test("original", bilinearInterpolation_original<8>);
    test("mat", bilinearInterpolation_mat<8>);

    return EXIT_SUCCESS;
}

deleted 2 characters in body
Source Link
rafoo
  • 335
  • 2
  • 7

I have also written a small benchmarking program. After long harbor, finally I achieved it! It is even faster around x2 timessometimes between 0% and 50% of improvement than the original one. I would know if there is others improvements possible too, and also I would like to know if there is any bugs. And if hopefelly, if the time improvements is not by pure chance on my computer architecture.

#include <opencv2/opencv.hpp>
#include <cstdlib>
#include <iostream>
#include <array>
#include <time.h>
     
using std::cout;
using std::endl;

float random()
{
    return static_cast<float>(rand()) / RAND_MAX * 10.0f;
}

template <int cn>
cv::Vec<uchar, cn> bilinearInterpolation_original( cv::Vec<uchar,cn> q11, cv::Vec<uchar,cn> q12, cv::Vec<uchar,cn> q21, cv::Vec<uchar,cn> q22, const float x1, const float x2, const float y1, const float y2, const float x, const float y)
{
    cv::Vec<uchar, cn> interpolateColor;
    float x2x1, y2y1, x2x, y2y, yy1, xx1;
    x2x1 = x2 - x1;
    y2y1 = y2 - y1;
    x2x = x2 - x;
    y2y = y2 - y;
    yy1 = y - y1;
    xx1 = x - x1;
    
    const float k = 1.f / (x2x1 * y2y1);

    //calculate the interpolation for all the chanes
    for (int i =0; i < cn; i++)
    {
        float interpolation = k * (
            q11[i] * x2x * y2y +
            q21[i] * xx1 * y2y +
            q12[i] * x2x * yy1 +
            q22[i] * xx1 * yy1
            );
        interpolateColor[i] = cvRound(interpolation);
    }
    return interpolateColor;
}

#include <immintrin.h>
#include <xmmintrin.h>

/**
    * Convert 8x32bits values into 8x8bits values. The last 64 bits are zero-ed.
    * If values are greather than 255, only low bytes are kept.
**/
__m128i _mm256_convert_epi32_epi8(__m256i _a)
{
    // Convert from 'int32' to 'int8'
    // Digit = Sample value with one byte length:
    // FROM: 1000|2000|3000|4000||||5000|6000|7000|8000     (BIG ENDIAN)
    // TO:   1234|0000|0000|0000||||5678|0000|0000|0000
    static const __m256i _mask = []() {
        std::array<uchar, 32> mask;
        std::fill(mask.begin(), mask.end(), -1);
        for (int i = 0; i < 4; i++) {
            mask[   i] = 4 * i;
            mask[16+i] = 4 * i;
        }

        return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(mask.data()));
    }();

    const __m256i _a_shuffled = _mm256_shuffle_epi8(_a, _mask);

    const __m128i _low = _mm256_extractf128_si256(_a_shuffled, 0);
    __m128i _high = _mm256_extractf128_si256(_a_shuffled, 1);
    _high = _mm_slli_epi64(_high, 32);
    const __m128i _or = _mm_or_si128(_low, _high);

    return _or;
}

template <int cn>
cv::Vec<uchar, cn> bilinearInterpolation_mat( cv::Vec<uchar,cn> q11, cv::Vec<uchar,cn> q12, cv::Vec<uchar,cn> q21, cv::Vec<uchar,cn> q22, const float x1, const float x2, const float y1, const float y2, const float x, const float y)
{
    alignas(16) std::array<float, 8> C;

    const __m256 A_vec = _mm256_set_ps(0, 0, x, y, y2, x2, y2, x2); // Arguments are in reverse
    const __m256 B_vec = _mm256_set_ps(0, 0, x1, y1, y, x, y1, x1);
    __m256 Sub = _mm256_sub_ps(A_vec, B_vec);
    _mm256_store_ps(C.data(), Sub);
    
    const float& x2x1 = C[0];
    const float& y2y1 = C[1];
    const float& x2x = C[2];
    const float& y2y = C[3];
    const float& yy1 = C[4];
    const float& xx1 = C[5];
   
    // Load each register as an array of 8x the same float32 value which is ks[i]
    const __m256 _k11 = _mm256_set1_ps(x2x * y2y);
    const __m256 _k21 = _mm256_set1_ps(xx1 * y2y);
    const __m256 _k12 = _mm256_set1_ps(x2x * yy1);
    const __m256 _k22 = _mm256_set1_ps(xx1 * yy1);
    const __m256 _k   = _mm256_set1_ps(1.0f / (x2x1 * y2y1)); // Global multiplier coefficient

    // Load 8 x 'uint8' (8 * 8 = 64bits) into the 64 low-bits of the register
    __m128i _q11_uint8 = _mm_loadu_si64(q11.val);

    // Convert from 'uint8' to 'int32'
    // Because conversions from integral types to floating types only work with 'int32'
    __m256i _q11_int32 = _mm256_cvtepu8_epi32(_q11_uint8);

    // Convert from 'int32' to 'float'
    __m256 _q11_float32 = _mm256_cvtepi32_ps(_q11_int32);

    // Same with q21, q12, and q22
    // ######
    __m128i _q21_uint8 = _mm_loadu_si64(q21.val);
    __m256i _q21_int32 = _mm256_cvtepu8_epi32(_q21_uint8);
    __m256 _q21_float32 = _mm256_cvtepi32_ps(_q21_int32);

    __m128i _q12_uint8 = _mm_loadu_si64(q12.val);
    __m256i _q12_int32 = _mm256_cvtepu8_epi32(_q12_uint8);
    __m256 _q12_float32 = _mm256_cvtepi32_ps(_q12_int32);

    __m128i _q22_uint8 = _mm_loadu_si64(q22.val);
    __m256i _q22_int32 = _mm256_cvtepu8_epi32(_q22_uint8);
    __m256 _q22_float32 = _mm256_cvtepi32_ps(_q22_int32);
    // ######

    __m256 _res = {};
    _res = _mm256_fmadd_ps(_k11, _q11_float32, _res);
    _res = _mm256_fmadd_ps(_k12, _q12_float32, _res);
    _res = _mm256_fmadd_ps(_k21, _q21_float32, _res);
    _res = _mm256_fmadd_ps(_k22, _q22_float32, _res);
    _res = _mm256_mul_ps(_res, _k);

    // Round to nearest int, suppress exceptions
    _res = _mm256_round_ps(_res, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));

    // Convert back from 'float32' to 'int32'
    __m256i _resi = _mm256_cvtps_epi32(_res);

    // Convert back from 'int32' to 'uint8'
    __m128i _trunc = _mm256_convert_epi32_epi8(_resi);

    // Convert the packed 'uint8' to cv::Vec
    cv::Vec<uchar, cn> interpolateColor;
    _mm_storel_epi64(reinterpret_cast<__m128i*>(interpolateColor.val), _trunc);
    return interpolateColor;
}

struct TestData
{
    cv::Vec<uchar, 8> q11;
    cv::Vec<uchar, 8> q12;
    cv::Vec<uchar, 8> q21;
    cv::Vec<uchar, 8> q22;
    float x1, x2, y1, y2, x, y;
};

const int N = 100'000;10'000'000;
std::vector<TestData> testData;

template<typename T>
void test(const char* title, T func)
{
    clock_t start, end;
    double cpu_time_used;
    start = clock();

    cv::Vec<uchar, 8> res;
    int aa(0);

    for(int c = 0; c < 100; c++)
    for(int i = 0; i < N; i++)
    {
        auto& d = testData[i];
        res = func(d.q11, d.q12, d.q21, d.q22, d.x1, d.x2, d.y1, d.y2, d.x, d.y);

#if PRINT
        for (int i = 0; i < 8; i++)
        {
            std::cout << (int)res[i] << ", ";
        }
        std::cout << std::endl;
#endif

        for(int i = 0; i < 8; i++) aa += res[i];
    }

    end = clock();
    cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;

    std::cout << aa << std::endl;
    std::cout << title << ": " << cpu_time_used << "s" << std::endl;
}


void test()
{
}

int  main()
{
    test();

    testData.reserve(N);
    for(int i = 0; i < N; i++)
    {
        TestData data;
        cv::randu(data.q11, 0, 255);
        cv::randu(data.q12, 0, 255);
        cv::randu(data.q21, 0, 255);
        cv::randu(data.q22, 0, 255);
        data.x1 = random();
        data.x2 = random();
        data.y1 = random();
        data.y2 = random();
        data.x = random();
        data.y = random();
        testData.push_back(data);
    }

    test("original", bilinearInterpolation_original<8>);
    test("mat", bilinearInterpolation_mat<8>);

    return EXIT_SUCCESS;
}

```

I have also written a small benchmarking program. After long harbor, finally I achieved it! It is even faster around x2 times than the original one. I would know if there is others improvements possible too, and also I would like to know if there is any bugs. And if hopefelly, if the time improvements is not by pure chance on my computer architecture.

#include <opencv2/opencv.hpp>
#include <cstdlib>
#include <iostream>
#include <array>
#include <time.h>
     
using std::cout;
using std::endl;

float random()
{
    return static_cast<float>(rand()) / RAND_MAX * 10.0f;
}

template <int cn>
cv::Vec<uchar, cn> bilinearInterpolation_original( cv::Vec<uchar,cn> q11, cv::Vec<uchar,cn> q12, cv::Vec<uchar,cn> q21, cv::Vec<uchar,cn> q22, const float x1, const float x2, const float y1, const float y2, const float x, const float y)
{
    cv::Vec<uchar, cn> interpolateColor;
    float x2x1, y2y1, x2x, y2y, yy1, xx1;
    x2x1 = x2 - x1;
    y2y1 = y2 - y1;
    x2x = x2 - x;
    y2y = y2 - y;
    yy1 = y - y1;
    xx1 = x - x1;
    
    const float k = 1.f / (x2x1 * y2y1);

    //calculate the interpolation for all the chanes
    for (int i =0; i < cn; i++)
    {
        float interpolation = k * (
            q11[i] * x2x * y2y +
            q21[i] * xx1 * y2y +
            q12[i] * x2x * yy1 +
            q22[i] * xx1 * yy1
            );
        interpolateColor[i] = cvRound(interpolation);
    }
    return interpolateColor;
}

#include <immintrin.h>
#include <xmmintrin.h>

/**
    * Convert 8x32bits values into 8x8bits values. The last 64 bits are zero-ed.
    * If values are greather than 255, only low bytes are kept.
**/
__m128i _mm256_convert_epi32_epi8(__m256i _a)
{
    // Convert from 'int32' to 'int8'
    // Digit = Sample value with one byte length:
    // FROM: 1000|2000|3000|4000||||5000|6000|7000|8000     (BIG ENDIAN)
    // TO:   1234|0000|0000|0000||||5678|0000|0000|0000
    static const __m256i _mask = []() {
        std::array<uchar, 32> mask;
        std::fill(mask.begin(), mask.end(), -1);
        for (int i = 0; i < 4; i++) {
            mask[   i] = 4 * i;
            mask[16+i] = 4 * i;
        }

        return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(mask.data()));
    }();

    const __m256i _a_shuffled = _mm256_shuffle_epi8(_a, _mask);

    const __m128i _low = _mm256_extractf128_si256(_a_shuffled, 0);
    __m128i _high = _mm256_extractf128_si256(_a_shuffled, 1);
    _high = _mm_slli_epi64(_high, 32);
    const __m128i _or = _mm_or_si128(_low, _high);

    return _or;
}

template <int cn>
cv::Vec<uchar, cn> bilinearInterpolation_mat( cv::Vec<uchar,cn> q11, cv::Vec<uchar,cn> q12, cv::Vec<uchar,cn> q21, cv::Vec<uchar,cn> q22, const float x1, const float x2, const float y1, const float y2, const float x, const float y)
{
    alignas(16) std::array<float, 8> C;

    const __m256 A_vec = _mm256_set_ps(0, 0, x, y, y2, x2, y2, x2); // Arguments are in reverse
    const __m256 B_vec = _mm256_set_ps(0, 0, x1, y1, y, x, y1, x1);
    __m256 Sub = _mm256_sub_ps(A_vec, B_vec);
    _mm256_store_ps(C.data(), Sub);
    
    const float& x2x1 = C[0];
    const float& y2y1 = C[1];
    const float& x2x = C[2];
    const float& y2y = C[3];
    const float& yy1 = C[4];
    const float& xx1 = C[5];
   
    // Load each register as an array of 8x the same float32 value which is ks[i]
    const __m256 _k11 = _mm256_set1_ps(x2x * y2y);
    const __m256 _k21 = _mm256_set1_ps(xx1 * y2y);
    const __m256 _k12 = _mm256_set1_ps(x2x * yy1);
    const __m256 _k22 = _mm256_set1_ps(xx1 * yy1);
    const __m256 _k   = _mm256_set1_ps(1.0f / (x2x1 * y2y1)); // Global multiplier coefficient

    // Load 8 x 'uint8' (8 * 8 = 64bits) into the 64 low-bits of the register
    __m128i _q11_uint8 = _mm_loadu_si64(q11.val);

    // Convert from 'uint8' to 'int32'
    // Because conversions from integral types to floating types only work with 'int32'
    __m256i _q11_int32 = _mm256_cvtepu8_epi32(_q11_uint8);

    // Convert from 'int32' to 'float'
    __m256 _q11_float32 = _mm256_cvtepi32_ps(_q11_int32);

    // Same with q21, q12, and q22
    // ######
    __m128i _q21_uint8 = _mm_loadu_si64(q21.val);
    __m256i _q21_int32 = _mm256_cvtepu8_epi32(_q21_uint8);
    __m256 _q21_float32 = _mm256_cvtepi32_ps(_q21_int32);

    __m128i _q12_uint8 = _mm_loadu_si64(q12.val);
    __m256i _q12_int32 = _mm256_cvtepu8_epi32(_q12_uint8);
    __m256 _q12_float32 = _mm256_cvtepi32_ps(_q12_int32);

    __m128i _q22_uint8 = _mm_loadu_si64(q22.val);
    __m256i _q22_int32 = _mm256_cvtepu8_epi32(_q22_uint8);
    __m256 _q22_float32 = _mm256_cvtepi32_ps(_q22_int32);
    // ######

    __m256 _res = {};
    _res = _mm256_fmadd_ps(_k11, _q11_float32, _res);
    _res = _mm256_fmadd_ps(_k12, _q12_float32, _res);
    _res = _mm256_fmadd_ps(_k21, _q21_float32, _res);
    _res = _mm256_fmadd_ps(_k22, _q22_float32, _res);
    _res = _mm256_mul_ps(_res, _k);

    // Round to nearest int, suppress exceptions
    _res = _mm256_round_ps(_res, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));

    // Convert back from 'float32' to 'int32'
    __m256i _resi = _mm256_cvtps_epi32(_res);

    // Convert back from 'int32' to 'uint8'
    __m128i _trunc = _mm256_convert_epi32_epi8(_resi);

    // Convert the packed 'uint8' to cv::Vec
    cv::Vec<uchar, cn> interpolateColor;
    _mm_storel_epi64(reinterpret_cast<__m128i*>(interpolateColor.val), _trunc);
    return interpolateColor;
}

struct TestData
{
    cv::Vec<uchar, 8> q11;
    cv::Vec<uchar, 8> q12;
    cv::Vec<uchar, 8> q21;
    cv::Vec<uchar, 8> q22;
    float x1, x2, y1, y2, x, y;
};

const int N = 100'000;
std::vector<TestData> testData;

template<typename T>
void test(const char* title, T func)
{
    clock_t start, end;
    double cpu_time_used;
    start = clock();

    cv::Vec<uchar, 8> res;
    int aa(0);

    for(int c = 0; c < 100; c++)
    for(int i = 0; i < N; i++)
    {
        auto& d = testData[i];
        res = func(d.q11, d.q12, d.q21, d.q22, d.x1, d.x2, d.y1, d.y2, d.x, d.y);

#if PRINT
        for (int i = 0; i < 8; i++)
        {
            std::cout << (int)res[i] << ", ";
        }
        std::cout << std::endl;
#endif

        for(int i = 0; i < 8; i++) aa += res[i];
    }

    end = clock();
    cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;

    std::cout << aa << std::endl;
    std::cout << title << ": " << cpu_time_used << "s" << std::endl;
}


void test()
{
}

int  main()
{
    test();

    testData.reserve(N);
    for(int i = 0; i < N; i++)
    {
        TestData data;
        cv::randu(data.q11, 0, 255);
        cv::randu(data.q12, 0, 255);
        cv::randu(data.q21, 0, 255);
        cv::randu(data.q22, 0, 255);
        data.x1 = random();
        data.x2 = random();
        data.y1 = random();
        data.y2 = random();
        data.x = random();
        data.y = random();
        testData.push_back(data);
    }

    test("original", bilinearInterpolation_original<8>);
    test("mat", bilinearInterpolation_mat<8>);

    return EXIT_SUCCESS;
}

```

I have also written a small benchmarking program. After long harbor, finally I achieved it! It is even faster sometimes between 0% and 50% of improvement than the original one. I would know if there is others improvements possible too, and also I would like to know if there is any bugs. And if hopefelly, if the time improvements is not by pure chance on my computer architecture.

#include <opencv2/opencv.hpp>
#include <cstdlib>
#include <iostream>
#include <array>
#include <time.h>
     
using std::cout;
using std::endl;

float random()
{
    return static_cast<float>(rand()) / RAND_MAX * 10.0f;
}

template <int cn>
cv::Vec<uchar, cn> bilinearInterpolation_original( cv::Vec<uchar,cn> q11, cv::Vec<uchar,cn> q12, cv::Vec<uchar,cn> q21, cv::Vec<uchar,cn> q22, const float x1, const float x2, const float y1, const float y2, const float x, const float y)
{
    cv::Vec<uchar, cn> interpolateColor;
    float x2x1, y2y1, x2x, y2y, yy1, xx1;
    x2x1 = x2 - x1;
    y2y1 = y2 - y1;
    x2x = x2 - x;
    y2y = y2 - y;
    yy1 = y - y1;
    xx1 = x - x1;
    
    const float k = 1.f / (x2x1 * y2y1);

    //calculate the interpolation for all the chanes
    for (int i =0; i < cn; i++)
    {
        float interpolation = k * (
            q11[i] * x2x * y2y +
            q21[i] * xx1 * y2y +
            q12[i] * x2x * yy1 +
            q22[i] * xx1 * yy1
            );
        interpolateColor[i] = cvRound(interpolation);
    }
    return interpolateColor;
}

#include <immintrin.h>
#include <xmmintrin.h>

/**
    * Convert 8x32bits values into 8x8bits values. The last 64 bits are zero-ed.
    * If values are greather than 255, only low bytes are kept.
**/
__m128i _mm256_convert_epi32_epi8(__m256i _a)
{
    // Convert from 'int32' to 'int8'
    // Digit = Sample value with one byte length:
    // FROM: 1000|2000|3000|4000||||5000|6000|7000|8000     (BIG ENDIAN)
    // TO:   1234|0000|0000|0000||||5678|0000|0000|0000
    static const __m256i _mask = []() {
        std::array<uchar, 32> mask;
        std::fill(mask.begin(), mask.end(), -1);
        for (int i = 0; i < 4; i++) {
            mask[   i] = 4 * i;
            mask[16+i] = 4 * i;
        }

        return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(mask.data()));
    }();

    const __m256i _a_shuffled = _mm256_shuffle_epi8(_a, _mask);

    const __m128i _low = _mm256_extractf128_si256(_a_shuffled, 0);
    __m128i _high = _mm256_extractf128_si256(_a_shuffled, 1);
    _high = _mm_slli_epi64(_high, 32);
    const __m128i _or = _mm_or_si128(_low, _high);

    return _or;
}

template <int cn>
cv::Vec<uchar, cn> bilinearInterpolation_mat( cv::Vec<uchar,cn> q11, cv::Vec<uchar,cn> q12, cv::Vec<uchar,cn> q21, cv::Vec<uchar,cn> q22, const float x1, const float x2, const float y1, const float y2, const float x, const float y)
{
    alignas(16) std::array<float, 8> C;

    const __m256 A_vec = _mm256_set_ps(0, 0, x, y, y2, x2, y2, x2); // Arguments are in reverse
    const __m256 B_vec = _mm256_set_ps(0, 0, x1, y1, y, x, y1, x1);
    __m256 Sub = _mm256_sub_ps(A_vec, B_vec);
    _mm256_store_ps(C.data(), Sub);
    
    const float& x2x1 = C[0];
    const float& y2y1 = C[1];
    const float& x2x = C[2];
    const float& y2y = C[3];
    const float& yy1 = C[4];
    const float& xx1 = C[5];
   
    // Load each register as an array of 8x the same float32 value which is ks[i]
    const __m256 _k11 = _mm256_set1_ps(x2x * y2y);
    const __m256 _k21 = _mm256_set1_ps(xx1 * y2y);
    const __m256 _k12 = _mm256_set1_ps(x2x * yy1);
    const __m256 _k22 = _mm256_set1_ps(xx1 * yy1);
    const __m256 _k   = _mm256_set1_ps(1.0f / (x2x1 * y2y1)); // Global multiplier coefficient

    // Load 8 x 'uint8' (8 * 8 = 64bits) into the 64 low-bits of the register
    __m128i _q11_uint8 = _mm_loadu_si64(q11.val);

    // Convert from 'uint8' to 'int32'
    // Because conversions from integral types to floating types only work with 'int32'
    __m256i _q11_int32 = _mm256_cvtepu8_epi32(_q11_uint8);

    // Convert from 'int32' to 'float'
    __m256 _q11_float32 = _mm256_cvtepi32_ps(_q11_int32);

    // Same with q21, q12, and q22
    // ######
    __m128i _q21_uint8 = _mm_loadu_si64(q21.val);
    __m256i _q21_int32 = _mm256_cvtepu8_epi32(_q21_uint8);
    __m256 _q21_float32 = _mm256_cvtepi32_ps(_q21_int32);

    __m128i _q12_uint8 = _mm_loadu_si64(q12.val);
    __m256i _q12_int32 = _mm256_cvtepu8_epi32(_q12_uint8);
    __m256 _q12_float32 = _mm256_cvtepi32_ps(_q12_int32);

    __m128i _q22_uint8 = _mm_loadu_si64(q22.val);
    __m256i _q22_int32 = _mm256_cvtepu8_epi32(_q22_uint8);
    __m256 _q22_float32 = _mm256_cvtepi32_ps(_q22_int32);
    // ######

    __m256 _res = {};
    _res = _mm256_fmadd_ps(_k11, _q11_float32, _res);
    _res = _mm256_fmadd_ps(_k12, _q12_float32, _res);
    _res = _mm256_fmadd_ps(_k21, _q21_float32, _res);
    _res = _mm256_fmadd_ps(_k22, _q22_float32, _res);
    _res = _mm256_mul_ps(_res, _k);

    // Round to nearest int, suppress exceptions
    _res = _mm256_round_ps(_res, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));

    // Convert back from 'float32' to 'int32'
    __m256i _resi = _mm256_cvtps_epi32(_res);

    // Convert back from 'int32' to 'uint8'
    __m128i _trunc = _mm256_convert_epi32_epi8(_resi);

    // Convert the packed 'uint8' to cv::Vec
    cv::Vec<uchar, cn> interpolateColor;
    _mm_storel_epi64(reinterpret_cast<__m128i*>(interpolateColor.val), _trunc);
    return interpolateColor;
}

struct TestData
{
    cv::Vec<uchar, 8> q11;
    cv::Vec<uchar, 8> q12;
    cv::Vec<uchar, 8> q21;
    cv::Vec<uchar, 8> q22;
    float x1, x2, y1, y2, x, y;
};

const int N = 10'000'000;
std::vector<TestData> testData;

template<typename T>
void test(const char* title, T func)
{
    clock_t start, end;
    double cpu_time_used;
    start = clock();

    cv::Vec<uchar, 8> res;
    int aa(0);

    for(int i = 0; i < N; i++)
    {
        auto& d = testData[i];
        res = func(d.q11, d.q12, d.q21, d.q22, d.x1, d.x2, d.y1, d.y2, d.x, d.y);

#if PRINT
        for (int i = 0; i < 8; i++)
        {
            std::cout << (int)res[i] << ", ";
        }
        std::cout << std::endl;
#endif

        for(int i = 0; i < 8; i++) aa += res[i];
    }

    end = clock();
    cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;

    std::cout << aa << std::endl;
    std::cout << title << ": " << cpu_time_used << "s" << std::endl;
}


void test()
{
}

int  main()
{
    test();

    testData.reserve(N);
    for(int i = 0; i < N; i++)
    {
        TestData data;
        cv::randu(data.q11, 0, 255);
        cv::randu(data.q12, 0, 255);
        cv::randu(data.q21, 0, 255);
        cv::randu(data.q22, 0, 255);
        data.x1 = random();
        data.x2 = random();
        data.y1 = random();
        data.y2 = random();
        data.x = random();
        data.y = random();
        testData.push_back(data);
    }

    test("original", bilinearInterpolation_original<8>);
    test("mat", bilinearInterpolation_mat<8>);

    return EXIT_SUCCESS;
}

```
Source Link
rafoo
  • 335
  • 2
  • 7

Bilinear interpolation optimized using intrinsics

I have found that a bottleneck of the OpenCV application I use is the bilinear interpolation, so I have tried to optimize it. The bilinear interpolation is in 8D space, so each "color" is an 8 dimensional vector in [0;255].

I have also written a small benchmarking program. After long harbor, finally I achieved it! It is even faster around x2 times than the original one. I would know if there is others improvements possible too, and also I would like to know if there is any bugs. And if hopefelly, if the time improvements is not by pure chance on my computer architecture.

#include <opencv2/opencv.hpp>
#include <cstdlib>
#include <iostream>
#include <array>
#include <time.h>
     
using std::cout;
using std::endl;

float random()
{
    return static_cast<float>(rand()) / RAND_MAX * 10.0f;
}

template <int cn>
cv::Vec<uchar, cn> bilinearInterpolation_original( cv::Vec<uchar,cn> q11, cv::Vec<uchar,cn> q12, cv::Vec<uchar,cn> q21, cv::Vec<uchar,cn> q22, const float x1, const float x2, const float y1, const float y2, const float x, const float y)
{
    cv::Vec<uchar, cn> interpolateColor;
    float x2x1, y2y1, x2x, y2y, yy1, xx1;
    x2x1 = x2 - x1;
    y2y1 = y2 - y1;
    x2x = x2 - x;
    y2y = y2 - y;
    yy1 = y - y1;
    xx1 = x - x1;
    
    const float k = 1.f / (x2x1 * y2y1);

    //calculate the interpolation for all the chanes
    for (int i =0; i < cn; i++)
    {
        float interpolation = k * (
            q11[i] * x2x * y2y +
            q21[i] * xx1 * y2y +
            q12[i] * x2x * yy1 +
            q22[i] * xx1 * yy1
            );
        interpolateColor[i] = cvRound(interpolation);
    }
    return interpolateColor;
}

#include <immintrin.h>
#include <xmmintrin.h>

/**
    * Convert 8x32bits values into 8x8bits values. The last 64 bits are zero-ed.
    * If values are greather than 255, only low bytes are kept.
**/
__m128i _mm256_convert_epi32_epi8(__m256i _a)
{
    // Convert from 'int32' to 'int8'
    // Digit = Sample value with one byte length:
    // FROM: 1000|2000|3000|4000||||5000|6000|7000|8000     (BIG ENDIAN)
    // TO:   1234|0000|0000|0000||||5678|0000|0000|0000
    static const __m256i _mask = []() {
        std::array<uchar, 32> mask;
        std::fill(mask.begin(), mask.end(), -1);
        for (int i = 0; i < 4; i++) {
            mask[   i] = 4 * i;
            mask[16+i] = 4 * i;
        }

        return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(mask.data()));
    }();

    const __m256i _a_shuffled = _mm256_shuffle_epi8(_a, _mask);

    const __m128i _low = _mm256_extractf128_si256(_a_shuffled, 0);
    __m128i _high = _mm256_extractf128_si256(_a_shuffled, 1);
    _high = _mm_slli_epi64(_high, 32);
    const __m128i _or = _mm_or_si128(_low, _high);

    return _or;
}

template <int cn>
cv::Vec<uchar, cn> bilinearInterpolation_mat( cv::Vec<uchar,cn> q11, cv::Vec<uchar,cn> q12, cv::Vec<uchar,cn> q21, cv::Vec<uchar,cn> q22, const float x1, const float x2, const float y1, const float y2, const float x, const float y)
{
    alignas(16) std::array<float, 8> C;

    const __m256 A_vec = _mm256_set_ps(0, 0, x, y, y2, x2, y2, x2); // Arguments are in reverse
    const __m256 B_vec = _mm256_set_ps(0, 0, x1, y1, y, x, y1, x1);
    __m256 Sub = _mm256_sub_ps(A_vec, B_vec);
    _mm256_store_ps(C.data(), Sub);
    
    const float& x2x1 = C[0];
    const float& y2y1 = C[1];
    const float& x2x = C[2];
    const float& y2y = C[3];
    const float& yy1 = C[4];
    const float& xx1 = C[5];
   
    // Load each register as an array of 8x the same float32 value which is ks[i]
    const __m256 _k11 = _mm256_set1_ps(x2x * y2y);
    const __m256 _k21 = _mm256_set1_ps(xx1 * y2y);
    const __m256 _k12 = _mm256_set1_ps(x2x * yy1);
    const __m256 _k22 = _mm256_set1_ps(xx1 * yy1);
    const __m256 _k   = _mm256_set1_ps(1.0f / (x2x1 * y2y1)); // Global multiplier coefficient

    // Load 8 x 'uint8' (8 * 8 = 64bits) into the 64 low-bits of the register
    __m128i _q11_uint8 = _mm_loadu_si64(q11.val);

    // Convert from 'uint8' to 'int32'
    // Because conversions from integral types to floating types only work with 'int32'
    __m256i _q11_int32 = _mm256_cvtepu8_epi32(_q11_uint8);

    // Convert from 'int32' to 'float'
    __m256 _q11_float32 = _mm256_cvtepi32_ps(_q11_int32);

    // Same with q21, q12, and q22
    // ######
    __m128i _q21_uint8 = _mm_loadu_si64(q21.val);
    __m256i _q21_int32 = _mm256_cvtepu8_epi32(_q21_uint8);
    __m256 _q21_float32 = _mm256_cvtepi32_ps(_q21_int32);

    __m128i _q12_uint8 = _mm_loadu_si64(q12.val);
    __m256i _q12_int32 = _mm256_cvtepu8_epi32(_q12_uint8);
    __m256 _q12_float32 = _mm256_cvtepi32_ps(_q12_int32);

    __m128i _q22_uint8 = _mm_loadu_si64(q22.val);
    __m256i _q22_int32 = _mm256_cvtepu8_epi32(_q22_uint8);
    __m256 _q22_float32 = _mm256_cvtepi32_ps(_q22_int32);
    // ######

    __m256 _res = {};
    _res = _mm256_fmadd_ps(_k11, _q11_float32, _res);
    _res = _mm256_fmadd_ps(_k12, _q12_float32, _res);
    _res = _mm256_fmadd_ps(_k21, _q21_float32, _res);
    _res = _mm256_fmadd_ps(_k22, _q22_float32, _res);
    _res = _mm256_mul_ps(_res, _k);

    // Round to nearest int, suppress exceptions
    _res = _mm256_round_ps(_res, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));

    // Convert back from 'float32' to 'int32'
    __m256i _resi = _mm256_cvtps_epi32(_res);

    // Convert back from 'int32' to 'uint8'
    __m128i _trunc = _mm256_convert_epi32_epi8(_resi);

    // Convert the packed 'uint8' to cv::Vec
    cv::Vec<uchar, cn> interpolateColor;
    _mm_storel_epi64(reinterpret_cast<__m128i*>(interpolateColor.val), _trunc);
    return interpolateColor;
}

struct TestData
{
    cv::Vec<uchar, 8> q11;
    cv::Vec<uchar, 8> q12;
    cv::Vec<uchar, 8> q21;
    cv::Vec<uchar, 8> q22;
    float x1, x2, y1, y2, x, y;
};

const int N = 100'000;
std::vector<TestData> testData;

template<typename T>
void test(const char* title, T func)
{
    clock_t start, end;
    double cpu_time_used;
    start = clock();

    cv::Vec<uchar, 8> res;
    int aa(0);

    for(int c = 0; c < 100; c++)
    for(int i = 0; i < N; i++)
    {
        auto& d = testData[i];
        res = func(d.q11, d.q12, d.q21, d.q22, d.x1, d.x2, d.y1, d.y2, d.x, d.y);

#if PRINT
        for (int i = 0; i < 8; i++)
        {
            std::cout << (int)res[i] << ", ";
        }
        std::cout << std::endl;
#endif

        for(int i = 0; i < 8; i++) aa += res[i];
    }

    end = clock();
    cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;

    std::cout << aa << std::endl;
    std::cout << title << ": " << cpu_time_used << "s" << std::endl;
}


void test()
{
}

int  main()
{
    test();

    testData.reserve(N);
    for(int i = 0; i < N; i++)
    {
        TestData data;
        cv::randu(data.q11, 0, 255);
        cv::randu(data.q12, 0, 255);
        cv::randu(data.q21, 0, 255);
        cv::randu(data.q22, 0, 255);
        data.x1 = random();
        data.x2 = random();
        data.y1 = random();
        data.y2 = random();
        data.x = random();
        data.y = random();
        testData.push_back(data);
    }

    test("original", bilinearInterpolation_original<8>);
    test("mat", bilinearInterpolation_mat<8>);

    return EXIT_SUCCESS;
}

```