I have found that a bottleneck of the OpenCV application I use is the bilinear interpolation, so I have tried to optimize it. The interpolation operates in an 8D space: each "color" is an 8-dimensional vector with components in [0, 255].
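For reference, each of the eight channels is interpolated independently with the usual bilinear formula (this is what both implementations below compute per channel):

$$f(x, y) = \frac{q_{11}(x_2 - x)(y_2 - y) + q_{21}(x - x_1)(y_2 - y) + q_{12}(x_2 - x)(y - y_1) + q_{22}(x - x_1)(y - y_1)}{(x_2 - x_1)(y_2 - y_1)}$$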
I have also written a small benchmarking program.
My code is between 0% and 50% faster than the original one.
I would like to know what other improvements I could make, and whether I have missed any bugs. Also, is the performance increase general, or is it specific to my computer's architecture?
#include <opencv2/opencv.hpp>
#include <algorithm>
#include <array>
#include <cstdlib>
#include <iostream>
#include <vector>
#include <time.h>
using std::cout;
using std::endl;
float random()
{
    return static_cast<float>(rand()) / RAND_MAX * 10.0f;
}
template <int cn>
cv::Vec<uchar, cn> bilinearInterpolation_original(cv::Vec<uchar, cn> q11, cv::Vec<uchar, cn> q12,
                                                  cv::Vec<uchar, cn> q21, cv::Vec<uchar, cn> q22,
                                                  const float x1, const float x2,
                                                  const float y1, const float y2,
                                                  const float x, const float y)
{
    cv::Vec<uchar, cn> interpolateColor;
    float x2x1, y2y1, x2x, y2y, yy1, xx1;
    x2x1 = x2 - x1;
    y2y1 = y2 - y1;
    x2x = x2 - x;
    y2y = y2 - y;
    yy1 = y - y1;
    xx1 = x - x1;
    const float k = 1.f / (x2x1 * y2y1);
    // calculate the interpolation for all the channels
    for (int i = 0; i < cn; i++)
    {
        float interpolation = k * (
            q11[i] * x2x * y2y +
            q21[i] * xx1 * y2y +
            q12[i] * x2x * yy1 +
            q22[i] * xx1 * yy1
        );
        interpolateColor[i] = cvRound(interpolation);
    }
    return interpolateColor;
}
#include <immintrin.h>
#include <xmmintrin.h>
/**
 * Convert 8x 32-bit values into 8x 8-bit values. The upper 64 bits of the result are zeroed.
 * If a value is greater than 255, only its low byte is kept.
 **/
__m128i _mm256_convert_epi32_epi8(__m256i _a)
{
    // Convert from 'int32' to 'int8'
    // Digit = Sample value with one byte length:
    // FROM: 1000|2000|3000|4000||||5000|6000|7000|8000 (BIG ENDIAN)
    // TO:   1234|0000|0000|0000||||5678|0000|0000|0000
    static const __m256i _mask = []() {
        std::array<uchar, 32> mask;
        std::fill(mask.begin(), mask.end(), -1);
        for (int i = 0; i < 4; i++) {
            mask[i]      = 4 * i;
            mask[16 + i] = 4 * i;
        }
        return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(mask.data()));
    }();
    const __m256i _a_shuffled = _mm256_shuffle_epi8(_a, _mask);
    const __m128i _low = _mm256_extractf128_si256(_a_shuffled, 0);
    __m128i _high      = _mm256_extractf128_si256(_a_shuffled, 1);
    _high = _mm_slli_epi64(_high, 32);
    const __m128i _or = _mm_or_si128(_low, _high);
    return _or;
}
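// Side note (not part of the original code): the same 8x 'int32' -> 8x 'uint8'
// narrowing could also be done with the pack instructions, which saturate to
// [0, 255] instead of keeping only the low byte. A minimal sketch, with a
// hypothetical name:
__m128i convert_epi32_epi8_pack(__m256i _a)
{
    const __m128i _lo = _mm256_castsi256_si128(_a);       // elements 0..3
    const __m128i _hi = _mm256_extracti128_si256(_a, 1);  // elements 4..7
    const __m128i _w  = _mm_packs_epi32(_lo, _hi);        // 8 x int16 (signed saturation)
    return _mm_packus_epi16(_w, _mm_setzero_si128());     // 8 x uint8 in the low 64 bits
}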
template <int cn>
cv::Vec<uchar, cn> bilinearInterpolation_mat(cv::Vec<uchar, cn> q11, cv::Vec<uchar, cn> q12,
                                             cv::Vec<uchar, cn> q21, cv::Vec<uchar, cn> q22,
                                             const float x1, const float x2,
                                             const float y1, const float y2,
                                             const float x, const float y)
{
    alignas(32) std::array<float, 8> C; // _mm256_store_ps requires 32-byte alignment
    const __m256 A_vec = _mm256_set_ps(0, 0, x, y, y2, x2, y2, x2); // Arguments are in reverse order (highest element first)
    const __m256 B_vec = _mm256_set_ps(0, 0, x1, y1, y, x, y1, x1);
    __m256 Sub = _mm256_sub_ps(A_vec, B_vec);
    _mm256_store_ps(C.data(), Sub);
    const float& x2x1 = C[0];
    const float& y2y1 = C[1];
    const float& x2x  = C[2];
    const float& y2y  = C[3];
    const float& yy1  = C[4];
    const float& xx1  = C[5];
    // Load each register as an array of 8x the same float32 value, which is ks[i]
    const __m256 _k11 = _mm256_set1_ps(x2x * y2y);
    const __m256 _k21 = _mm256_set1_ps(xx1 * y2y);
    const __m256 _k12 = _mm256_set1_ps(x2x * yy1);
    const __m256 _k22 = _mm256_set1_ps(xx1 * yy1);
    const __m256 _k   = _mm256_set1_ps(1.0f / (x2x1 * y2y1)); // Global multiplier coefficient

    // Load 8 x 'uint8' (8 * 8 = 64 bits) into the 64 low bits of the register
    __m128i _q11_uint8 = _mm_loadu_si64(q11.val);
    // Convert from 'uint8' to 'int32',
    // because conversions from integral types to floating types only work with 'int32'
    __m256i _q11_int32 = _mm256_cvtepu8_epi32(_q11_uint8);
    // Convert from 'int32' to 'float'
    __m256 _q11_float32 = _mm256_cvtepi32_ps(_q11_int32);

    // Same with q21, q12, and q22
    // ######
    __m128i _q21_uint8 = _mm_loadu_si64(q21.val);
    __m256i _q21_int32 = _mm256_cvtepu8_epi32(_q21_uint8);
    __m256 _q21_float32 = _mm256_cvtepi32_ps(_q21_int32);

    __m128i _q12_uint8 = _mm_loadu_si64(q12.val);
    __m256i _q12_int32 = _mm256_cvtepu8_epi32(_q12_uint8);
    __m256 _q12_float32 = _mm256_cvtepi32_ps(_q12_int32);

    __m128i _q22_uint8 = _mm_loadu_si64(q22.val);
    __m256i _q22_int32 = _mm256_cvtepu8_epi32(_q22_uint8);
    __m256 _q22_float32 = _mm256_cvtepi32_ps(_q22_int32);
    // ######

    __m256 _res = {};
    _res = _mm256_fmadd_ps(_k11, _q11_float32, _res);
    _res = _mm256_fmadd_ps(_k12, _q12_float32, _res);
    _res = _mm256_fmadd_ps(_k21, _q21_float32, _res);
    _res = _mm256_fmadd_ps(_k22, _q22_float32, _res);
    _res = _mm256_mul_ps(_res, _k);
    // Round to nearest int, suppress exceptions
    _res = _mm256_round_ps(_res, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
    // Convert back from 'float32' to 'int32'
    __m256i _resi = _mm256_cvtps_epi32(_res);
    // Convert back from 'int32' to 'uint8'
    __m128i _trunc = _mm256_convert_epi32_epi8(_resi);
    // Store the packed 'uint8' values into the cv::Vec
    cv::Vec<uchar, cn> interpolateColor;
    _mm_storel_epi64(reinterpret_cast<__m128i*>(interpolateColor.val), _trunc);
    return interpolateColor;
}
struct TestData
{
    cv::Vec<uchar, 8> q11;
    cv::Vec<uchar, 8> q12;
    cv::Vec<uchar, 8> q21;
    cv::Vec<uchar, 8> q22;
    float x1, x2, y1, y2, x, y;
};
const int N = 10'000'000;
std::vector<TestData> testData;
template<typename T>
void test(const char* title, T func)
{
    clock_t start, end;
    double cpu_time_used;
    start = clock();
    cv::Vec<uchar, 8> res;
    int aa(0);
    for (int i = 0; i < N; i++)
    {
        auto& d = testData[i];
        res = func(d.q11, d.q12, d.q21, d.q22, d.x1, d.x2, d.y1, d.y2, d.x, d.y);
#if PRINT
        for (int c = 0; c < 8; c++)
        {
            std::cout << (int)res[c] << ", ";
        }
        std::cout << std::endl;
#endif
        for (int c = 0; c < 8; c++) aa += res[c]; // keep the result live so the call is not optimized away
    }
    end = clock();
    cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
    std::cout << aa << std::endl;
    std::cout << title << ": " << cpu_time_used << "s" << std::endl;
}
void test()
{
}
int main()
{
    test();
    testData.reserve(N);
    for (int i = 0; i < N; i++)
    {
        TestData data;
        cv::randu(data.q11, 0, 255);
        cv::randu(data.q12, 0, 255);
        cv::randu(data.q21, 0, 255);
        cv::randu(data.q22, 0, 255);
        data.x1 = random();
        data.x2 = random();
        data.y1 = random();
        data.y2 = random();
        data.x = random();
        data.y = random();
        testData.push_back(data);
    }
    test("original", bilinearInterpolation_original<8>);
    test("mat", bilinearInterpolation_mat<8>);
    return EXIT_SUCCESS;
}
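For context on the architecture question: the intrinsics above require AVX2 and FMA, so the vectorised version only builds and runs where those extensions are enabled at compile time (e.g. -mavx2 -mfma or -march=native). A minimal compile-time guard, as a sketch assuming GCC/Clang-style predefined macros:

#if !defined(__AVX2__) || !defined(__FMA__)
#error "bilinearInterpolation_mat needs AVX2 and FMA (e.g. compile with -mavx2 -mfma)"
#endif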