I have found that a bottleneck of the OpenCV application I use is the bilinear interpolation, so I have tried to optimize it. The interpolation operates in an 8D space: each "color" is an 8-dimensional vector with components in [0, 255].
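For reference, each of the eight channels is interpolated independently with the usual bilinear formula (this is what both implementations below compute per channel):

$$f(x, y) = \frac{q_{11}(x_2 - x)(y_2 - y) + q_{21}(x - x_1)(y_2 - y) + q_{12}(x_2 - x)(y - y_1) + q_{22}(x - x_1)(y - y_1)}{(x_2 - x_1)(y_2 - y_1)}$$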
I have also written a small benchmarking program.
My code is between 0% and 50% faster than the original one.
I would like to know what other improvements I could make, and whether I have missed any bugs. Also, is the performance increase general, or is it specific to my computer's architecture?
#include <opencv2/opencv.hpp>
#include <algorithm>
#include <array>
#include <cstdlib>
#include <iostream>
#include <vector>
#include <time.h>
using std::cout;
using std::endl;
float random()
{
    return static_cast<float>(rand()) / RAND_MAX * 10.0f;
}
template <int cn>
cv::Vec<uchar, cn> bilinearInterpolation_original(cv::Vec<uchar, cn> q11, cv::Vec<uchar, cn> q12,
                                                  cv::Vec<uchar, cn> q21, cv::Vec<uchar, cn> q22,
                                                  const float x1, const float x2,
                                                  const float y1, const float y2,
                                                  const float x, const float y)
{
    cv::Vec<uchar, cn> interpolateColor;
    float x2x1, y2y1, x2x, y2y, yy1, xx1;
    x2x1 = x2 - x1;
    y2y1 = y2 - y1;
    x2x = x2 - x;
    y2y = y2 - y;
    yy1 = y - y1;
    xx1 = x - x1;
    const float k = 1.f / (x2x1 * y2y1);
    // calculate the interpolation for all the channels
    for (int i = 0; i < cn; i++)
    {
        float interpolation = k * (
            q11[i] * x2x * y2y +
            q21[i] * xx1 * y2y +
            q12[i] * x2x * yy1 +
            q22[i] * xx1 * yy1
        );
        interpolateColor[i] = cvRound(interpolation);
    }
    return interpolateColor;
}
#include <immintrin.h>
#include <xmmintrin.h>
/**
 * Convert 8x 32-bit values into 8x 8-bit values. The upper 64 bits of the result are zeroed.
 * If a value is greater than 255, only its low byte is kept.
 **/
__m128i _mm256_convert_epi32_epi8(__m256i _a)
{
    // Convert from 'int32' to 'int8'
    // Digit = Sample value with one byte length:
    // FROM: 1000|2000|3000|4000||||5000|6000|7000|8000 (BIG ENDIAN)
    // TO:   1234|0000|0000|0000||||5678|0000|0000|0000
    static const __m256i _mask = []() {
        std::array<uchar, 32> mask;
        std::fill(mask.begin(), mask.end(), -1);
        for (int i = 0; i < 4; i++) {
            mask[i]      = 4 * i;
            mask[16 + i] = 4 * i;
        }
        return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(mask.data()));
    }();
    const __m256i _a_shuffled = _mm256_shuffle_epi8(_a, _mask);
    const __m128i _low = _mm256_extractf128_si256(_a_shuffled, 0);
    __m128i _high      = _mm256_extractf128_si256(_a_shuffled, 1);
    _high = _mm_slli_epi64(_high, 32);
    const __m128i _or = _mm_or_si128(_low, _high);
    return _or;
}
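// Side note (not part of the original code): the same 8x 'int32' -> 8x 'uint8'
// narrowing could also be done with the pack instructions, which saturate to
// [0, 255] instead of keeping only the low byte. A minimal sketch, with a
// hypothetical name:
__m128i convert_epi32_epi8_pack(__m256i _a)
{
    const __m128i _lo = _mm256_castsi256_si128(_a);       // elements 0..3
    const __m128i _hi = _mm256_extracti128_si256(_a, 1);  // elements 4..7
    const __m128i _w  = _mm_packs_epi32(_lo, _hi);        // 8 x int16 (signed saturation)
    return _mm_packus_epi16(_w, _mm_setzero_si128());     // 8 x uint8 in the low 64 bits
}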
template <int cn>
cv::Vec<uchar, cn> bilinearInterpolation_mat(cv::Vec<uchar, cn> q11, cv::Vec<uchar, cn> q12,
                                             cv::Vec<uchar, cn> q21, cv::Vec<uchar, cn> q22,
                                             const float x1, const float x2,
                                             const float y1, const float y2,
                                             const float x, const float y)
{
    alignas(32) std::array<float, 8> C; // _mm256_store_ps requires 32-byte alignment
    const __m256 A_vec = _mm256_set_ps(0, 0, x, y, y2, x2, y2, x2); // Arguments are in reverse order (highest element first)
    const __m256 B_vec = _mm256_set_ps(0, 0, x1, y1, y, x, y1, x1);
    __m256 Sub = _mm256_sub_ps(A_vec, B_vec);
    _mm256_store_ps(C.data(), Sub);
    const float& x2x1 = C[0];
    const float& y2y1 = C[1];
    const float& x2x  = C[2];
    const float& y2y  = C[3];
    const float& yy1  = C[4];
    const float& xx1  = C[5];
    // Load each register as an array of 8x the same float32 value, which is ks[i]
    const __m256 _k11 = _mm256_set1_ps(x2x * y2y);
    const __m256 _k21 = _mm256_set1_ps(xx1 * y2y);
    const __m256 _k12 = _mm256_set1_ps(x2x * yy1);
    const __m256 _k22 = _mm256_set1_ps(xx1 * yy1);
    const __m256 _k   = _mm256_set1_ps(1.0f / (x2x1 * y2y1)); // Global multiplier coefficient

    // Load 8 x 'uint8' (8 * 8 = 64 bits) into the 64 low bits of the register
    __m128i _q11_uint8 = _mm_loadu_si64(q11.val);
    // Convert from 'uint8' to 'int32',
    // because conversions from integral types to floating types only work with 'int32'
    __m256i _q11_int32 = _mm256_cvtepu8_epi32(_q11_uint8);
    // Convert from 'int32' to 'float'
    __m256 _q11_float32 = _mm256_cvtepi32_ps(_q11_int32);

    // Same with q21, q12, and q22
    // ######
    __m128i _q21_uint8 = _mm_loadu_si64(q21.val);
    __m256i _q21_int32 = _mm256_cvtepu8_epi32(_q21_uint8);
    __m256 _q21_float32 = _mm256_cvtepi32_ps(_q21_int32);

    __m128i _q12_uint8 = _mm_loadu_si64(q12.val);
    __m256i _q12_int32 = _mm256_cvtepu8_epi32(_q12_uint8);
    __m256 _q12_float32 = _mm256_cvtepi32_ps(_q12_int32);

    __m128i _q22_uint8 = _mm_loadu_si64(q22.val);
    __m256i _q22_int32 = _mm256_cvtepu8_epi32(_q22_uint8);
    __m256 _q22_float32 = _mm256_cvtepi32_ps(_q22_int32);
    // ######

    __m256 _res = {};
    _res = _mm256_fmadd_ps(_k11, _q11_float32, _res);
    _res = _mm256_fmadd_ps(_k12, _q12_float32, _res);
    _res = _mm256_fmadd_ps(_k21, _q21_float32, _res);
    _res = _mm256_fmadd_ps(_k22, _q22_float32, _res);
    _res = _mm256_mul_ps(_res, _k);
    // Round to nearest int, suppress exceptions
    _res = _mm256_round_ps(_res, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
    // Convert back from 'float32' to 'int32'
    __m256i _resi = _mm256_cvtps_epi32(_res);
    // Convert back from 'int32' to 'uint8'
    __m128i _trunc = _mm256_convert_epi32_epi8(_resi);
    // Store the packed 'uint8' values into the cv::Vec
    cv::Vec<uchar, cn> interpolateColor;
    _mm_storel_epi64(reinterpret_cast<__m128i*>(interpolateColor.val), _trunc);
    return interpolateColor;
}
struct TestData
{
    cv::Vec<uchar, 8> q11;
    cv::Vec<uchar, 8> q12;
    cv::Vec<uchar, 8> q21;
    cv::Vec<uchar, 8> q22;
    float x1, x2, y1, y2, x, y;
};
const int N = 10'000'000;
std::vector<TestData> testData;
template<typename T>
void test(const char* title, T func)
{
    clock_t start, end;
    double cpu_time_used;
    start = clock();
    cv::Vec<uchar, 8> res;
    int aa(0);
    for (int i = 0; i < N; i++)
    {
        auto& d = testData[i];
        res = func(d.q11, d.q12, d.q21, d.q22, d.x1, d.x2, d.y1, d.y2, d.x, d.y);
#if PRINT
        for (int c = 0; c < 8; c++)
        {
            std::cout << (int)res[c] << ", ";
        }
        std::cout << std::endl;
#endif
        for (int c = 0; c < 8; c++) aa += res[c]; // keep the result live so the call is not optimized away
    }
    end = clock();
    cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
    std::cout << aa << std::endl;
    std::cout << title << ": " << cpu_time_used << "s" << std::endl;
}
void test()
{
}
int main()
{
    test();
    testData.reserve(N);
    for (int i = 0; i < N; i++)
    {
        TestData data;
        cv::randu(data.q11, 0, 255);
        cv::randu(data.q12, 0, 255);
        cv::randu(data.q21, 0, 255);
        cv::randu(data.q22, 0, 255);
        data.x1 = random();
        data.x2 = random();
        data.y1 = random();
        data.y2 = random();
        data.x = random();
        data.y = random();
        testData.push_back(data);
    }
    test("original", bilinearInterpolation_original<8>);
    test("mat", bilinearInterpolation_mat<8>);
    return EXIT_SUCCESS;
}
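For context on the architecture question: the intrinsics above require AVX2 and FMA, so the vectorised version only builds and runs where those extensions are enabled at compile time (e.g. -mavx2 -mfma or -march=native). A minimal compile-time guard, as a sketch assuming GCC/Clang-style predefined macros:

#if !defined(__AVX2__) || !defined(__FMA__)
#error "bilinearInterpolation_mat needs AVX2 and FMA (e.g. compile with -mavx2 -mfma)"
#endif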