The following is the current iteration of the Gaussian blur approximation code I am using.
A typical naive convolution is O(N*M), where N is the number of image pixels and M is the number of kernel pixels.
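For reference, the naive version is essentially a quadruple loop along these lines (an illustrative scalar sketch, not my actual implementation; the names and the clamp-to-edge handling are just for the example):

#include <algorithm>

//Illustrative only: every output pixel visits every kernel tap, so the cost is O(N*M).
void NaiveConvolve(const float* src, float* dst, int width, int height,
                   const float* kernel, int kSize)
{
    int half = kSize / 2;
    for (int y = 0; y < height; ++y)
    {
        for (int x = 0; x < width; ++x)
        {
            float sum = 0.0f;
            for (int ky = -half; ky <= half; ++ky)
            {
                for (int kx = -half; kx <= half; ++kx)
                {
                    int sx = std::clamp(x + kx, 0, width - 1);  //clamp reads to the image edge
                    int sy = std::clamp(y + ky, 0, height - 1);
                    sum += src[sy * width + sx] * kernel[(ky + half) * kSize + (kx + half)];
                }
            }
            dst[y * width + x] = sum;
        }
    }
}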
I am using three tricks to avoid this penalty:
- Three iterations of a simple box blur look very similar to a Gaussian blur.
- A box blur is separable: a 1D horizontal pass followed by a 1D vertical pass is equivalent to the full 2D box convolution.
- A box blur kernel consists of equal values, so instead of convolving the entire kernel at every pixel we can keep a running sum and simply add one pixel at the head and subtract one pixel at the tail (sketched below).
This makes each pass an O(N) operation, independent of the kernel size M.
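In scalar form, one horizontal pass of that running-sum box blur boils down to the following (a simplified single-channel sketch assuming an odd kernel size, with edge handling approximated by clamping; not the SSE code itself):

#include <algorithm>

//One 1D box-blur pass via a running sum: each output pixel costs one add and
//one subtract, regardless of the kernel width.
void BoxBlurRow(const float* src, float* dst, int width, int size)
{
    int half = size / 2;               //size is assumed odd: size == 2 * half + 1
    float sum = 0.0f;
    //prime the window for x == 0, clamping reads to the row edges
    for (int k = -half; k <= half; ++k)
        sum += src[std::clamp(k, 0, width - 1)];
    for (int x = 0; x < width; ++x)
    {
        dst[x] = sum / size;
        //slide the window: add the pixel entering on the right,
        //subtract the pixel leaving on the left
        sum += src[std::clamp(x + half + 1, 0, width - 1)];
        sum -= src[std::clamp(x - half, 0, width - 1)];
    }
}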
On top of this, I am employing both SSE intrinsics and parallel execution to achieve my goal.
This results in impressive performance compared to the naive implementation, but in spite of these tricks I still cannot get the speed to the level I'd like. Do you have any more ideas?
For brevity I have included only the horizontal pass. The vertical pass is algorithmically identical.
void FastBlurImageFilter::ProcessHorizontal(const ImagePixelBuffer &source)
{
    int size = this->size; //the size of the kernel.
    //three unnormalized box passes each sum `size` samples, so the final scale factor is size^3
    __m128 numPixels = _mm_set1_ps((float)size);
    __m128 divisor = _mm_mul_ps(numPixels, _mm_mul_ps(numPixels, numPixels));
    int halfSize = size / 2;
    //the edge pixel is replicated halfSize times to seed the sliding window
    __m128 filler = _mm_set1_ps((float)halfSize);
    int w = source.Width() - size;
    int h = source.Height();
    int sW = source.Width();
    //thread-local cache vectors
    Concurrency::combinable<std::vector<__m128>> sCache;
    Concurrency::combinable<std::vector<__m128>> dCache;
    Concurrency::parallel_for(0, h, [&] (int y)
    {
        float* sRowF = reinterpret_cast<float*>(source.Row(y));
        //grab the thread-local vectors by reference so the per-thread storage is actually reused
        std::vector<__m128> &s = sCache.local();
        if (s.size() < static_cast<size_t>(sW)) s.resize(sW);
        std::vector<__m128> &d = dCache.local();
        if (d.size() < static_cast<size_t>(sW)) d.resize(sW);
        __m128* sRow = s.data();
        __m128* dRow = d.data();
        //load the source pixels into a cache vector
        for (int x = 0; x < sW; x++, sRowF += 4)
        {
            sRow[x] = _mm_load_ps(sRowF);
        }
        //3 iterations to approximate a gaussian blur
        for (int i = 0; i < 3; ++i)
        {
            __m128* sPixel = sRow;
            __m128* dPixel = dRow;
            __m128 firstValue = *sPixel;
            __m128 pixel = _mm_mul_ps(firstValue, filler);
            __m128* nextStop = sPixel + halfSize;
            //process the first pixel of the row
            while (sPixel <= nextStop)
            {
                pixel = _mm_add_ps(pixel, *(sPixel++));
            }
            *(dPixel++) = pixel;
            //slide the window: add the incoming pixel, subtract the outgoing one
            #define DOPIXEL(vA,vB) { pixel = _mm_add_ps(pixel, _mm_sub_ps(vB, vA)); *(dPixel++) = pixel; }
            //process the pixels up until half of the kernel size
            nextStop = sPixel + halfSize;
            while (sPixel < nextStop)
            {
                DOPIXEL(firstValue, *(sPixel++));
            }
            //process the middle pixels (usually the biggest part)
            __m128* tailPixel = sPixel - size;
            nextStop = sPixel + w;
            while (sPixel < nextStop)
            {
                DOPIXEL(*(tailPixel++), *(sPixel++));
            }
            //process the last halfSize pixels
            __m128 lastValue = *(sPixel - 1);
            nextStop = dPixel + halfSize;
            while (dPixel < nextStop)
            {
                DOPIXEL(*(tailPixel++), lastValue);
            }
            #undef DOPIXEL
            //swap the caches for the next iteration
            __m128* temp = sRow;
            sRow = dRow;
            dRow = temp;
        }
        //store the pixel values back into the image buffer
        __m128* first = d.data();
        __m128* last = first + sW;
        float* final = reinterpret_cast<float*>(source.Row(y));
        while (first < last)
        {
            _mm_store_ps(final, _mm_div_ps(*(first++), divisor));
            final += 4;
        }
    });
}
This is my first Code Review post, so please let me know if there is anything I should add to my question.