Skip to main content
Bumped by Community user
Updated code.
Source Link
Royi
  • 582
  • 6
  • 21

The code (C Code, compiles with C99) is given by (See code on Compiler Explorer):

#define _USE_MATH_DEFINES

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <memory.h>
#include <omp.h>

#define OFF 0
#define ON  1

#include <immintrin.h> // AVX

#define SSE_STRIDE 4
#define SSE_ALIGNMENT 16

#define AVX_STRIDE 8
#define AVX_ALIGNMENT 32


#define M_PIf (float)(M_PI)

void ImageConvolutionGaussianKernel(float* mO, float* mI, float* mTmp, int numRows, int numCols, float gaussianStd, int stdToRadiusFactor);
void InitOmegaArrays(float* mCOmega, float* mSOmega, float* mI, int numRows, int numCols, float paramOmega);
void UpdateArrays(float* mO, float* mZ, float* mC, float* mS, float* mCFiltered, float* mSFiltered, int numRows, int numCols, int iterationIdx, float paramD);
void InitArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols);
void UpdateArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols);
void UpdateOutput(float* mO, float* mZ, float* mI, int numRows, int numCols, float rangeStd, float paramL);


void BilateralFilterFastCompressive(float* mO, float* mI, int numRows, int numCols, float spatialStd, float rangeStd, int paramK)
{
    int ii, paramN;
    float paramL, paramTau, *vParamD, *mZ, *mT, paramOmega, *mCOmega, *mSOmega, *mC, *mS, *mCFiltered, *mSFiltered;

    mZ = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Should be initialized to Zero
    mT = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Buffer
    mC = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
    mS = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
    mCOmega = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
    mSOmega = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
    mCFiltered = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
    mSFiltered = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);

    memset(mZ, 0.0f, numRows * numCols * sizeof(float));

    // Init Parameters

    paramL      = paramK * rangeStd;
    paramTau    = paramK / M_PIf;
    paramN      = ceilf((paramK * paramK) / M_PIf);
    paramOmega  = M_PIf / paramL;

    vParamD = (float*)_mm_malloc(paramN * sizeof(float), AVX_ALIGNMENT);

    for (ii = 1; ii <= paramN; ii++)
    {
        vParamD[ii - 1] = 2 * expf(-(ii * ii) / (2 * paramTau * paramTau));
    }

    InitOmegaArrays(mCOmega, mSOmega, mI, numRows, numCols, paramOmega);

    // Iteration Number 1
    ii = 1;

    ImageConvolutionGaussianKernel(mCFiltered, mCOmega, mT, numRows, numCols, spatialStd, paramK);
    ImageConvolutionGaussianKernel(mSFiltered, mSOmega, mT, numRows, numCols, spatialStd, paramK);

    UpdateArrays(mO, mZ, mCOmega, mSOmega, mCFiltered, mSFiltered, numRows, numCols, ii, vParamD[ii - 1]);

    // Iteration Number 2
    ii = 2;
    InitArraysSC(mC, mS, mCOmega, mSOmega, numRows, numCols);

    ImageConvolutionGaussianKernel(mCFiltered, mC, mT, numRows, numCols, spatialStd, paramK);
    ImageConvolutionGaussianKernel(mSFiltered, mS, mT, numRows, numCols, spatialStd, paramK);

    UpdateArrays(mO, mZ, mC, mS, mCFiltered, mSFiltered, numRows, numCols, ii, vParamD[ii - 1]);

    for (ii = 3; ii <= paramN; ii++)
    {
        UpdateArraysSC(mC, mS, mCOmega, mSOmega, numRows, numCols);

        ImageConvolutionGaussianKernel(mCFiltered, mC, mT, numRows, numCols, spatialStd, paramK);
        ImageConvolutionGaussianKernel(mSFiltered, mS, mT, numRows, numCols, spatialStd, paramK);

        UpdateArrays(mO, mZ, mC, mS, mCFiltered, mSFiltered, numRows, numCols, ii, vParamD[ii - 1]);
    }

    UpdateOutput(mO, mZ, mI, numRows, numCols, rangeStd, paramL);

    _mm_free(mZ);
    _mm_free(mT);
    _mm_free(mC);
    _mm_free(mS);
    _mm_free(mCOmega);
    _mm_free(mSOmega);
    _mm_free(mCFiltered);
    _mm_free(mSFiltered);
    _mm_free(vParamD);

}

// Auxiliary Functions
void InitOmegaArrays(float* mCOmega, float* mSOmega, float* mI, int numRows, int numCols, float paramOmega) {

    int ii;


    for (ii = 0; ii < numRows * numCols; ii++)
    {
        mCOmega[ii] = cosf(paramOmega * mI[ii]);
        mSOmega[ii] = sinf(paramOmega * mI[ii]);
    }

}


void UpdateArrays(float* mO, float* mZ, float* mC, float* mS, float* mCFiltered, float* mSFiltered, int numRows, int numCols, int iterationIdx, float paramD) {

    int ii;

    for (ii = 0; ii < numRows * numCols; ii++)
    {
        mO[ii] += (iterationIdx * paramD) * (mC[ii] * mSFiltered[ii] - mS[ii] * mCFiltered[ii]);
        mZ[ii] += paramD * (mC[ii] * mCFiltered[ii] + mS[ii] * mSFiltered[ii]);
    }


}


void InitArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols) {

    int ii;

    for (ii = 0; ii < numRows * numCols; ii++)
    {
        mS[ii] = 2.0f * mCOmega[ii] * mSOmega[ii];
        mC[ii] = 2.0f * mCOmega[ii] * mCOmega[ii] - 1.0f;
    }


}


void UpdateArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols) {

    int ii;
    float varTmp;

    for (ii = 0; ii < numRows * numCols; ii++)
    {
        varTmp = mC[ii] * mSOmega[ii] + mS[ii] * mCOmega[ii];
        mC[ii] = mC[ii] * mCOmega[ii] - mS[ii] * mSOmega[ii];
        mS[ii] = varTmp;
    }


}

 
void UpdtaeOutputUpdateOutput(float* mO, float* mZ, float* mI, int numRows, int numCols, float rangeStd, float paramL) {

    int ii;
    float outFactor;

    outFactor = (M_PIf * rangeStd * rangeStd) / paramL;

    for (ii = 0; ii < numRows * numCols; ii++)
    {
        mO[ii] = mI[ii] + (outFactor * (mO[ii] / (1.0f + mZ[ii])));
    }


}

The code (C Code, compiles with C99) is given by:

#define _USE_MATH_DEFINES

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <memory.h>
#include <omp.h>

#define OFF 0
#define ON  1

#include <immintrin.h> // AVX

#define SSE_STRIDE 4
#define SSE_ALIGNMENT 16

#define AVX_STRIDE 8
#define AVX_ALIGNMENT 32


#define M_PIf (float)(M_PI)

void ImageConvolutionGaussianKernel(float* mO, float* mI, float* mTmp, int numRows, int numCols, float gaussianStd, int stdToRadiusFactor);
void InitOmegaArrays(float* mCOmega, float* mSOmega, float* mI, int numRows, int numCols, float paramOmega);
void UpdateArrays(float* mO, float* mZ, float* mC, float* mS, float* mCFiltered, float* mSFiltered, int numRows, int numCols, int iterationIdx, float paramD);
void InitArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols);
void UpdateArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols);
void UpdateOutput(float* mO, float* mZ, float* mI, int numRows, int numCols, float rangeStd, float paramL);


void BilateralFilterFastCompressive(float* mO, float* mI, int numRows, int numCols, float spatialStd, float rangeStd, int paramK)
{
    int ii, paramN;
    float paramL, paramTau, *vParamD, *mZ, *mT, paramOmega, *mCOmega, *mSOmega, *mC, *mS, *mCFiltered, *mSFiltered;

    mZ = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Should be initialized to Zero
    mT = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Buffer
    mC = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
    mS = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
    mCOmega = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
    mSOmega = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
    mCFiltered = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
    mSFiltered = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);

    memset(mZ, 0.0f, numRows * numCols * sizeof(float));

    // Init Parameters

    paramL      = paramK * rangeStd;
    paramTau    = paramK / M_PIf;
    paramN      = ceilf((paramK * paramK) / M_PIf);
    paramOmega  = M_PIf / paramL;

    vParamD = (float*)_mm_malloc(paramN * sizeof(float), AVX_ALIGNMENT);

    for (ii = 1; ii <= paramN; ii++)
    {
        vParamD[ii - 1] = 2 * expf(-(ii * ii) / (2 * paramTau * paramTau));
    }

    InitOmegaArrays(mCOmega, mSOmega, mI, numRows, numCols, paramOmega);

    // Iteration Number 1
    ii = 1;

    ImageConvolutionGaussianKernel(mCFiltered, mCOmega, mT, numRows, numCols, spatialStd, paramK);
    ImageConvolutionGaussianKernel(mSFiltered, mSOmega, mT, numRows, numCols, spatialStd, paramK);

    UpdateArrays(mO, mZ, mCOmega, mSOmega, mCFiltered, mSFiltered, numRows, numCols, ii, vParamD[ii - 1]);

    // Iteration Number 2
    ii = 2;
    InitArraysSC(mC, mS, mCOmega, mSOmega, numRows, numCols);

    ImageConvolutionGaussianKernel(mCFiltered, mC, mT, numRows, numCols, spatialStd, paramK);
    ImageConvolutionGaussianKernel(mSFiltered, mS, mT, numRows, numCols, spatialStd, paramK);

    UpdateArrays(mO, mZ, mC, mS, mCFiltered, mSFiltered, numRows, numCols, ii, vParamD[ii - 1]);

    for (ii = 3; ii <= paramN; ii++)
    {
        UpdateArraysSC(mC, mS, mCOmega, mSOmega, numRows, numCols);

        ImageConvolutionGaussianKernel(mCFiltered, mC, mT, numRows, numCols, spatialStd, paramK);
        ImageConvolutionGaussianKernel(mSFiltered, mS, mT, numRows, numCols, spatialStd, paramK);

        UpdateArrays(mO, mZ, mC, mS, mCFiltered, mSFiltered, numRows, numCols, ii, vParamD[ii - 1]);
    }

    UpdateOutput(mO, mZ, mI, numRows, numCols, rangeStd, paramL);

    _mm_free(mZ);
    _mm_free(mT);
    _mm_free(mC);
    _mm_free(mS);
    _mm_free(mCOmega);
    _mm_free(mSOmega);
    _mm_free(mCFiltered);
    _mm_free(mSFiltered);
    _mm_free(vParamD);

}

// Auxiliary Functions
void InitOmegaArrays(float* mCOmega, float* mSOmega, float* mI, int numRows, int numCols, float paramOmega) {

    int ii;


    for (ii = 0; ii < numRows * numCols; ii++)
    {
        mCOmega[ii] = cosf(paramOmega * mI[ii]);
        mSOmega[ii] = sinf(paramOmega * mI[ii]);
    }

}


void UpdateArrays(float* mO, float* mZ, float* mC, float* mS, float* mCFiltered, float* mSFiltered, int numRows, int numCols, int iterationIdx, float paramD) {

    int ii;

    for (ii = 0; ii < numRows * numCols; ii++)
    {
        mO[ii] += (iterationIdx * paramD) * (mC[ii] * mSFiltered[ii] - mS[ii] * mCFiltered[ii]);
        mZ[ii] += paramD * (mC[ii] * mCFiltered[ii] + mS[ii] * mSFiltered[ii]);
    }


}


void InitArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols) {

    int ii;

    for (ii = 0; ii < numRows * numCols; ii++)
    {
        mS[ii] = 2.0f * mCOmega[ii] * mSOmega[ii];
        mC[ii] = 2.0f * mCOmega[ii] * mCOmega[ii] - 1.0f;
    }


}


void UpdateArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols) {

    int ii;
    float varTmp;

    for (ii = 0; ii < numRows * numCols; ii++)
    {
        varTmp = mC[ii] * mSOmega[ii] + mS[ii] * mCOmega[ii];
        mC[ii] = mC[ii] * mCOmega[ii] - mS[ii] * mSOmega[ii];
        mS[ii] = varTmp;
    }


}

void UpdtaeOutput(float* mO, float* mZ, float* mI, int numRows, int numCols, float rangeStd, float paramL) {

    int ii;
    float outFactor;

    outFactor = (M_PIf * rangeStd * rangeStd) / paramL;

    for (ii = 0; ii < numRows * numCols; ii++)
    {
        mO[ii] = mI[ii] + (outFactor * (mO[ii] / (1.0f + mZ[ii])));
    }


}

The code (C Code, compiles with C99) is given by (See code on Compiler Explorer):

#define _USE_MATH_DEFINES

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <memory.h>
#include <omp.h>

#define OFF 0
#define ON  1

#include <immintrin.h> // AVX

#define SSE_STRIDE 4
#define SSE_ALIGNMENT 16

#define AVX_STRIDE 8
#define AVX_ALIGNMENT 32


#define M_PIf (float)(M_PI)

void ImageConvolutionGaussianKernel(float* mO, float* mI, float* mTmp, int numRows, int numCols, float gaussianStd, int stdToRadiusFactor);
void InitOmegaArrays(float* mCOmega, float* mSOmega, float* mI, int numRows, int numCols, float paramOmega);
void UpdateArrays(float* mO, float* mZ, float* mC, float* mS, float* mCFiltered, float* mSFiltered, int numRows, int numCols, int iterationIdx, float paramD);
void InitArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols);
void UpdateArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols);
void UpdateOutput(float* mO, float* mZ, float* mI, int numRows, int numCols, float rangeStd, float paramL);


void BilateralFilterFastCompressive(float* mO, float* mI, int numRows, int numCols, float spatialStd, float rangeStd, int paramK)
{
    int ii, paramN;
    float paramL, paramTau, *vParamD, *mZ, *mT, paramOmega, *mCOmega, *mSOmega, *mC, *mS, *mCFiltered, *mSFiltered;

    mZ = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Should be initialized to Zero
    mT = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Buffer
    mC = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
    mS = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
    mCOmega = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
    mSOmega = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
    mCFiltered = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
    mSFiltered = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);

    memset(mZ, 0.0f, numRows * numCols * sizeof(float));

    // Init Parameters

    paramL      = paramK * rangeStd;
    paramTau    = paramK / M_PIf;
    paramN      = ceilf((paramK * paramK) / M_PIf);
    paramOmega  = M_PIf / paramL;

    vParamD = (float*)_mm_malloc(paramN * sizeof(float), AVX_ALIGNMENT);

    for (ii = 1; ii <= paramN; ii++)
    {
        vParamD[ii - 1] = 2 * expf(-(ii * ii) / (2 * paramTau * paramTau));
    }

    InitOmegaArrays(mCOmega, mSOmega, mI, numRows, numCols, paramOmega);

    // Iteration Number 1
    ii = 1;

    ImageConvolutionGaussianKernel(mCFiltered, mCOmega, mT, numRows, numCols, spatialStd, paramK);
    ImageConvolutionGaussianKernel(mSFiltered, mSOmega, mT, numRows, numCols, spatialStd, paramK);

    UpdateArrays(mO, mZ, mCOmega, mSOmega, mCFiltered, mSFiltered, numRows, numCols, ii, vParamD[ii - 1]);

    // Iteration Number 2
    ii = 2;
    InitArraysSC(mC, mS, mCOmega, mSOmega, numRows, numCols);

    ImageConvolutionGaussianKernel(mCFiltered, mC, mT, numRows, numCols, spatialStd, paramK);
    ImageConvolutionGaussianKernel(mSFiltered, mS, mT, numRows, numCols, spatialStd, paramK);

    UpdateArrays(mO, mZ, mC, mS, mCFiltered, mSFiltered, numRows, numCols, ii, vParamD[ii - 1]);

    for (ii = 3; ii <= paramN; ii++)
    {
        UpdateArraysSC(mC, mS, mCOmega, mSOmega, numRows, numCols);

        ImageConvolutionGaussianKernel(mCFiltered, mC, mT, numRows, numCols, spatialStd, paramK);
        ImageConvolutionGaussianKernel(mSFiltered, mS, mT, numRows, numCols, spatialStd, paramK);

        UpdateArrays(mO, mZ, mC, mS, mCFiltered, mSFiltered, numRows, numCols, ii, vParamD[ii - 1]);
    }

    UpdateOutput(mO, mZ, mI, numRows, numCols, rangeStd, paramL);

    _mm_free(mZ);
    _mm_free(mT);
    _mm_free(mC);
    _mm_free(mS);
    _mm_free(mCOmega);
    _mm_free(mSOmega);
    _mm_free(mCFiltered);
    _mm_free(mSFiltered);
    _mm_free(vParamD);

}

// Auxiliary Functions
void InitOmegaArrays(float* mCOmega, float* mSOmega, float* mI, int numRows, int numCols, float paramOmega) {

    int ii;


    for (ii = 0; ii < numRows * numCols; ii++)
    {
        mCOmega[ii] = cosf(paramOmega * mI[ii]);
        mSOmega[ii] = sinf(paramOmega * mI[ii]);
    }

}


void UpdateArrays(float* mO, float* mZ, float* mC, float* mS, float* mCFiltered, float* mSFiltered, int numRows, int numCols, int iterationIdx, float paramD) {

    int ii;

    for (ii = 0; ii < numRows * numCols; ii++)
    {
        mO[ii] += (iterationIdx * paramD) * (mC[ii] * mSFiltered[ii] - mS[ii] * mCFiltered[ii]);
        mZ[ii] += paramD * (mC[ii] * mCFiltered[ii] + mS[ii] * mSFiltered[ii]);
    }


}


void InitArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols) {

    int ii;

    for (ii = 0; ii < numRows * numCols; ii++)
    {
        mS[ii] = 2.0f * mCOmega[ii] * mSOmega[ii];
        mC[ii] = 2.0f * mCOmega[ii] * mCOmega[ii] - 1.0f;
    }


}


void UpdateArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols) {

    int ii;
    float varTmp;

    for (ii = 0; ii < numRows * numCols; ii++)
    {
        varTmp = mC[ii] * mSOmega[ii] + mS[ii] * mCOmega[ii];
        mC[ii] = mC[ii] * mCOmega[ii] - mS[ii] * mSOmega[ii];
        mS[ii] = varTmp;
    }


}

 
void UpdateOutput(float* mO, float* mZ, float* mI, int numRows, int numCols, float rangeStd, float paramL) {

    int ii;
    float outFactor;

    outFactor = (M_PIf * rangeStd * rangeStd) / paramL;

    for (ii = 0; ii < numRows * numCols; ii++)
    {
        mO[ii] = mI[ii] + (outFactor * (mO[ii] / (1.0f + mZ[ii])));
    }


}
added 910 characters in body
Source Link
Royi
  • 582
  • 6
  • 21
#define M_PIf_USE_MATH_DEFINES

#include (float)(M_PI)<stdio.h>
#include <stdlib.h>
#include <math.h>
#include <memory.h>
#include <omp.h>

#define OFF 0
#define ON  1

#include <immintrin.h> // AVX

#define SSE_STRIDE 4
#define SSE_ALIGNMENT 16

#define AVX_STRIDE 8
#define AVX_ALIGNMENT 32


#define M_PIf (float)(M_PI)

void ImageConvolutionGaussianKernel(float* mO, float* mI, float* mTmp, int numRows, int numCols, float gaussianStd, int stdToRadiusFactor);
void InitOmegaArrays(float* mCOmega, float* mSOmega, float* mI, int numRows, int numCols, float paramOmega);
void UpdateArrays(float* mO, float* mZ, float* mC, float* mS, float* mCFiltered, float* mSFiltered, int numRows, int numCols, int iterationIdx, float paramD);
void InitArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols);
void UpdateArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols);
void UpdateOutput(float* mO, float* mZ, float* mI, int numRows, int numCols, float rangeStd, float paramL);


void BilateralFilterFastCompressive(float* mO, float* mI, int numRows, int numCols, float spatialStd, float rangeStd, int paramK)
{
    int ii, paramN;
    float paramL, paramTau, *vParamD, *mZ, *mT, paramOmega, *mCOmega, *mSOmega, *mC, *mS, *mCFiltered, *mSFiltered;

    mZ = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Should be initialized to Zero
    mT = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Buffer
    mC = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
    mS = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
    mCOmega = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
    mSOmega = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
    mCFiltered = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
    mSFiltered = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);

    memset(mZ, 0.0f, numRows * numCols * sizeof(float));

    // Init Parameters

    paramL      = paramK * rangeStd;
    paramTau    = paramK / M_PIf;
    paramN      = ceilf((paramK * paramK) / M_PIf);
    paramOmega  = M_PIf / paramL;

    vParamD = (float*)_mm_malloc(paramN * sizeof(float), AVX_ALIGNMENT);

    for (ii = 1; ii <= paramN; ii++)
    {
        vParamD[ii - 1] = 2 * expf(-(ii * ii) / (2 * paramTau * paramTau));
    }

    InitOmegaArrays(mCOmega, mSOmega, mI, numRows, numCols, paramOmega);

    // Iteration Number 1
    ii = 1;

    ImageConvolutionGaussianKernel(mCFiltered, mCOmega, mT, numRows, numCols, spatialStd, paramK);
    ImageConvolutionGaussianKernel(mSFiltered, mSOmega, mT, numRows, numCols, spatialStd, paramK);
     
    UpdateArrays(mO, mZ, mCOmega, mSOmega, mCFiltered, mSFiltered, numRows, numCols, ii, vParamD[ii - 1]);

    // Iteration Number 2
    ii = 2;
    InitArraysSC(mC, mS, mCOmega, mSOmega, numRows, numCols);

    ImageConvolutionGaussianKernel(mCFiltered, mC, mT, numRows, numCols, spatialStd, paramK);
    ImageConvolutionGaussianKernel(mSFiltered, mS, mT, numRows, numCols, spatialStd, paramK);

    UpdateArrays(mO, mZ, mC, mS, mCFiltered, mSFiltered, numRows, numCols, ii, vParamD[ii - 1]);

    for (ii = 3; ii <= paramN; ii++)
    {
        UpdateArraysSC(mC, mS, mCOmega, mSOmega, numRows, numCols);

        ImageConvolutionGaussianKernel(mCFiltered, mC, mT, numRows, numCols, spatialStd, paramK);
        ImageConvolutionGaussianKernel(mSFiltered, mS, mT, numRows, numCols, spatialStd, paramK);

        UpdateArrays(mO, mZ, mC, mS, mCFiltered, mSFiltered, numRows, numCols, ii, vParamD[ii - 1]);
    }

    UpdateOutput(mO, mZ, mI, numRows, numCols, rangeStd, paramL);

    _mm_free(mZ);
    _mm_free(mT);
    _mm_free(mC);
    _mm_free(mS);
    _mm_free(mCOmega);
    _mm_free(mSOmega);
    _mm_free(mCFiltered);
    _mm_free(mSFiltered);
    _mm_free(vParamD);

}

// Auxiliary Functions
void InitOmegaArrays(float* mCOmega, float* mSOmega, float* mI, int numRows, int numCols, float paramOmega) {

    int ii;


    for (ii = 0; ii < numRows * numCols; ii++)
    {
        mCOmega[ii] = cosf(paramOmega * mI[ii]);
        mSOmega[ii] = sinf(paramOmega * mI[ii]);
    }

}


void UpdateArrays(float* mO, float* mZ, float* mC, float* mS, float* mCFiltered, float* mSFiltered, int numRows, int numCols, int iterationIdx, float paramD) {
    
    int ii;

    for (ii = 0; ii < numRows * numCols; ii++)
    {
        mO[ii] += (iterationIdx * paramD) * (mC[ii] * mSFiltered[ii] - mS[ii] * mCFiltered[ii]);
        mZ[ii] += paramD * (mC[ii] * mCFiltered[ii] + mS[ii] * mSFiltered[ii]);
    }


}


void InitArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols) {
    
    int ii;

    for (ii = 0; ii < numRows * numCols; ii++)
    {
        mS[ii] = 2.0f * mCOmega[ii] * mSOmega[ii];
        mC[ii] = 2.0f * mCOmega[ii] * mCOmega[ii] - 1.0f;
    }


}


void UpdateArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols) {

    int ii;
    float varTmp;

    for (ii = 0; ii < numRows * numCols; ii++)
    {
        varTmp = mC[ii] * mSOmega[ii] + mS[ii] * mCOmega[ii];
        mC[ii] = mC[ii] * mCOmega[ii] - mS[ii] * mSOmega[ii];
        mS[ii] = varTmp;
    }


}

void UpdtaeOutput(float* mO, float* mZ, float* mI, int numRows, int numCols, float rangeStd, float paramL) {

    int ii;
    float outFactor;

    outFactor = (M_PIf * rangeStd * rangeStd) / paramL;

    for (ii = 0; ii < numRows * numCols; ii++)
    {
        mO[ii] = mI[ii] + (outFactor * (mO[ii] / (1.0f + mZ[ii])));
    }


}
#define M_PIf (float)(M_PI)

void InitOmegaArrays(float* mCOmega, float* mSOmega, float* mI, int numRows, int numCols, float paramOmega);
void UpdateArrays(float* mO, float* mZ, float* mC, float* mS, float* mCFiltered, float* mSFiltered, int numRows, int numCols, int iterationIdx, float paramD);
void InitArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols);
void UpdateArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols);
void UpdateOutput(float* mO, float* mZ, float* mI, int numRows, int numCols, float rangeStd, float paramL);


void BilateralFilterFastCompressive(float* mO, float* mI, int numRows, int numCols, float spatialStd, float rangeStd, int paramK)
{
    int ii, paramN;
    float paramL, paramTau, *vParamD, *mZ, *mT, paramOmega, *mCOmega, *mSOmega, *mC, *mS, *mCFiltered, *mSFiltered;

    mZ = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Should be initialized to Zero
    mT = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Buffer
    mC = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
    mS = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
    mCOmega = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
    mSOmega = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
    mCFiltered = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
    mSFiltered = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);

    memset(mZ, 0.0f, numRows * numCols * sizeof(float));

    // Init Parameters

    paramL      = paramK * rangeStd;
    paramTau    = paramK / M_PIf;
    paramN      = ceilf((paramK * paramK) / M_PIf);
    paramOmega  = M_PIf / paramL;

    vParamD = (float*)_mm_malloc(paramN * sizeof(float), AVX_ALIGNMENT);

    for (ii = 1; ii <= paramN; ii++)
    {
        vParamD[ii - 1] = 2 * expf(-(ii * ii) / (2 * paramTau * paramTau));
    }

    InitOmegaArrays(mCOmega, mSOmega, mI, numRows, numCols, paramOmega);

    // Iteration Number 1
    ii = 1;

    ImageConvolutionGaussianKernel(mCFiltered, mCOmega, mT, numRows, numCols, spatialStd, paramK);
    ImageConvolutionGaussianKernel(mSFiltered, mSOmega, mT, numRows, numCols, spatialStd, paramK);
     
    UpdateArrays(mO, mZ, mCOmega, mSOmega, mCFiltered, mSFiltered, numRows, numCols, ii, vParamD[ii - 1]);

    // Iteration Number 2
    ii = 2;
    InitArraysSC(mC, mS, mCOmega, mSOmega, numRows, numCols);

    ImageConvolutionGaussianKernel(mCFiltered, mC, mT, numRows, numCols, spatialStd, paramK);
    ImageConvolutionGaussianKernel(mSFiltered, mS, mT, numRows, numCols, spatialStd, paramK);

    UpdateArrays(mO, mZ, mC, mS, mCFiltered, mSFiltered, numRows, numCols, ii, vParamD[ii - 1]);

    for (ii = 3; ii <= paramN; ii++)
    {
        UpdateArraysSC(mC, mS, mCOmega, mSOmega, numRows, numCols);

        ImageConvolutionGaussianKernel(mCFiltered, mC, mT, numRows, numCols, spatialStd, paramK);
        ImageConvolutionGaussianKernel(mSFiltered, mS, mT, numRows, numCols, spatialStd, paramK);

        UpdateArrays(mO, mZ, mC, mS, mCFiltered, mSFiltered, numRows, numCols, ii, vParamD[ii - 1]);
    }

    UpdateOutput(mO, mZ, mI, numRows, numCols, rangeStd, paramL);

    _mm_free(mZ);
    _mm_free(mT);
    _mm_free(mC);
    _mm_free(mS);
    _mm_free(mCOmega);
    _mm_free(mSOmega);
    _mm_free(mCFiltered);
    _mm_free(mSFiltered);
    _mm_free(vParamD);

}

// Auxiliary Functions
void InitOmegaArrays(float* mCOmega, float* mSOmega, float* mI, int numRows, int numCols, float paramOmega) {

    int ii;


    for (ii = 0; ii < numRows * numCols; ii++)
    {
        mCOmega[ii] = cosf(paramOmega * mI[ii]);
        mSOmega[ii] = sinf(paramOmega * mI[ii]);
    }

}


void UpdateArrays(float* mO, float* mZ, float* mC, float* mS, float* mCFiltered, float* mSFiltered, int numRows, int numCols, int iterationIdx, float paramD) {
    
    int ii;

    for (ii = 0; ii < numRows * numCols; ii++)
    {
        mO[ii] += (iterationIdx * paramD) * (mC[ii] * mSFiltered[ii] - mS[ii] * mCFiltered[ii]);
        mZ[ii] += paramD * (mC[ii] * mCFiltered[ii] + mS[ii] * mSFiltered[ii]);
    }


}


void InitArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols) {
    
    int ii;

    for (ii = 0; ii < numRows * numCols; ii++)
    {
        mS[ii] = 2.0f * mCOmega[ii] * mSOmega[ii];
        mC[ii] = 2.0f * mCOmega[ii] * mCOmega[ii] - 1.0f;
    }


}


void UpdateArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols) {

    int ii;
    float varTmp;

    for (ii = 0; ii < numRows * numCols; ii++)
    {
        varTmp = mC[ii] * mSOmega[ii] + mS[ii] * mCOmega[ii];
        mC[ii] = mC[ii] * mCOmega[ii] - mS[ii] * mSOmega[ii];
        mS[ii] = varTmp;
    }


}

void UpdtaeOutput(float* mO, float* mZ, float* mI, int numRows, int numCols, float rangeStd, float paramL) {

    int ii;

    for (ii = 0; ii < numRows * numCols; ii++)
    {
        mO[ii] = mI[ii] + (outFactor * (mO[ii] / (1.0f + mZ[ii])));
    }


}
#define _USE_MATH_DEFINES

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <memory.h>
#include <omp.h>

#define OFF 0
#define ON  1

#include <immintrin.h> // AVX

#define SSE_STRIDE 4
#define SSE_ALIGNMENT 16

#define AVX_STRIDE 8
#define AVX_ALIGNMENT 32


#define M_PIf (float)(M_PI)

void ImageConvolutionGaussianKernel(float* mO, float* mI, float* mTmp, int numRows, int numCols, float gaussianStd, int stdToRadiusFactor);
void InitOmegaArrays(float* mCOmega, float* mSOmega, float* mI, int numRows, int numCols, float paramOmega);
void UpdateArrays(float* mO, float* mZ, float* mC, float* mS, float* mCFiltered, float* mSFiltered, int numRows, int numCols, int iterationIdx, float paramD);
void InitArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols);
void UpdateArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols);
void UpdateOutput(float* mO, float* mZ, float* mI, int numRows, int numCols, float rangeStd, float paramL);


void BilateralFilterFastCompressive(float* mO, float* mI, int numRows, int numCols, float spatialStd, float rangeStd, int paramK)
{
    int ii, paramN;
    float paramL, paramTau, *vParamD, *mZ, *mT, paramOmega, *mCOmega, *mSOmega, *mC, *mS, *mCFiltered, *mSFiltered;

    mZ = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Should be initialized to Zero
    mT = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT); // Buffer
    mC = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
    mS = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
    mCOmega = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
    mSOmega = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
    mCFiltered = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);
    mSFiltered = (float*)_mm_malloc(numRows * numCols * sizeof(float), AVX_ALIGNMENT);

    memset(mZ, 0.0f, numRows * numCols * sizeof(float));

    // Init Parameters

    paramL      = paramK * rangeStd;
    paramTau    = paramK / M_PIf;
    paramN      = ceilf((paramK * paramK) / M_PIf);
    paramOmega  = M_PIf / paramL;

    vParamD = (float*)_mm_malloc(paramN * sizeof(float), AVX_ALIGNMENT);

    for (ii = 1; ii <= paramN; ii++)
    {
        vParamD[ii - 1] = 2 * expf(-(ii * ii) / (2 * paramTau * paramTau));
    }

    InitOmegaArrays(mCOmega, mSOmega, mI, numRows, numCols, paramOmega);

    // Iteration Number 1
    ii = 1;

    ImageConvolutionGaussianKernel(mCFiltered, mCOmega, mT, numRows, numCols, spatialStd, paramK);
    ImageConvolutionGaussianKernel(mSFiltered, mSOmega, mT, numRows, numCols, spatialStd, paramK);

    UpdateArrays(mO, mZ, mCOmega, mSOmega, mCFiltered, mSFiltered, numRows, numCols, ii, vParamD[ii - 1]);

    // Iteration Number 2
    ii = 2;
    InitArraysSC(mC, mS, mCOmega, mSOmega, numRows, numCols);

    ImageConvolutionGaussianKernel(mCFiltered, mC, mT, numRows, numCols, spatialStd, paramK);
    ImageConvolutionGaussianKernel(mSFiltered, mS, mT, numRows, numCols, spatialStd, paramK);

    UpdateArrays(mO, mZ, mC, mS, mCFiltered, mSFiltered, numRows, numCols, ii, vParamD[ii - 1]);

    for (ii = 3; ii <= paramN; ii++)
    {
        UpdateArraysSC(mC, mS, mCOmega, mSOmega, numRows, numCols);

        ImageConvolutionGaussianKernel(mCFiltered, mC, mT, numRows, numCols, spatialStd, paramK);
        ImageConvolutionGaussianKernel(mSFiltered, mS, mT, numRows, numCols, spatialStd, paramK);

        UpdateArrays(mO, mZ, mC, mS, mCFiltered, mSFiltered, numRows, numCols, ii, vParamD[ii - 1]);
    }

    UpdateOutput(mO, mZ, mI, numRows, numCols, rangeStd, paramL);

    _mm_free(mZ);
    _mm_free(mT);
    _mm_free(mC);
    _mm_free(mS);
    _mm_free(mCOmega);
    _mm_free(mSOmega);
    _mm_free(mCFiltered);
    _mm_free(mSFiltered);
    _mm_free(vParamD);

}

// Auxiliary Functions
void InitOmegaArrays(float* mCOmega, float* mSOmega, float* mI, int numRows, int numCols, float paramOmega) {

    int ii;


    for (ii = 0; ii < numRows * numCols; ii++)
    {
        mCOmega[ii] = cosf(paramOmega * mI[ii]);
        mSOmega[ii] = sinf(paramOmega * mI[ii]);
    }

}


void UpdateArrays(float* mO, float* mZ, float* mC, float* mS, float* mCFiltered, float* mSFiltered, int numRows, int numCols, int iterationIdx, float paramD) {

    int ii;

    for (ii = 0; ii < numRows * numCols; ii++)
    {
        mO[ii] += (iterationIdx * paramD) * (mC[ii] * mSFiltered[ii] - mS[ii] * mCFiltered[ii]);
        mZ[ii] += paramD * (mC[ii] * mCFiltered[ii] + mS[ii] * mSFiltered[ii]);
    }


}


void InitArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols) {

    int ii;

    for (ii = 0; ii < numRows * numCols; ii++)
    {
        mS[ii] = 2.0f * mCOmega[ii] * mSOmega[ii];
        mC[ii] = 2.0f * mCOmega[ii] * mCOmega[ii] - 1.0f;
    }


}


void UpdateArraysSC(float* mC, float* mS, float* mCOmega, float* mSOmega, int numRows, int numCols) {

    int ii;
    float varTmp;

    for (ii = 0; ii < numRows * numCols; ii++)
    {
        varTmp = mC[ii] * mSOmega[ii] + mS[ii] * mCOmega[ii];
        mC[ii] = mC[ii] * mCOmega[ii] - mS[ii] * mSOmega[ii];
        mS[ii] = varTmp;
    }


}

void UpdtaeOutput(float* mO, float* mZ, float* mI, int numRows, int numCols, float rangeStd, float paramL) {

    int ii;
    float outFactor;

    outFactor = (M_PIf * rangeStd * rangeStd) / paramL;

    for (ii = 0; ii < numRows * numCols; ii++)
    {
        mO[ii] = mI[ii] + (outFactor * (mO[ii] / (1.0f + mZ[ii])));
    }


}
added 85 characters in body
Source Link
Royi
  • 582
  • 6
  • 21
  1. Writing all small function using AVX2 SIMD intrinsics. Yet it seems I gain only 3-5% over the compiler (I use Intel Compiler).
  2. Using the #pragma vector aligned to force compiler to vectorize the code and assume no aliasing and no alignment issues. It yields results which are 3-5% slower than the hand tuned I did (See 1).
  3. Disabling OpenMP (Didn't improve).
  4. Various compilers (MSVC, GCC 7.3 / 8.1, Intel Compiler 2018 [Was best]). Results above are the best achieved (Intel Compiler). I tried -Ofast and -O3 on GCC. Using O3 on Intel. FP Precision is set to fast.
  1. Writing all small function using AVX2 SIMD intrinsics. Yet it seems I gain only 3-5% over the compiler (I use Intel Compiler).
  2. Using the #pragma vector aligned to force compiler to vectorize the code and assume no aliasing and no alignment issues. It yields results which are 3-5% slower than the hand tuned I did (See 1).
  3. Disabling OpenMP (Didn't improve).
  4. Various compilers (MSVC, GCC 7.3 / 8.1, Intel Compiler 2018 [Was best]). Results above are the best achieved (Intel Compiler).
  1. Writing all small function using AVX2 SIMD intrinsics. Yet it seems I gain only 3-5% over the compiler (I use Intel Compiler).
  2. Using the #pragma vector aligned to force compiler to vectorize the code and assume no aliasing and no alignment issues. It yields results which are 3-5% slower than the hand tuned I did (See 1).
  3. Disabling OpenMP (Didn't improve).
  4. Various compilers (MSVC, GCC 7.3 / 8.1, Intel Compiler 2018 [Was best]). Results above are the best achieved (Intel Compiler). I tried -Ofast and -O3 on GCC. Using O3 on Intel. FP Precision is set to fast.
deleted 1 character in body
Source Link
Royi
  • 582
  • 6
  • 21
Loading
Rollback to Revision 5
Source Link
Royi
  • 582
  • 6
  • 21
Loading
deleted 64 characters in body; edited title
Source Link
Jamal
  • 35.2k
  • 13
  • 134
  • 238
Loading
Tweeted twitter.com/StackCodeReview/status/1038532681531895808
edited body
Source Link
Royi
  • 582
  • 6
  • 21
Loading
added 202 characters in body
Source Link
Royi
  • 582
  • 6
  • 21
Loading
Rollback to Revision 1
Source Link
Royi
  • 582
  • 6
  • 21
Loading
deleted 26 characters in body; edited tags; edited title
Source Link
200_success
  • 145.7k
  • 22
  • 191
  • 481
Loading
Source Link
Royi
  • 582
  • 6
  • 21
Loading