Skip to main content
edited body
Source Link
Royi
  • 582
  • 6
  • 21

Based on code by Chris Elrod I managed to come to this:

function Conv1D!( vO :: Vector{T}, vA :: Vector{T}, vB :: Vector{T} ) where {T <: Real}

    J = length(vA);
    K = length(vB); #<! Assumed to be the Kernel
    
    # Optimized for the case the kernel is in vB (Shorter)
    J < K && return Conv1D!(vO, vB, vA);
    
    I = J + K - 1; #<! Output length
    
    @turbo for ii in 01:(K - 1) #<! Head
        sumVal = zero(T);
        for kk in 1:K
            ib0 = (ii >= kk);
            oa = ib0 ? vA[ii - kk + 1] : zero(T);
            sumVal += vB[kk] * oa;
        end
        vO[ii] = sumVal;
    end
    @turbo inline=true for ii in K:(J - 1) #<! Middle
        sumVal = zero(T);
        for kk in 1:K
            sumVal += vB[kk] * vA[ii - kk + 1];
        end
        vO[ii] = sumVal;
    end
    @turbo for ii in J:I #<! Tail
        sumVal = zero(T);
        for kk in 1:K
            ib0 = (ii < J + kk);
            oa = ib0 ? vA[ii - kk + 1] : zero(T);
            sumVal += vB[kk] * oa;
        end
        vO[ii] = sumVal;
    end
    return vO
end

This code will efficiently use the SIMD capabilities of the CPU (Assuming x64 CPU).

The main idea is to replace the series of if with 3 optimized cases for beginning of the signal, middle and the end.
This makes the code SIMD friendly.

Pay attention that it won't work well with @simd instead of @turbo.

Based on code by Chris Elrod I managed to come to this:

function Conv1D!( vO :: Vector{T}, vA :: Vector{T}, vB :: Vector{T} ) where {T <: Real}

    J = length(vA);
    K = length(vB); #<! Assumed to be the Kernel
    
    # Optimized for the case the kernel is in vB (Shorter)
    J < K && return Conv1D!(vO, vB, vA);
    
    I = J + K - 1; #<! Output length
    
    @turbo for ii in 0:(K - 1) #<! Head
        sumVal = zero(T);
        for kk in 1:K
            ib0 = (ii >= kk);
            oa = ib0 ? vA[ii - kk + 1] : zero(T);
            sumVal += vB[kk] * oa;
        end
        vO[ii] = sumVal;
    end
    @turbo inline=true for ii in K:(J - 1) #<! Middle
        sumVal = zero(T);
        for kk in 1:K
            sumVal += vB[kk] * vA[ii - kk + 1];
        end
        vO[ii] = sumVal;
    end
    @turbo for ii in J:I #<! Tail
        sumVal = zero(T);
        for kk in 1:K
            ib0 = (ii < J + kk);
            oa = ib0 ? vA[ii - kk + 1] : zero(T);
            sumVal += vB[kk] * oa;
        end
        vO[ii] = sumVal;
    end
    return vO
end

This code will efficiently use the SIMD capabilities of the CPU (Assuming x64 CPU).

The main idea is to replace the series of if with 3 optimized cases for beginning of the signal, middle and the end.
This makes the code SIMD friendly.

Pay attention that it won't work well with @simd instead of @turbo.

Based on code by Chris Elrod I managed to come to this:

function Conv1D!( vO :: Vector{T}, vA :: Vector{T}, vB :: Vector{T} ) where {T <: Real}

    J = length(vA);
    K = length(vB); #<! Assumed to be the Kernel
    
    # Optimized for the case the kernel is in vB (Shorter)
    J < K && return Conv1D!(vO, vB, vA);
    
    I = J + K - 1; #<! Output length
    
    @turbo for ii in 1:(K - 1) #<! Head
        sumVal = zero(T);
        for kk in 1:K
            ib0 = (ii >= kk);
            oa = ib0 ? vA[ii - kk + 1] : zero(T);
            sumVal += vB[kk] * oa;
        end
        vO[ii] = sumVal;
    end
    @turbo inline=true for ii in K:(J - 1) #<! Middle
        sumVal = zero(T);
        for kk in 1:K
            sumVal += vB[kk] * vA[ii - kk + 1];
        end
        vO[ii] = sumVal;
    end
    @turbo for ii in J:I #<! Tail
        sumVal = zero(T);
        for kk in 1:K
            ib0 = (ii < J + kk);
            oa = ib0 ? vA[ii - kk + 1] : zero(T);
            sumVal += vB[kk] * oa;
        end
        vO[ii] = sumVal;
    end
    return vO
end

This code will efficiently use the SIMD capabilities of the CPU (Assuming x64 CPU).

The main idea is to replace the series of if with 3 optimized cases for beginning of the signal, middle and the end.
This makes the code SIMD friendly.

Pay attention that it won't work well with @simd instead of @turbo.

added 162 characters in body
Source Link
Royi
  • 582
  • 6
  • 21

Based on code by Chris Elrod I managed to come to this:

function Conv1D!( vO :: Vector{T}, vA :: Vector{T}, vB :: Vector{T} ) where {T <: Real}

    J = length(vA);
    K = length(vB); #<! Assumed to be the Kernel
    
    # Optimized for the case the kernel is in vB (Shorter)
    J < K && return Conv1D!(vO, vB, vA);
    
    I = J + K - 1; #<! Output length
    
    @turbo for ii in 0:(K - 1) #<! Head
        sumVal = zero(T);
        for kk in 1:K
            ib0 = (ii >= kk);
            oa = ib0 ? vA[ii - kk + 1] : zero(T);
            sumVal += vB[kk] * oa;
        end
        vO[ii] = sumVal;
    end
    @turbo inline=true for ii in K:(J - 1) #<! Middle
        sumVal = zero(T);
        for kk in 1:K
            sumVal += vB[kk] * vA[ii - kk + 1];
        end
        vO[ii] = sumVal;
    end
    @turbo for ii in J:I #<! Tail
        sumVal = zero(T);
        for kk in 1:K
            ib0 = (ii < J + kk);
            oa = ib0 ? vA[ii - kk + 1] : zero(T);
            sumVal += vB[kk] * oa;
        end
        vO[ii] = sumVal;
    end
    return vO
end

This code will efficiently use the SIMD capabilities of the CPU (Assuming x64 CPU).

The main idea is to replace the series of if with 3 optimized cases for beginning of the signal, middle and the end.
This makes the code SIMD friendly.

Pay attention that it won't work well with @simd instead of @turbo.

Based on code by Chris Elrod I managed to come to this:

function Conv1D!( vO :: Vector{T}, vA :: Vector{T}, vB :: Vector{T} ) where {T <: Real}

    J = length(vA);
    K = length(vB); #<! Assumed to be the Kernel
    
    # Optimized for the case the kernel is in vB (Shorter)
    J < K && return Conv1D!(vO, vB, vA);
    
    I = J + K - 1; #<! Output length
    
    @turbo for ii in 0:(K - 1) #<! Head
        sumVal = zero(T);
        for kk in 1:K
            ib0 = (ii >= kk);
            oa = ib0 ? vA[ii - kk + 1] : zero(T);
            sumVal += vB[kk] * oa;
        end
        vO[ii] = sumVal;
    end
    @turbo inline=true for ii in K:(J - 1) #<! Middle
        sumVal = zero(T);
        for kk in 1:K
            sumVal += vB[kk] * vA[ii - kk + 1];
        end
        vO[ii] = sumVal;
    end
    @turbo for ii in J:I #<! Tail
        sumVal = zero(T);
        for kk in 1:K
            ib0 = (ii < J + kk);
            oa = ib0 ? vA[ii - kk + 1] : zero(T);
            sumVal += vB[kk] * oa;
        end
        vO[ii] = sumVal;
    end
    return vO
end

This code will efficiently use the SIMD capabilities of the CPU (Assuming x64 CPU).

Pay attention that it won't work well with @simd instead of @turbo.

Based on code by Chris Elrod I managed to come to this:

function Conv1D!( vO :: Vector{T}, vA :: Vector{T}, vB :: Vector{T} ) where {T <: Real}

    J = length(vA);
    K = length(vB); #<! Assumed to be the Kernel
    
    # Optimized for the case the kernel is in vB (Shorter)
    J < K && return Conv1D!(vO, vB, vA);
    
    I = J + K - 1; #<! Output length
    
    @turbo for ii in 0:(K - 1) #<! Head
        sumVal = zero(T);
        for kk in 1:K
            ib0 = (ii >= kk);
            oa = ib0 ? vA[ii - kk + 1] : zero(T);
            sumVal += vB[kk] * oa;
        end
        vO[ii] = sumVal;
    end
    @turbo inline=true for ii in K:(J - 1) #<! Middle
        sumVal = zero(T);
        for kk in 1:K
            sumVal += vB[kk] * vA[ii - kk + 1];
        end
        vO[ii] = sumVal;
    end
    @turbo for ii in J:I #<! Tail
        sumVal = zero(T);
        for kk in 1:K
            ib0 = (ii < J + kk);
            oa = ib0 ? vA[ii - kk + 1] : zero(T);
            sumVal += vB[kk] * oa;
        end
        vO[ii] = sumVal;
    end
    return vO
end

This code will efficiently use the SIMD capabilities of the CPU (Assuming x64 CPU).

The main idea is to replace the series of if with 3 optimized cases for beginning of the signal, middle and the end.
This makes the code SIMD friendly.

Pay attention that it won't work well with @simd instead of @turbo.

added 57 characters in body
Source Link
Royi
  • 582
  • 6
  • 21

Based on code by Chris ElrodChris Elrod I managed to come to this:

function Conv1D!( vO :: Vector{T}, vA :: Vector{T}, vB :: Vector{T} ) where {T <: Real}

    J = length(vA);
    K = length(vB); #<! Assumed to be the Kernel
    
    # Optimized for the case the kernel is in vB (Shorter)
    J < K && return Conv1D!(vO, vB, vA);
    
    I = J + K - 1; #<! Output length
    
    @turbo for ii in 0:(K - 1) #<! Head
        sumVal = zero(T);
        for kk in 1:K
            ib0 = (ii >= kk);
            oa = ib0 ? vA[ii - kk + 1] : zero(T);
            sumVal += vB[kk] * oa;
        end
        vO[ii] = sumVal;
    end
    @turbo inline=true for ii in K:(J - 1) #<! Middle
        sumVal = zero(T);
        for kk in 1:K
            sumVal += vB[kk] * vA[ii - kk + 1];
        end
        vO[ii] = sumVal;
    end
    @turbo for ii in J:I #<! Tail
        sumVal = zero(T);
        for kk in 1:K
            ib0 = (ii < J + kk);
            oa = ib0 ? vA[ii - kk + 1] : zero(T);
            sumVal += vB[kk] * oa;
        end
        vO[ii] = sumVal;
    end
    return vO
end

This code will efficiently use the SIMD capabilities of the CPU (Assuming x64 CPU).

Pay attention that it won't work well with @simd instead of @turbo.

Based on code by Chris Elrod I managed to come to this:

function Conv1D!( vO :: Vector{T}, vA :: Vector{T}, vB :: Vector{T} ) where {T <: Real}

    J = length(vA);
    K = length(vB); #<! Assumed to be the Kernel
    
    # Optimized for the case the kernel is in vB (Shorter)
    J < K && return Conv1D!(vO, vB, vA);
    
    I = J + K - 1; #<! Output length
    
    @turbo for ii in 0:(K - 1) #<! Head
        sumVal = zero(T);
        for kk in 1:K
            ib0 = (ii >= kk);
            oa = ib0 ? vA[ii - kk + 1] : zero(T);
            sumVal += vB[kk] * oa;
        end
        vO[ii] = sumVal;
    end
    @turbo inline=true for ii in K:(J - 1) #<! Middle
        sumVal = zero(T);
        for kk in 1:K
            sumVal += vB[kk] * vA[ii - kk + 1];
        end
        vO[ii] = sumVal;
    end
    @turbo for ii in J:I #<! Tail
        sumVal = zero(T);
        for kk in 1:K
            ib0 = (ii < J + kk);
            oa = ib0 ? vA[ii - kk + 1] : zero(T);
            sumVal += vB[kk] * oa;
        end
        vO[ii] = sumVal;
    end
    return vO
end

This code will efficiently use the SIMD capabilities of the CPU (Assuming x64 CPU).

Pay attention that it won't work well with @simd instead of @turbo.

Based on code by Chris Elrod I managed to come to this:

function Conv1D!( vO :: Vector{T}, vA :: Vector{T}, vB :: Vector{T} ) where {T <: Real}

    J = length(vA);
    K = length(vB); #<! Assumed to be the Kernel
    
    # Optimized for the case the kernel is in vB (Shorter)
    J < K && return Conv1D!(vO, vB, vA);
    
    I = J + K - 1; #<! Output length
    
    @turbo for ii in 0:(K - 1) #<! Head
        sumVal = zero(T);
        for kk in 1:K
            ib0 = (ii >= kk);
            oa = ib0 ? vA[ii - kk + 1] : zero(T);
            sumVal += vB[kk] * oa;
        end
        vO[ii] = sumVal;
    end
    @turbo inline=true for ii in K:(J - 1) #<! Middle
        sumVal = zero(T);
        for kk in 1:K
            sumVal += vB[kk] * vA[ii - kk + 1];
        end
        vO[ii] = sumVal;
    end
    @turbo for ii in J:I #<! Tail
        sumVal = zero(T);
        for kk in 1:K
            ib0 = (ii < J + kk);
            oa = ib0 ? vA[ii - kk + 1] : zero(T);
            sumVal += vB[kk] * oa;
        end
        vO[ii] = sumVal;
    end
    return vO
end

This code will efficiently use the SIMD capabilities of the CPU (Assuming x64 CPU).

Pay attention that it won't work well with @simd instead of @turbo.

Source Link
Royi
  • 582
  • 6
  • 21
Loading