Revisions to Implementing a 1D Convolution SIMD Friendly in Julia

edited body

Source Link

edited Jul 10, 2023 at 10:25

Royi

582
6
21

Based on code by Chris Elrod I managed to come to this:

"""
    Conv1D!(vO, vA, vB)

Write the full linear convolution of `vA` and `vB` into `vO` and return `vO`.

Indices `1:(length(vA) + length(vB) - 1)` of `vO` are written. The shorter input is
used as the kernel: when `vA` is shorter than `vB` the call is repeated with the two
swapped. The length of `vO` is not checked here.

The output range is processed in three `@turbo` loops (head, middle, tail) so that only
the two boundary loops need a conditional load.
"""
function Conv1D!( vO :: Vector{T}, vA :: Vector{T}, vB :: Vector{T} ) where {T <: Real}

    numSamplesA = length(vA)
    numSamplesB = length(vB) # vB plays the role of the kernel

    # The loops below are laid out for a kernel that is not longer than the signal
    if numSamplesA < numSamplesB
        return Conv1D!(vO, vB, vA)
    end

    numSamplesO = numSamplesA + numSamplesB - 1 # last output index

    # Head: the kernel only partially overlaps the start of vA
    @turbo for outIdx in 1:(numSamplesB - 1)
        acc = zero(T)
        for kerIdx in 1:numSamplesB
            isInside = (outIdx >= kerIdx)
            valA = isInside ? vA[outIdx - kerIdx + 1] : zero(T)
            acc += vB[kerIdx] * valA
        end
        vO[outIdx] = acc
    end

    # Middle: every kernel tap lands inside vA, so no condition is needed
    @turbo inline=true for outIdx in numSamplesB:(numSamplesA - 1)
        acc = zero(T)
        for kerIdx in 1:numSamplesB
            acc += vB[kerIdx] * vA[outIdx - kerIdx + 1]
        end
        vO[outIdx] = acc
    end

    # Tail: the kernel runs past the end of vA
    @turbo for outIdx in numSamplesA:numSamplesO
        acc = zero(T)
        for kerIdx in 1:numSamplesB
            isInside = (outIdx < numSamplesA + kerIdx)
            valA = isInside ? vA[outIdx - kerIdx + 1] : zero(T)
            acc += vB[kerIdx] * valA
        end
        vO[outIdx] = acc
    end

    return vO
end

This code will efficiently use the SIMD capabilities of the CPU (Assuming x64 CPU).

The main idea is to replace the series of if with 3 optimized cases for beginning of the signal, middle and the end.
This makes the code SIMD friendly.

Pay attention that it won't work well with @simd instead of @turbo.

Based on code by Chris Elrod I managed to come to this:

"""
    Conv1D!(vO, vA, vB) -> vO

Compute the full linear convolution of `vA` and `vB` and store it in `vO`.

The result occupies `vO[1:(length(vA) + length(vB) - 1)]`. The shorter of the two
inputs is used as the kernel; when `vA` is the shorter one the arguments are swapped.
If either input is empty there is nothing to accumulate and `vO` is returned untouched.

# Arguments
- `vO`: output buffer, at least `length(vA) + length(vB) - 1` elements long.
- `vA`: signal.
- `vB`: kernel.

# Throws
- `DimensionMismatch` if `vO` is too short. The loops run under `@turbo`, which
  performs no bounds checking, so the size has to be verified up front.
"""
function Conv1D!( vO :: Vector{T}, vA :: Vector{T}, vB :: Vector{T} ) where {T <: Real}

    J = length(vA);
    K = length(vB); #<! Assumed to be the Kernel
    
    # Optimized for the case the kernel is in vB (Shorter)
    J < K && return Conv1D!(vO, vB, vA);
    
    # Empty kernel: no output sample is defined (also keeps the middle loop off index 0)
    K == 0 && return vO;
    
    I = J + K - 1; #<! Output length
    
    length(vO) >= I || throw(DimensionMismatch(
        "vO must have at least $(I) elements, got $(length(vO))"));
    
    # Julia arrays are 1-based: the head must start at 1, not 0
    @turbo for ii in 1:(K - 1) #<! Head
        sumVal = zero(T);
        for kk in 1:K
            ib0 = (ii >= kk); #<! vA[ii - kk + 1] is in range only when ii >= kk
            oa = ib0 ? vA[ii - kk + 1] : zero(T);
            sumVal += vB[kk] * oa;
        end
        vO[ii] = sumVal;
    end
    @turbo inline=true for ii in K:(J - 1) #<! Middle
        sumVal = zero(T);
        for kk in 1:K
            sumVal += vB[kk] * vA[ii - kk + 1];
        end
        vO[ii] = sumVal;
    end
    @turbo for ii in J:I #<! Tail
        sumVal = zero(T);
        for kk in 1:K
            ib0 = (ii < J + kk); #<! vA[ii - kk + 1] is in range only when ii - kk + 1 <= J
            oa = ib0 ? vA[ii - kk + 1] : zero(T);
            sumVal += vB[kk] * oa;
        end
        vO[ii] = sumVal;
    end
    return vO
end

This code will efficiently use the SIMD capabilities of the CPU (Assuming x64 CPU).

The main idea is to replace the series of if with 3 optimized cases for beginning of the signal, middle and the end.
This makes the code SIMD friendly.

Pay attention that it won't work well with @simd instead of @turbo.

Based on code by Chris Elrod I managed to come to this:

"""
    Conv1D!(vO, vA, vB)

Store the full linear convolution of `vA` with `vB` in `vO` and return `vO`.

Output indices `1:(length(vA) + length(vB) - 1)` are written; the size of `vO` is not
validated. When `vA` is shorter than `vB` the function calls itself with the two
swapped, so the inner loop always runs over the shorter vector.

Three `@turbo` loops cover the start, the interior and the end of the output; only the
first and last need a conditional load from `vA`.
"""
function Conv1D!( vO :: Vector{T}, vA :: Vector{T}, vB :: Vector{T} ) where {T <: Real}

    lenA = length(vA)
    lenK = length(vB) # vB is treated as the kernel

    # Keep the shorter vector in the kernel slot
    if lenA < lenK
        return Conv1D!(vO, vB, vA)
    end

    lenO = lenA + lenK - 1 # index of the last output sample

    # Start of the signal: taps with k > n would read before vA[1]
    @turbo for n in 1:(lenK - 1)
        acc = zero(T)
        for k in 1:lenK
            inRange = (n >= k)
            sample = inRange ? vA[n - k + 1] : zero(T)
            acc += vB[k] * sample
        end
        vO[n] = acc
    end

    # Interior: all taps read valid samples
    @turbo inline=true for n in lenK:(lenA - 1)
        acc = zero(T)
        for k in 1:lenK
            acc += vB[k] * vA[n - k + 1]
        end
        vO[n] = acc
    end

    # End of the signal: taps with n - k + 1 > lenA would read past vA[end]
    @turbo for n in lenA:lenO
        acc = zero(T)
        for k in 1:lenK
            inRange = (n < lenA + k)
            sample = inRange ? vA[n - k + 1] : zero(T)
            acc += vB[k] * sample
        end
        vO[n] = acc
    end

    return vO
end

This code will efficiently use the SIMD capabilities of the CPU (Assuming x64 CPU).

The main idea is to replace the series of if with 3 optimized cases for beginning of the signal, middle and the end.
This makes the code SIMD friendly.

Pay attention that it won't work well with @simd instead of @turbo.

added 162 characters in body

Source Link

edited May 19, 2023 at 6:03

Royi

582
6
21

Based on code by Chris Elrod I managed to come to this:

"""
    Conv1D!(vO, vA, vB) -> vO

Compute the full linear convolution of `vA` and `vB` and store it in `vO`.

The result occupies `vO[1:(length(vA) + length(vB) - 1)]`. The shorter of the two
inputs is used as the kernel; when `vA` is the shorter one the arguments are swapped.
If either input is empty there is nothing to accumulate and `vO` is returned untouched.

# Arguments
- `vO`: output buffer, at least `length(vA) + length(vB) - 1` elements long.
- `vA`: signal.
- `vB`: kernel.

# Throws
- `DimensionMismatch` if `vO` is too short. The loops run under `@turbo`, which
  performs no bounds checking, so the size has to be verified up front.
"""
function Conv1D!( vO :: Vector{T}, vA :: Vector{T}, vB :: Vector{T} ) where {T <: Real}

    J = length(vA);
    K = length(vB); #<! Assumed to be the Kernel
    
    # Optimized for the case the kernel is in vB (Shorter)
    J < K && return Conv1D!(vO, vB, vA);
    
    # Empty kernel: no output sample is defined (also keeps the middle loop off index 0)
    K == 0 && return vO;
    
    I = J + K - 1; #<! Output length
    
    length(vO) >= I || throw(DimensionMismatch(
        "vO must have at least $(I) elements, got $(length(vO))"));
    
    # Julia arrays are 1-based: the head must start at 1, not 0
    @turbo for ii in 1:(K - 1) #<! Head
        sumVal = zero(T);
        for kk in 1:K
            ib0 = (ii >= kk); #<! vA[ii - kk + 1] is in range only when ii >= kk
            oa = ib0 ? vA[ii - kk + 1] : zero(T);
            sumVal += vB[kk] * oa;
        end
        vO[ii] = sumVal;
    end
    @turbo inline=true for ii in K:(J - 1) #<! Middle
        sumVal = zero(T);
        for kk in 1:K
            sumVal += vB[kk] * vA[ii - kk + 1];
        end
        vO[ii] = sumVal;
    end
    @turbo for ii in J:I #<! Tail
        sumVal = zero(T);
        for kk in 1:K
            ib0 = (ii < J + kk); #<! vA[ii - kk + 1] is in range only when ii - kk + 1 <= J
            oa = ib0 ? vA[ii - kk + 1] : zero(T);
            sumVal += vB[kk] * oa;
        end
        vO[ii] = sumVal;
    end
    return vO
end

This code will efficiently use the SIMD capabilities of the CPU (Assuming x64 CPU).

The main idea is to replace the series of if with 3 optimized cases for beginning of the signal, middle and the end.
This makes the code SIMD friendly.

Pay attention that it won't work well with @simd instead of @turbo.

Based on code by Chris Elrod I managed to come to this:

"""
    Conv1D!(vO, vA, vB) -> vO

Compute the full linear convolution of `vA` and `vB` and store it in `vO`.

The result occupies `vO[1:(length(vA) + length(vB) - 1)]`. The shorter of the two
inputs is used as the kernel; when `vA` is the shorter one the arguments are swapped.
If either input is empty there is nothing to accumulate and `vO` is returned untouched.

# Arguments
- `vO`: output buffer, at least `length(vA) + length(vB) - 1` elements long.
- `vA`: signal.
- `vB`: kernel.

# Throws
- `DimensionMismatch` if `vO` is too short. The loops run under `@turbo`, which
  performs no bounds checking, so the size has to be verified up front.
"""
function Conv1D!( vO :: Vector{T}, vA :: Vector{T}, vB :: Vector{T} ) where {T <: Real}

    J = length(vA);
    K = length(vB); #<! Assumed to be the Kernel
    
    # Optimized for the case the kernel is in vB (Shorter)
    J < K && return Conv1D!(vO, vB, vA);
    
    # Empty kernel: no output sample is defined (also keeps the middle loop off index 0)
    K == 0 && return vO;
    
    I = J + K - 1; #<! Output length
    
    length(vO) >= I || throw(DimensionMismatch(
        "vO must have at least $(I) elements, got $(length(vO))"));
    
    # Julia arrays are 1-based: the head must start at 1, not 0
    @turbo for ii in 1:(K - 1) #<! Head
        sumVal = zero(T);
        for kk in 1:K
            ib0 = (ii >= kk); #<! vA[ii - kk + 1] is in range only when ii >= kk
            oa = ib0 ? vA[ii - kk + 1] : zero(T);
            sumVal += vB[kk] * oa;
        end
        vO[ii] = sumVal;
    end
    @turbo inline=true for ii in K:(J - 1) #<! Middle
        sumVal = zero(T);
        for kk in 1:K
            sumVal += vB[kk] * vA[ii - kk + 1];
        end
        vO[ii] = sumVal;
    end
    @turbo for ii in J:I #<! Tail
        sumVal = zero(T);
        for kk in 1:K
            ib0 = (ii < J + kk); #<! vA[ii - kk + 1] is in range only when ii - kk + 1 <= J
            oa = ib0 ? vA[ii - kk + 1] : zero(T);
            sumVal += vB[kk] * oa;
        end
        vO[ii] = sumVal;
    end
    return vO
end

This code will efficiently use the SIMD capabilities of the CPU (Assuming x64 CPU).

Pay attention that it won't work well with @simd instead of @turbo.

Based on code by Chris Elrod I managed to come to this:

"""
    Conv1D!(vO, vA, vB) -> vO

Compute the full linear convolution of `vA` and `vB` and store it in `vO`.

The result occupies `vO[1:(length(vA) + length(vB) - 1)]`. The shorter of the two
inputs is used as the kernel; when `vA` is the shorter one the arguments are swapped.
If either input is empty there is nothing to accumulate and `vO` is returned untouched.

# Arguments
- `vO`: output buffer, at least `length(vA) + length(vB) - 1` elements long.
- `vA`: signal.
- `vB`: kernel.

# Throws
- `DimensionMismatch` if `vO` is too short. The loops run under `@turbo`, which
  performs no bounds checking, so the size has to be verified up front.
"""
function Conv1D!( vO :: Vector{T}, vA :: Vector{T}, vB :: Vector{T} ) where {T <: Real}

    J = length(vA);
    K = length(vB); #<! Assumed to be the Kernel
    
    # Optimized for the case the kernel is in vB (Shorter)
    J < K && return Conv1D!(vO, vB, vA);
    
    # Empty kernel: no output sample is defined (also keeps the middle loop off index 0)
    K == 0 && return vO;
    
    I = J + K - 1; #<! Output length
    
    length(vO) >= I || throw(DimensionMismatch(
        "vO must have at least $(I) elements, got $(length(vO))"));
    
    # Julia arrays are 1-based: the head must start at 1, not 0
    @turbo for ii in 1:(K - 1) #<! Head
        sumVal = zero(T);
        for kk in 1:K
            ib0 = (ii >= kk); #<! vA[ii - kk + 1] is in range only when ii >= kk
            oa = ib0 ? vA[ii - kk + 1] : zero(T);
            sumVal += vB[kk] * oa;
        end
        vO[ii] = sumVal;
    end
    @turbo inline=true for ii in K:(J - 1) #<! Middle
        sumVal = zero(T);
        for kk in 1:K
            sumVal += vB[kk] * vA[ii - kk + 1];
        end
        vO[ii] = sumVal;
    end
    @turbo for ii in J:I #<! Tail
        sumVal = zero(T);
        for kk in 1:K
            ib0 = (ii < J + kk); #<! vA[ii - kk + 1] is in range only when ii - kk + 1 <= J
            oa = ib0 ? vA[ii - kk + 1] : zero(T);
            sumVal += vB[kk] * oa;
        end
        vO[ii] = sumVal;
    end
    return vO
end

This code will efficiently use the SIMD capabilities of the CPU (Assuming x64 CPU).

The main idea is to replace the series of if with 3 optimized cases for beginning of the signal, middle and the end.
This makes the code SIMD friendly.

Pay attention that it won't work well with @simd instead of @turbo.

added 57 characters in body

Source Link

edited May 13, 2023 at 18:27

Royi

582
6
21

Based on code by Chris Elrod I managed to come to this:

"""
    Conv1D!(vO, vA, vB) -> vO

Compute the full linear convolution of `vA` and `vB` and store it in `vO`.

The result occupies `vO[1:(length(vA) + length(vB) - 1)]`. The shorter of the two
inputs is used as the kernel; when `vA` is the shorter one the arguments are swapped.
If either input is empty there is nothing to accumulate and `vO` is returned untouched.

# Arguments
- `vO`: output buffer, at least `length(vA) + length(vB) - 1` elements long.
- `vA`: signal.
- `vB`: kernel.

# Throws
- `DimensionMismatch` if `vO` is too short. The loops run under `@turbo`, which
  performs no bounds checking, so the size has to be verified up front.
"""
function Conv1D!( vO :: Vector{T}, vA :: Vector{T}, vB :: Vector{T} ) where {T <: Real}

    J = length(vA);
    K = length(vB); #<! Assumed to be the Kernel
    
    # Optimized for the case the kernel is in vB (Shorter)
    J < K && return Conv1D!(vO, vB, vA);
    
    # Empty kernel: no output sample is defined (also keeps the middle loop off index 0)
    K == 0 && return vO;
    
    I = J + K - 1; #<! Output length
    
    length(vO) >= I || throw(DimensionMismatch(
        "vO must have at least $(I) elements, got $(length(vO))"));
    
    # Julia arrays are 1-based: the head must start at 1, not 0
    @turbo for ii in 1:(K - 1) #<! Head
        sumVal = zero(T);
        for kk in 1:K
            ib0 = (ii >= kk); #<! vA[ii - kk + 1] is in range only when ii >= kk
            oa = ib0 ? vA[ii - kk + 1] : zero(T);
            sumVal += vB[kk] * oa;
        end
        vO[ii] = sumVal;
    end
    @turbo inline=true for ii in K:(J - 1) #<! Middle
        sumVal = zero(T);
        for kk in 1:K
            sumVal += vB[kk] * vA[ii - kk + 1];
        end
        vO[ii] = sumVal;
    end
    @turbo for ii in J:I #<! Tail
        sumVal = zero(T);
        for kk in 1:K
            ib0 = (ii < J + kk); #<! vA[ii - kk + 1] is in range only when ii - kk + 1 <= J
            oa = ib0 ? vA[ii - kk + 1] : zero(T);
            sumVal += vB[kk] * oa;
        end
        vO[ii] = sumVal;
    end
    return vO
end

This code will efficiently use the SIMD capabilities of the CPU (Assuming x64 CPU).

Pay attention that it won't work well with @simd instead of @turbo.

Based on code by Chris Elrod I managed to come to this:

"""
    Conv1D!(vO, vA, vB) -> vO

Compute the full linear convolution of `vA` and `vB` and store it in `vO`.

The result occupies `vO[1:(length(vA) + length(vB) - 1)]`. The shorter of the two
inputs is used as the kernel; when `vA` is the shorter one the arguments are swapped.
If either input is empty there is nothing to accumulate and `vO` is returned untouched.

# Arguments
- `vO`: output buffer, at least `length(vA) + length(vB) - 1` elements long.
- `vA`: signal.
- `vB`: kernel.

# Throws
- `DimensionMismatch` if `vO` is too short. The loops run under `@turbo`, which
  performs no bounds checking, so the size has to be verified up front.
"""
function Conv1D!( vO :: Vector{T}, vA :: Vector{T}, vB :: Vector{T} ) where {T <: Real}

    J = length(vA);
    K = length(vB); #<! Assumed to be the Kernel
    
    # Optimized for the case the kernel is in vB (Shorter)
    J < K && return Conv1D!(vO, vB, vA);
    
    # Empty kernel: no output sample is defined (also keeps the middle loop off index 0)
    K == 0 && return vO;
    
    I = J + K - 1; #<! Output length
    
    length(vO) >= I || throw(DimensionMismatch(
        "vO must have at least $(I) elements, got $(length(vO))"));
    
    # Julia arrays are 1-based: the head must start at 1, not 0
    @turbo for ii in 1:(K - 1) #<! Head
        sumVal = zero(T);
        for kk in 1:K
            ib0 = (ii >= kk); #<! vA[ii - kk + 1] is in range only when ii >= kk
            oa = ib0 ? vA[ii - kk + 1] : zero(T);
            sumVal += vB[kk] * oa;
        end
        vO[ii] = sumVal;
    end
    @turbo inline=true for ii in K:(J - 1) #<! Middle
        sumVal = zero(T);
        for kk in 1:K
            sumVal += vB[kk] * vA[ii - kk + 1];
        end
        vO[ii] = sumVal;
    end
    @turbo for ii in J:I #<! Tail
        sumVal = zero(T);
        for kk in 1:K
            ib0 = (ii < J + kk); #<! vA[ii - kk + 1] is in range only when ii - kk + 1 <= J
            oa = ib0 ? vA[ii - kk + 1] : zero(T);
            sumVal += vB[kk] * oa;
        end
        vO[ii] = sumVal;
    end
    return vO
end

This code will efficiently use the SIMD capabilities of the CPU (Assuming x64 CPU).

Pay attention that it won't work well with @simd instead of @turbo.

Based on code by Chris Elrod I managed to come to this:

"""
    Conv1D!(vO, vA, vB) -> vO

Compute the full linear convolution of `vA` and `vB` and store it in `vO`.

The result occupies `vO[1:(length(vA) + length(vB) - 1)]`. The shorter of the two
inputs is used as the kernel; when `vA` is the shorter one the arguments are swapped.
If either input is empty there is nothing to accumulate and `vO` is returned untouched.

# Arguments
- `vO`: output buffer, at least `length(vA) + length(vB) - 1` elements long.
- `vA`: signal.
- `vB`: kernel.

# Throws
- `DimensionMismatch` if `vO` is too short. The loops run under `@turbo`, which
  performs no bounds checking, so the size has to be verified up front.
"""
function Conv1D!( vO :: Vector{T}, vA :: Vector{T}, vB :: Vector{T} ) where {T <: Real}

    J = length(vA);
    K = length(vB); #<! Assumed to be the Kernel
    
    # Optimized for the case the kernel is in vB (Shorter)
    J < K && return Conv1D!(vO, vB, vA);
    
    # Empty kernel: no output sample is defined (also keeps the middle loop off index 0)
    K == 0 && return vO;
    
    I = J + K - 1; #<! Output length
    
    length(vO) >= I || throw(DimensionMismatch(
        "vO must have at least $(I) elements, got $(length(vO))"));
    
    # Julia arrays are 1-based: the head must start at 1, not 0
    @turbo for ii in 1:(K - 1) #<! Head
        sumVal = zero(T);
        for kk in 1:K
            ib0 = (ii >= kk); #<! vA[ii - kk + 1] is in range only when ii >= kk
            oa = ib0 ? vA[ii - kk + 1] : zero(T);
            sumVal += vB[kk] * oa;
        end
        vO[ii] = sumVal;
    end
    @turbo inline=true for ii in K:(J - 1) #<! Middle
        sumVal = zero(T);
        for kk in 1:K
            sumVal += vB[kk] * vA[ii - kk + 1];
        end
        vO[ii] = sumVal;
    end
    @turbo for ii in J:I #<! Tail
        sumVal = zero(T);
        for kk in 1:K
            ib0 = (ii < J + kk); #<! vA[ii - kk + 1] is in range only when ii - kk + 1 <= J
            oa = ib0 ? vA[ii - kk + 1] : zero(T);
            sumVal += vB[kk] * oa;
        end
        vO[ii] = sumVal;
    end
    return vO
end

This code will efficiently use the SIMD capabilities of the CPU (Assuming x64 CPU).

Pay attention that it won't work well with @simd instead of @turbo.

Source Link

answered May 13, 2023 at 15:34

Royi

582
6
21

Loading

Stack Exchange Network

Return to Answer