Building on code by Chris Elrod, I arrived at the following implementation:
"""
    Conv1D!(vO, vA, vB)

Compute the full linear convolution of `vA` and `vB` and store it in `vO`.

The output sample `ii` is `sum(vB[kk] * vA[ii - kk + 1] for kk in 1:K)`, where terms
whose `vA` index falls outside `1:length(vA)` are treated as zero.

# Arguments
- `vO`: Output buffer. Must hold at least `length(vA) + length(vB) - 1` elements. Only
  the first `length(vA) + length(vB) - 1` elements are written.
- `vA`: Signal.
- `vB`: Kernel. If `vB` is longer than `vA` the two are swapped internally.

# Returns
- `vO`, after being written in place.

# Throws
- `DimensionMismatch`: If `vO` is shorter than `length(vA) + length(vB) - 1`.

# Notes
- If either input is empty, `vO` is returned without being modified.
- The loops use `@turbo`, which does not check array bounds. Hence the sizes are
  validated before any loop runs.
"""
function Conv1D!( vO :: Vector{T}, vA :: Vector{T}, vB :: Vector{T} ) where {T <: Real}

    J = length(vA);
    K = length(vB); #<! Assumed to be the kernel (The shorter input)

    # The loops are optimized for the kernel being in `vB` (Shorter).
    # The sum is symmetric in its arguments, so swapping them gives the same result.
    J < K && return Conv1D!(vO, vB, vA);

    # From here `K <= J`, so `K == 0` means at least one input is empty.
    # Without this guard the middle loop range `K:(J - 1)` would start at 0
    # and write to `vO[0]`, which is out of bounds.
    K == 0 && return vO;

    I = J + K - 1; #<! Output length

    # `@turbo` does not check bounds: a short `vO` would be written past its end.
    length(vO) >= I || throw(DimensionMismatch(
        "output length $(length(vO)) is shorter than the convolution length $(I)"));

    @turbo for ii in 1:(K - 1) #<! Head: Kernel partially overlaps the start of `vA`
        sumVal = zero(T);
        for kk in 1:K
            ib0 = (ii >= kk); #<! True when `ii - kk + 1 >= 1`
            oa = ib0 ? vA[ii - kk + 1] : zero(T);
            sumVal += vB[kk] * oa;
        end
        vO[ii] = sumVal;
    end

    @turbo inline=true for ii in K:(J - 1) #<! Middle: Full overlap, no boundary condition
        sumVal = zero(T);
        for kk in 1:K
            sumVal += vB[kk] * vA[ii - kk + 1];
        end
        vO[ii] = sumVal;
    end

    @turbo for ii in J:I #<! Tail: Kernel partially overlaps the end of `vA`
        sumVal = zero(T);
        for kk in 1:K
            ib0 = (ii < J + kk); #<! True when `ii - kk + 1 <= J`
            oa = ib0 ? vA[ii - kk + 1] : zero(T);
            sumVal += vB[kk] * oa;
        end
        vO[ii] = sumVal;
    end

    return vO

end
This code makes efficient use of the CPU's SIMD capabilities (assuming an x64 CPU).
The main idea is to replace the series of `if` checks with 3 optimized loops: one for the beginning of the signal (head), one for the middle and one for the end (tail). Only the head and tail loops keep a boundary condition; the middle loop has none.
This makes the code SIMD friendly.
Note that it won't perform well if `@turbo` is replaced with `@simd`.