I want to implement a 1D convolution in Julia using the direct calculation since the conv() function in DSP.jl uses DFT (fft) based methods.
In order to make it fast, I'd like the code to be SIMD friendly (I am OK with using LoopVectorization.jl).
The trivial code is:
using BenchmarkTools;
function _Conv1D!( vO :: Array{T, 1}, vA :: Array{T, 1}, vB :: Array{T, 1} ) :: Array{T, 1} where {T <: Real}
lenA = length(vA);
lenB = length(vB);
fill!(vO, zero(T));
for idxB in 1:lenB
@simd for idxA in 1:lenA
@inbounds vO[idxA + idxB - 1] += vA[idxA] * vB[idxB];
end
end
return vO;
end
function _Conv1D( vA :: Array{T, 1}, vB :: Array{T, 1} ) :: Array{T, 1} where {T <: Real}
lenA = length(vA);
lenB = length(vB);
vO = Array{T, 1}(undef, lenA + lenB - 1);
return _Conv1D!(vO, vA, vB);
end
numSamplesA = 1000;
numSamplesB = 15;
vA = rand(numSamplesA);
vB = rand(numSamplesB);
vO = rand(numSamplesA + numSamplesB - 1);
@benchmark _Conv1D($vA, $vB)
I get:
BenchmarkTools.Trial: 10000 samples with 9 evaluations.
Range (min … max): 2.522 μs … 558.844 μs ┊ GC (min … max): 0.00% … 99.28%
Time (median): 3.333 μs ┊ GC (median): 0.00%
Time (mean ± σ): 4.321 μs ± 13.718 μs ┊ GC (mean ± σ): 10.38% ± 3.67%
██▆ ▂
▃▄▃███▆▅▇▆▄▂▃▇██▄▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▃▃▃▃▂▂▁▁▁▁▁▁▁▁ ▂
2.52 μs Histogram: frequency by time 8.51 μs <
Memory estimate: 8.06 KiB, allocs estimate: 1.
I read Chris Elrod's post Orthogonalize Indices. Basically I tried removing the inner loop and got something like:
function Conv1D!( vO :: Array{T, 1}, vA :: Array{T, 1}, vB :: Array{T, 1} ) :: Array{T, 1} where {T <: Real}
lenA = length(vA);
lenB = length(vB);
lenO = length(vO);
vC = view(vB, lenB:-1:1);
@simd for ii in 1:lenO
# Rolling vB over vA
startIdxA = max(1, ii - lenB + 1);
endIdxA = min(lenA, ii);
startIdxC = max(lenB - ii + 1, 1);
endIdxC = min(lenB, lenO - ii + 1);
# println("startA = $startIdxA, endA = $endIdxA, startC = $startIdxC, endC = $endIdxC");
@inbounds vO[ii] = sum(view(vA, startIdxA:endIdxA) .* view(vC, startIdxC:endIdxC));
end
return vO;
end
function Conv1D( vA :: Array{T, 1}, vB :: Array{T, 1} ) :: Array{T, 1} where {T <: Real}
lenA = length(vA);
lenB = length(vB);
vO = Array{T, 1}(undef, lenA + lenB - 1);
return Conv1D!(vO, vA, vB);
end
numSamplesA = 1000;
numSamplesB = 15;
vA = rand(numSamplesA);
vB = rand(numSamplesB);
vO = rand(numSamplesA + numSamplesB - 1);
@benchmark Conv1D($vA, $vB)
Yet I get much much slower results.
Is there anything I can do to improve results any farther? Maybe something to make the code much more SIMD friendly?
A Godbold link for _Conv1D!(): https://godbolt.org/z/e8W7e473h.
Remark: Originally, in Conv1D!(), I used @inbounds vO[ii] = dot(view(vA, startIdxA:endIdxA), view(vC, startIdxC:endIdxC));. It was slower.
195.214 μs (0 allocations: 0 bytes)B:1.700 ms (1999 allocations: 7.90 MiB)You're generating a load of temporary arrays. \$\endgroup\$