filt perf optimizations (#516)

* create `si` only once, and reuse * no inbounds faster for N < 24 * @generated `_small_filt_fir!` - use Base.Cartesian.@ntuple * inbounds if N > 18 major speedups for N > 18, much slower if inbounds for small N * reorder store in `_filt_fir` and `_filt_iir` >20% faster for fir * maybe prevent undefvarerror no need to assign values to enum * small things - eachindex stuff - `==` to `===` nothing - `npairs = max(nz, np)` * isone, iszero, etc. * `filt` for `FIRFilter`: remove redundant definitions * extract type instability from loop * remove fft conv type instability - in `unsafe_conv_kern_os!` - probably no real impact * correct `filt` arg types * nicer loop * inv plan * cleanup filt_order * use `muladd`s in filt.jl * `_prod_freq` * more `muladd`s * `fill!`, `reverse`, don't broadcast scalar / * micro-optimizations, cosmetics - reduced exceptions in `@code_llvm` for `N < 18` - `@noinline mul!`, `broadcast`, only for julia > v1.8 - name `SMALL_FILT_VECT_CUTOFF`, selective bounds check * Compat noinline * add test for error (cherry picked from commit 9de4c0c)
JuliaDSP · Sep 13, 2024 · d2e69ee · d2e69ee
1 parent 26ca32e
commit d2e69ee
Showing 12 changed files with 257 additions and 293 deletions.
diff --git a/src/DSP.jl b/src/DSP.jl
@@ -3,6 +3,7 @@ module DSP
 using FFTW
 using LinearAlgebra: mul!, rmul!
 using IterTools: subsets
+using Compat: Compat
 
 export conv, deconv, filt, filt!, xcorr
 

diff --git a/src/Filters/design.jl b/src/Filters/design.jl
@@ -86,9 +86,7 @@ function Chebyshev2(::Type{T}, n::Integer, ripple::Real) where {T<:Real}
 
     ε = 1/sqrt(10^(convert(T, ripple)/10)-1)
     p = chebyshev_poles(T, n, ε)
-    for i = 1:length(p)
-        p[i] = inv(p[i])
-    end
+    map!(inv, p, p)
 
     z = zeros(Complex{T}, n-isodd(n))
     k = one(T)
@@ -155,7 +153,7 @@ function asne(w::Number, k::Real)
         # Eq. (50)
         k = abs2(k/(1+sqrt(1-abs2(k))))
         # Eq. (56)
-        w = 2*w/((1+k)*(1+sqrt(1-abs2(kold)*w^2)))
+        w = 2w / ((1 + k) * (1 + sqrt(muladd(-abs2(kold), w^2, 1))))
     end
     2*asin(w)/π
 end
@@ -355,9 +353,9 @@ function transform_prototype(ftype::Bandpass, proto::ZeroPoleGain{:s})
     newz = zeros(TR, 2*nz+np-ncommon)
     newp = zeros(TR, 2*np+nz-ncommon)
     for (oldc, newc) in ((p, newp), (z, newz))
-        for i = 1:length(oldc)
+        for i in eachindex(oldc)
             b = oldc[i] * ((ftype.w2 - ftype.w1)/2)
-            pm = sqrt(b^2 - ftype.w2 * ftype.w1)
+            pm = sqrt(muladd(-ftype.w2, ftype.w1, b^2))
             newc[2i-1] = b + pm
             newc[2i] = b - pm
         end
@@ -372,25 +370,25 @@ function transform_prototype(ftype::Bandstop, proto::ZeroPoleGain{:s})
     k = proto.k
     nz = length(z)
     np = length(p)
-    npairs = nz+np-min(nz, np)
+    npairs = max(nz, np)
     TR = Base.promote_eltype(z, p)
     newz = Vector{TR}(undef, 2*npairs)
     newp = Vector{TR}(undef, 2*npairs)
 
     num = one(eltype(z))
     for i = 1:nz
         num *= -z[i]
-        b = (ftype.w2 - ftype.w1)/2/z[i]
-        pm = sqrt(b^2 - ftype.w2 * ftype.w1)
+        b = (ftype.w2 - ftype.w1)/2z[i]
+        pm = sqrt(muladd(-ftype.w2, ftype.w1, b^2))
         newz[2i-1] = b - pm
         newz[2i] = b + pm
     end
 
     den = one(eltype(p))
     for i = 1:np
         den *= -p[i]
-        b = (ftype.w2 - ftype.w1)/2/p[i]
-        pm = sqrt(b^2 - ftype.w2 * ftype.w1)
+        b = (ftype.w2 - ftype.w1)/2p[i]
+        pm = sqrt(muladd(-ftype.w2, ftype.w1, b^2))
         newp[2i-1] = b - pm
         newp[2i] = b + pm
     end

diff --git a/src/Filters/filt.jl b/src/Filters/filt.jl
@@ -6,7 +6,7 @@
 
 
 ## PolynomialRatio
-_zerosi(f::PolynomialRatio{:z,T}, x::AbstractArray{S}) where {T,S} =
+_zerosi(f::PolynomialRatio{:z,T}, ::AbstractArray{S}) where {T,S} =
     zeros(promote_type(T, S), max(-firstindex(f.a), -firstindex(f.b)))
 
 """
@@ -35,7 +35,7 @@ selected based on the data and filter length.
 filt(f::PolynomialRatio{:z}, x, si=_zerosi(f, x)) = filt(coefb(f), coefa(f), x, si)
 
 ## SecondOrderSections
-_zerosi(f::SecondOrderSections{:z,T,G}, x::AbstractArray{S}) where {T,G,S} =
+_zerosi(f::SecondOrderSections{:z,T,G}, ::AbstractArray{S}) where {T,G,S} =
     zeros(promote_type(T, G, S), 2, length(f.biquads))
 
 # filt! algorithm (no checking, returns si)
@@ -44,14 +44,14 @@ function _filt!(out::AbstractArray, si::AbstractArray{S,N}, f::SecondOrderSectio
     g = f.g
     biquads = f.biquads
     n = length(biquads)
-    @inbounds for i = 1:size(x, 1)
+    @inbounds for i in axes(x, 1)
         yi = x[i, col]
         for fi = 1:n
             biquad = biquads[fi]
             xi = yi
-            yi = si[1, fi] + biquad.b0*xi
-            si[1, fi] = si[2, fi] + biquad.b1*xi - biquad.a1*yi
-            si[2, fi] = biquad.b2*xi - biquad.a2*yi
+            yi = muladd(biquad.b0, xi, si[1, fi])
+            si[1, fi] = muladd(biquad.a1, -yi, muladd(biquad.b1, xi, si[2, fi]))
+            si[2, fi] = muladd(biquad.b2, xi, -biquad.a2 * yi)
         end
         out[i, col] = yi*g
     end
@@ -80,17 +80,17 @@ filt(f::SecondOrderSections{:z,T,G}, x::AbstractArray{S}, si=_zerosi(f, x)) wher
     filt!(Array{promote_type(T, G, S)}(undef, size(x)), f, x, si)
 
 ## Biquad
-_zerosi(f::Biquad{:z,T}, x::AbstractArray{S}) where {T,S} =
+_zerosi(::Biquad{:z,T}, ::AbstractArray{S}) where {T,S} =
     zeros(promote_type(T, S), 2)
 
 # filt! algorithm (no checking, returns si)
 function _filt!(out::AbstractArray, si1::Number, si2::Number, f::Biquad{:z},
                 x::AbstractArray, col::Int)
-    @inbounds for i = 1:size(x, 1)
+    @inbounds for i in axes(x, 1)
         xi = x[i, col]
-        yi = si1 + f.b0*xi
-        si1 = si2 + f.b1*xi - f.a1*yi
-        si2 = f.b2*xi - f.a2*yi
+        yi = muladd(f.b0, xi, si1)
+        si1 = muladd(f.a1, -yi, muladd(f.b1, xi, si2))
+        si2 = muladd(f.b2, xi, -f.a2 * yi)
         out[i, col] = yi
     end
     (si1, si2)
@@ -105,9 +105,8 @@ function filt!(out::AbstractArray, f::Biquad{:z}, x::AbstractArray,
     (size(si, 1) != 2 || (N > 1 && Base.trailingsize(si, 2) != ncols)) &&
         error("si must have two rows and 1 or nsignals columns")
 
-    initial_si = si
     for col = 1:ncols
-        _filt!(out, initial_si[1, N > 1 ? col : 1], initial_si[2, N > 1 ? col : 1], f, x, col)
+        _filt!(out, si[1, N > 1 ? col : 1], si[2, N > 1 ? col : 1], f, x, col)
     end
     out
 end
@@ -170,13 +169,13 @@ function filt!(out::AbstractVector, f::DF2TFilter{<:PolynomialRatio,<:Vector}, x
     if n == 1
         mul!(out, x, b[1])
     else
-        @inbounds for i=1:length(x)
+        @inbounds for i in eachindex(x, out)
             xi = x[i]
-            val = si[1] + b[1]*xi
+            val = muladd(b[1], xi, si[1])
             for j=2:n-1
-                si[j-1] = si[j] + b[j]*xi - a[j]*val
+                si[j-1] = muladd(a[j], -val, muladd(b[j], xi, si[j]))
             end
-            si[n-1] = b[n]*xi - a[n]*val
+            si[n-1] = muladd(b[n], xi, -a[n] * val)
             out[i] = val
         end
     end
@@ -200,13 +199,13 @@ DF2TFilter(coef::Biquad{:z,T}, state::Vector{S}=zeros(T, 2)) where {T,S} =
 function filt!(out::AbstractVector, f::DF2TFilter{<:Biquad,<:Vector}, x::AbstractVector)
     length(x) != length(out) && throw(ArgumentError("out size must match x"))
     si = f.state
-    (si[1], si[2]) =_filt!(out, si[1], si[2], f.coef, x, 1)
+    (si[1], si[2]) = _filt!(out, si[1], si[2], f.coef, x, 1)
     out
 end
 
 # Variant that allocates the output
-filt(f::DF2TFilter{<:FilterCoefficients{:z},S}, x::AbstractVector) where {S<:Array} =
-    filt!(Vector{eltype(S)}(undef, length(x)), f, x)
+filt(f::DF2TFilter{<:FilterCoefficients{:z},<:Array{T}}, x::AbstractVector) where {T} =
+    filt!(Vector{T}(undef, length(x)), f, x)
 
 # Fall back to SecondOrderSections
 DF2TFilter(coef::FilterCoefficients{:z}) = DF2TFilter(convert(SecondOrderSections, coef))
@@ -346,7 +345,7 @@ filtfilt(f::PolynomialRatio{:z}, x) = filtfilt(coefb(f), coefa(f), x)
 # response to a step function is steady state.
 function filt_stepstate(b::Union{AbstractVector{T}, T}, a::Union{AbstractVector{T}, T}) where T<:Number
     scale_factor = a[1]
-    if scale_factor != 1.0
+    if !isone(scale_factor)
         a = a ./ scale_factor
         b = b ./ scale_factor
     end
@@ -362,8 +361,8 @@ function filt_stepstate(b::Union{AbstractVector{T}, T}, a::Union{AbstractVector{
     as<sz && (a = copyto!(zeros(eltype(a), sz), a))
 
     # construct the companion matrix A and vector B:
-    A = [-a[2:end] [I; zeros(T, 1, sz-2)]]
-    B = b[2:end] - a[2:end] * b[1]
+    A = [-a[2:end] Matrix{T}(I, sz-1, sz-2)]
+    B = @views @. muladd(a[2:end], -b[1], b[2:end])
     # Solve si = A*si + B
     # (I - A)*si = B
     scale_factor \ (I - A) \ B
@@ -375,18 +374,18 @@ function filt_stepstate(f::SecondOrderSections{:z,T}) where T
     y = one(T)
     for i = 1:length(biquads)
         biquad = biquads[i]
+        a1, a2, b0, b1, b2 = biquad.a1, biquad.a2, biquad.b0, biquad.b1, biquad.b2
 
         # At steady state, we have:
         #  y = s1 + b0*x
         # s1 = s2 + b1*x - a1*y
         # s2 = b2*x - a2*y
         # where x is the input and y is the output. Solving these
         # equations yields the following.
-        si[1, i] = (-(biquad.a1 + biquad.a2)*biquad.b0 + biquad.b1 + biquad.b2)/
-                   (1 + biquad.a1 + biquad.a2)*y
-        si[2, i] = (biquad.a1*biquad.b2 - biquad.a2*(biquad.b0 + biquad.b1) + biquad.b2)/
-                   (1 + biquad.a1 + biquad.a2)*y
-        y *= (biquad.b0 + biquad.b1 + biquad.b2)/(1 + biquad.a1 + biquad.a2)
+        den = (1 + a1 + a2)
+        si[1, i] = muladd((a1 + a2), -b0, b1 + b2) / den * y
+        si[2, i] = muladd(a1, b2, muladd(-a2, (b0 + b1), b2)) / den * y
+        y *= (b0 + b1 + b2) / den
     end
     si
 end
@@ -397,8 +396,8 @@ end
 Apply filter or filter coefficients `h` along the first dimension
 of array `x` using a naïve time-domain algorithm
 """
-function tdfilt(h::AbstractVector, x::AbstractArray{T}) where T<:Real
-    filt!(Array{T}(undef, size(x)), h, ones(eltype(h), 1), x)
+function tdfilt(h::AbstractVector{H}, x::AbstractArray{T}) where {H,T<:Real}
+    filt(h, one(H), x)
 end
 
 """
@@ -407,12 +406,12 @@ end
 Like `tdfilt`, but writes the result into array `out`. Output array `out` may
 not be an alias of `x`, i.e. filtering may not be done in place.
 """
-function tdfilt!(out::AbstractArray, h::AbstractVector, x::AbstractArray)
-    filt!(out, h, ones(eltype(h), 1), x)
+function tdfilt!(out::AbstractArray, h::AbstractVector{H}, x::AbstractArray) where H
+    filt!(out, h, one(H), x)
 end
 
-filt(h::AbstractArray, x::AbstractArray) =
-    filt!(Array{eltype(x)}(undef, size(x)), h, x)
+filt(h::AbstractVector{H}, x::AbstractArray{T}) where {H,T} =
+    filt!(Array{promote_type(H, T)}(undef, size(x)), h, x)
 
 #
 # fftfilt and filt
@@ -447,48 +446,44 @@ end
 # Like fftfilt! but does not check if out and x are the same size
 function _fftfilt!(
     out::AbstractArray{<:Real},
-    b::AbstractVector{<:Real},
+    b::AbstractVector{H},
     x::AbstractArray{T},
     nfft::Integer
-) where T<:Real
+) where {T<:Real,H<:Real}
     nb = length(b)
     nx = size(x, 1)
-    normfactor = 1/nfft
+    normfactor = nfft
+    W = promote_type(H, T)
 
     L = min(nx, nfft - (nb - 1))
-    tmp1 = Vector{T}(undef, nfft)
-    tmp2 = Vector{Complex{T}}(undef, nfft >> 1 + 1)
+    tmp1 = Vector{W}(undef, nfft)
+    tmp2 = Vector{Complex{W}}(undef, nfft >> 1 + 1)
 
     p1 = plan_rfft(tmp1)
     p2 = plan_brfft(tmp2, nfft)
 
     # FFT of filter
     filterft = similar(tmp2)
-    tmp1[1:nb] .= b .* normfactor
-    tmp1[nb+1:end] .= zero(T)
+    tmp1[1:nb] .= b ./ normfactor
+    tmp1[nb+1:end] .= zero(W)
     mul!(filterft, p1, tmp1)
 
     # FFT of chunks
-    for colstart = 0:nx:length(x)-1
-        off = 1
-        while off <= nx
-            npadbefore = max(0, nb - off)
-            xstart = off - nb + npadbefore + 1
-            n = min(nfft - npadbefore, nx - xstart + 1)
+    for colstart = 0:nx:length(x)-1, off = 1:L:nx
+        npadbefore = max(0, nb - off)
+        xstart = off - nb + npadbefore + 1
+        n = min(nfft - npadbefore, nx - xstart + 1)
 
-            tmp1[1:npadbefore] .= zero(T)
-            tmp1[npadbefore+n+1:end] .= zero(T)
+        tmp1[1:npadbefore] .= zero(W)
+        tmp1[npadbefore+n+1:end] .= zero(W)
 
-            copyto!(tmp1, npadbefore+1, x, colstart+xstart, n)
-            mul!(tmp2, p1, tmp1)
-            broadcast!(*, tmp2, tmp2, filterft)
-            mul!(tmp1, p2, tmp2)
+        copyto!(tmp1, npadbefore+1, x, colstart+xstart, n)
+        mul!(tmp2, p1, tmp1)
+        broadcast!(*, tmp2, tmp2, filterft)
+        mul!(tmp1, p2, tmp2)
 
-            # Copy to output
-            copyto!(out, colstart+off, tmp1, nb, min(L, nx - off + 1))
-
-            off += L
-        end
+        # Copy to output
+        copyto!(out, colstart+off, tmp1, nb, min(L, nx - off + 1))
     end
 
     out
@@ -524,6 +519,6 @@ function filt_choose_alg!(
     end
 end
 
-function filt_choose_alg!(out::AbstractArray, b::AbstractArray, x::AbstractArray)
+function filt_choose_alg!(out::AbstractArray, b::AbstractVector, x::AbstractArray)
     tdfilt!(out, b, x)
 end