filt perf optimizations (#516)

* create `si` only once, and reuse * no inbounds faster for N < 24 * @generated `_small_filt_fir!` - use Base.Cartesian.@ntuple * inbounds if N > 18 major speedups for N > 18, much slower if inbounds for small N * reorder store in `_filt_fir` and `_filt_iir` >20% faster for fir * maybe prevent undefvarerror no need to assign values to enum * small things - eachindex stuff - `==` to `===` nothing - `npairs = max(nz, np)` * isone, iszero, etc. * `filt` for `FIRFilter`: remove redundant definitions * extract type instability from loop * remove fft conv type instability - in `unsafe_conv_kern_os!` - probably no real impact * correct `filt` arg types * nicer loop * inv plan * cleanup filt_order * use `muladd`s in filt.jl * `_prod_freq` * more `muladd`s * `fill!`, `reverse`, don't broadcast scalar / * micro-optimizations, cosmetics - reduced exceptions in `@code_llvm` for `N < 18` - `@noinline mul!`, `broadcast`, only for julia > v1.8 - name `SMALL_FILT_VECT_CUTOFF`, selective bounds check * Compat noinline * add test for error (cherry picked from commit 9de4c0c)
JuliaDSP · Sep 13, 2024 · 1e72e21 · 1e72e21
1 parent 1737340
commit 1e72e21
Showing 12 changed files with 257 additions and 293 deletions.
diff --git a/src/DSP.jl b/src/DSP.jl
@@ -3,6 +3,7 @@ module DSP
 using FFTW
 using LinearAlgebra: mul!, rmul!
 using IterTools: subsets
+using Compat: Compat
 
 export conv, deconv, filt, filt!, xcorr
 

diff --git a/src/Filters/design.jl b/src/Filters/design.jl
@@ -86,9 +86,7 @@ function Chebyshev2(::Type{T}, n::Integer, ripple::Real) where {T<:Real}
 
     ε = 1/sqrt(10^(convert(T, ripple)/10)-1)
     p = chebyshev_poles(T, n, ε)
-    for i = 1:length(p)
-        p[i] = inv(p[i])
-    end
+    map!(inv, p, p)
 
     z = zeros(Complex{T}, n-isodd(n))
     k = one(T)
@@ -155,7 +153,7 @@ function asne(w::Number, k::Real)
         # Eq. (50)
         k = abs2(k/(1+sqrt(1-abs2(k))))
         # Eq. (56)
-        w = 2*w/((1+k)*(1+sqrt(1-abs2(kold)*w^2)))
+        w = 2w / ((1 + k) * (1 + sqrt(muladd(-abs2(kold), w^2, 1))))
     end
     2*asin(w)/π
 end
@@ -355,9 +353,9 @@ function transform_prototype(ftype::Bandpass, proto::ZeroPoleGain{:s})
     newz = zeros(TR, 2*nz+np-ncommon)
     newp = zeros(TR, 2*np+nz-ncommon)
     for (oldc, newc) in ((p, newp), (z, newz))
-        for i = 1:length(oldc)
+        for i in eachindex(oldc)
             b = oldc[i] * ((ftype.w2 - ftype.w1)/2)
-            pm = sqrt(b^2 - ftype.w2 * ftype.w1)
+            pm = sqrt(muladd(-ftype.w2, ftype.w1, b^2))
             newc[2i-1] = b + pm
             newc[2i] = b - pm
         end
@@ -372,25 +370,25 @@ function transform_prototype(ftype::Bandstop, proto::ZeroPoleGain{:s})
     k = proto.k
     nz = length(z)
     np = length(p)
-    npairs = nz+np-min(nz, np)
+    npairs = max(nz, np)
     TR = Base.promote_eltype(z, p)
     newz = Vector{TR}(undef, 2*npairs)
     newp = Vector{TR}(undef, 2*npairs)
 
     num = one(eltype(z))
     for i = 1:nz
         num *= -z[i]
-        b = (ftype.w2 - ftype.w1)/2/z[i]
-        pm = sqrt(b^2 - ftype.w2 * ftype.w1)
+        b = (ftype.w2 - ftype.w1)/2z[i]
+        pm = sqrt(muladd(-ftype.w2, ftype.w1, b^2))
         newz[2i-1] = b - pm
         newz[2i] = b + pm
     end
 
     den = one(eltype(p))
     for i = 1:np
         den *= -p[i]
-        b = (ftype.w2 - ftype.w1)/2/p[i]
-        pm = sqrt(b^2 - ftype.w2 * ftype.w1)
+        b = (ftype.w2 - ftype.w1)/2p[i]
+        pm = sqrt(muladd(-ftype.w2, ftype.w1, b^2))
         newp[2i-1] = b - pm
         newp[2i] = b + pm
     end

diff --git a/src/Filters/filt.jl b/src/Filters/filt.jl
@@ -6,7 +6,7 @@
 
 
 ## PolynomialRatio
-_zerosi(f::PolynomialRatio{:z,T}, x::AbstractArray{S}) where {T,S} =
+_zerosi(f::PolynomialRatio{:z,T}, ::AbstractArray{S}) where {T,S} =
     zeros(promote_type(T, S), max(-firstindex(f.a), -firstindex(f.b)))
 
 """
@@ -35,7 +35,7 @@ selected based on the data and filter length.
 filt(f::PolynomialRatio{:z}, x, si=_zerosi(f, x)) = filt(coefb(f), coefa(f), x, si)
 
 ## SecondOrderSections
-_zerosi(f::SecondOrderSections{:z,T,G}, x::AbstractArray{S}) where {T,G,S} =
+_zerosi(f::SecondOrderSections{:z,T,G}, ::AbstractArray{S}) where {T,G,S} =
     zeros(promote_type(T, G, S), 2, length(f.biquads))
 
 # filt! algorithm (no checking, returns si)
@@ -44,14 +44,14 @@ function _filt!(out::AbstractArray, si::AbstractArray{S,N}, f::SecondOrderSectio
     g = f.g
     biquads = f.biquads
     n = length(biquads)
-    @inbounds for i = 1:size(x, 1)
+    @inbounds for i in axes(x, 1)
         yi = x[i, col]
         for fi = 1:n
             biquad = biquads[fi]
             xi = yi
-            yi = si[1, fi] + biquad.b0*xi
-            si[1, fi] = si[2, fi] + biquad.b1*xi - biquad.a1*yi
-            si[2, fi] = biquad.b2*xi - biquad.a2*yi
+            yi = muladd(biquad.b0, xi, si[1, fi])
+            si[1, fi] = muladd(biquad.a1, -yi, muladd(biquad.b1, xi, si[2, fi]))
+            si[2, fi] = muladd(biquad.b2, xi, -biquad.a2 * yi)
         end
         out[i, col] = yi*g
     end
@@ -80,17 +80,17 @@ filt(f::SecondOrderSections{:z,T,G}, x::AbstractArray{S}, si=_zerosi(f, x)) wher
     filt!(Array{promote_type(T, G, S)}(undef, size(x)), f, x, si)
 
 ## Biquad
-_zerosi(f::Biquad{:z,T}, x::AbstractArray{S}) where {T,S} =
+_zerosi(::Biquad{:z,T}, ::AbstractArray{S}) where {T,S} =
     zeros(promote_type(T, S), 2)
 
 # filt! algorithm (no checking, returns si)
 function _filt!(out::AbstractArray, si1::Number, si2::Number, f::Biquad{:z},
                 x::AbstractArray, col::Int)
-    @inbounds for i = 1:size(x, 1)
+    @inbounds for i in axes(x, 1)
         xi = x[i, col]
-        yi = si1 + f.b0*xi
-        si1 = si2 + f.b1*xi - f.a1*yi
-        si2 = f.b2*xi - f.a2*yi
+        yi = muladd(f.b0, xi, si1)
+        si1 = muladd(f.a1, -yi, muladd(f.b1, xi, si2))
+        si2 = muladd(f.b2, xi, -f.a2 * yi)
         out[i, col] = yi
     end
     (si1, si2)
@@ -105,9 +105,8 @@ function filt!(out::AbstractArray, f::Biquad{:z}, x::AbstractArray,
     (size(si, 1) != 2 || (N > 1 && Base.trailingsize(si, 2) != ncols)) &&
         error("si must have two rows and 1 or nsignals columns")
 
-    initial_si = si
     for col = 1:ncols
-        _filt!(out, initial_si[1, N > 1 ? col : 1], initial_si[2, N > 1 ? col : 1], f, x, col)
+        _filt!(out, si[1, N > 1 ? col : 1], si[2, N > 1 ? col : 1], f, x, col)
     end
     out
 end
@@ -170,13 +169,13 @@ function filt!(out::AbstractVector, f::DF2TFilter{<:PolynomialRatio,<:Vector}, x
     if n == 1
         mul!(out, x, b[1])
     else
-        @inbounds for i=1:length(x)
+        @inbounds for i in eachindex(x, out)
             xi = x[i]
-            val = si[1] + b[1]*xi
+            val = muladd(b[1], xi, si[1])
             for j=2:n-1
-                si[j-1] = si[j] + b[j]*xi - a[j]*val
+                si[j-1] = muladd(a[j], -val, muladd(b[j], xi, si[j]))
             end
-            si[n-1] = b[n]*xi - a[n]*val
+            si[n-1] = muladd(b[n], xi, -a[n] * val)
             out[i] = val
         end
     end
@@ -200,13 +199,13 @@ DF2TFilter(coef::Biquad{:z,T}, state::Vector{S}=zeros(T, 2)) where {T,S} =
 function filt!(out::AbstractVector, f::DF2TFilter{<:Biquad,<:Vector}, x::AbstractVector)
     length(x) != length(out) && throw(ArgumentError("out size must match x"))
     si = f.state
-    (si[1], si[2]) =_filt!(out, si[1], si[2], f.coef, x, 1)
+    (si[1], si[2]) = _filt!(out, si[1], si[2], f.coef, x, 1)
     out
 end
 
 # Variant that allocates the output
-filt(f::DF2TFilter{<:FilterCoefficients{:z},S}, x::AbstractVector) where {S<:Array} =
-    filt!(Vector{eltype(S)}(undef, length(x)), f, x)
+filt(f::DF2TFilter{<:FilterCoefficients{:z},<:Array{T}}, x::AbstractVector) where {T} =
+    filt!(Vector{T}(undef, length(x)), f, x)
 
 # Fall back to SecondOrderSections
 DF2TFilter(coef::FilterCoefficients{:z}) = DF2TFilter(convert(SecondOrderSections, coef))
@@ -346,7 +345,7 @@ filtfilt(f::PolynomialRatio{:z}, x) = filtfilt(coefb(f), coefa(f), x)
 # response to a step function is steady state.
 function filt_stepstate(b::Union{AbstractVector{T}, T}, a::Union{AbstractVector{T}, T}) where T<:Number
     scale_factor = a[1]
-    if scale_factor != 1.0
+    if !isone(scale_factor)
         a = a ./ scale_factor
         b = b ./ scale_factor
     end
@@ -362,8 +361,8 @@ function filt_stepstate(b::Union{AbstractVector{T}, T}, a::Union{AbstractVector{
     as<sz && (a = copyto!(zeros(eltype(a), sz), a))
 
     # construct the companion matrix A and vector B:
-    A = [-a[2:end] [I; zeros(T, 1, sz-2)]]
-    B = b[2:end] - a[2:end] * b[1]
+    A = [-a[2:end] Matrix{T}(I, sz-1, sz-2)]
+    B = @views @. muladd(a[2:end], -b[1], b[2:end])
     # Solve si = A*si + B
     # (I - A)*si = B
     scale_factor \ (I - A) \ B
@@ -375,18 +374,18 @@ function filt_stepstate(f::SecondOrderSections{:z,T}) where T
     y = one(T)
     for i = 1:length(biquads)
         biquad = biquads[i]
+        a1, a2, b0, b1, b2 = biquad.a1, biquad.a2, biquad.b0, biquad.b1, biquad.b2
 
         # At steady state, we have:
         #  y = s1 + b0*x
         # s1 = s2 + b1*x - a1*y
         # s2 = b2*x - a2*y
         # where x is the input and y is the output. Solving these
         # equations yields the following.
-        si[1, i] = (-(biquad.a1 + biquad.a2)*biquad.b0 + biquad.b1 + biquad.b2)/
-                   (1 + biquad.a1 + biquad.a2)*y
-        si[2, i] = (biquad.a1*biquad.b2 - biquad.a2*(biquad.b0 + biquad.b1) + biquad.b2)/
-                   (1 + biquad.a1 + biquad.a2)*y
-        y *= (biquad.b0 + biquad.b1 + biquad.b2)/(1 + biquad.a1 + biquad.a2)
+        den = (1 + a1 + a2)
+        si[1, i] = muladd((a1 + a2), -b0, b1 + b2) / den * y
+        si[2, i] = muladd(a1, b2, muladd(-a2, (b0 + b1), b2)) / den * y
+        y *= (b0 + b1 + b2) / den
     end
     si
 end
@@ -397,8 +396,8 @@ end
 Apply filter or filter coefficients `h` along the first dimension
 of array `x` using a naïve time-domain algorithm
 """
-function tdfilt(h::AbstractVector, x::AbstractArray{T}) where T<:Real
-    filt!(Array{T}(undef, size(x)), h, ones(eltype(h), 1), x)
+function tdfilt(h::AbstractVector{H}, x::AbstractArray{T}) where {H,T<:Real}
+    filt(h, one(H), x)
 end
 
 """
@@ -407,12 +406,12 @@ end
 Like `tdfilt`, but writes the result into array `out`. Output array `out` may
 not be an alias of `x`, i.e. filtering may not be done in place.
 """
-function tdfilt!(out::AbstractArray, h::AbstractVector, x::AbstractArray)
-    filt!(out, h, ones(eltype(h), 1), x)
+function tdfilt!(out::AbstractArray, h::AbstractVector{H}, x::AbstractArray) where H
+    filt!(out, h, one(H), x)
 end
 
-filt(h::AbstractArray, x::AbstractArray) =
-    filt!(Array{eltype(x)}(undef, size(x)), h, x)
+filt(h::AbstractVector{H}, x::AbstractArray{T}) where {H,T} =
+    filt!(Array{promote_type(H, T)}(undef, size(x)), h, x)
 
 #
 # fftfilt and filt
@@ -447,48 +446,44 @@ end
 # Like fftfilt! but does not check if out and x are the same size
 function _fftfilt!(
     out::AbstractArray{<:Real},
-    b::AbstractVector{<:Real},
+    b::AbstractVector{H},
     x::AbstractArray{T},
     nfft::Integer
-) where T<:Real
+) where {T<:Real,H<:Real}
     nb = length(b)
     nx = size(x, 1)
-    normfactor = 1/nfft
+    normfactor = nfft
+    W = promote_type(H, T)
 
     L = min(nx, nfft - (nb - 1))
-    tmp1 = Vector{T}(undef, nfft)
-    tmp2 = Vector{Complex{T}}(undef, nfft >> 1 + 1)
+    tmp1 = Vector{W}(undef, nfft)
+    tmp2 = Vector{Complex{W}}(undef, nfft >> 1 + 1)
 
     p1 = plan_rfft(tmp1)
     p2 = plan_brfft(tmp2, nfft)
 
     # FFT of filter
     filterft = similar(tmp2)
-    tmp1[1:nb] .= b .* normfactor
-    tmp1[nb+1:end] .= zero(T)
+    tmp1[1:nb] .= b ./ normfactor
+    tmp1[nb+1:end] .= zero(W)
     mul!(filterft, p1, tmp1)
 
     # FFT of chunks
-    for colstart = 0:nx:length(x)-1
-        off = 1
-        while off <= nx
-            npadbefore = max(0, nb - off)
-            xstart = off - nb + npadbefore + 1
-            n = min(nfft - npadbefore, nx - xstart + 1)
+    for colstart = 0:nx:length(x)-1, off = 1:L:nx
+        npadbefore = max(0, nb - off)
+        xstart = off - nb + npadbefore + 1
+        n = min(nfft - npadbefore, nx - xstart + 1)
 
-            tmp1[1:npadbefore] .= zero(T)
-            tmp1[npadbefore+n+1:end] .= zero(T)
+        tmp1[1:npadbefore] .= zero(W)
+        tmp1[npadbefore+n+1:end] .= zero(W)
 
-            copyto!(tmp1, npadbefore+1, x, colstart+xstart, n)
-            mul!(tmp2, p1, tmp1)
-            broadcast!(*, tmp2, tmp2, filterft)
-            mul!(tmp1, p2, tmp2)
+        copyto!(tmp1, npadbefore+1, x, colstart+xstart, n)
+        mul!(tmp2, p1, tmp1)
+        broadcast!(*, tmp2, tmp2, filterft)
+        mul!(tmp1, p2, tmp2)
 
-            # Copy to output
-            copyto!(out, colstart+off, tmp1, nb, min(L, nx - off + 1))
-
-            off += L
-        end
+        # Copy to output
+        copyto!(out, colstart+off, tmp1, nb, min(L, nx - off + 1))
     end
 
     out
@@ -524,6 +519,6 @@ function filt_choose_alg!(
     end
 end
 
-function filt_choose_alg!(out::AbstractArray, b::AbstractArray, x::AbstractArray)
+function filt_choose_alg!(out::AbstractArray, b::AbstractVector, x::AbstractArray)
     tdfilt!(out, b, x)
 end
diff --git a/src/Filters/filt_order.jl b/src/Filters/filt_order.jl
@@ -56,32 +56,34 @@ Source Software, 2018. https://github.com/JuliaNLSolvers/Optim.jl/blob/master/sr
 
 ====================================================================#
 
+sort_W((a, b)::Tuple{Real,Real}) = (a > b) ? (b, a) : (a, b)
 
-
-toprototype(Wp::Real, Ws::Real, ftype::Type{Lowpass}) = Ws / Wp
-toprototype(Wp::Real, Ws::Real, ftype::Type{Highpass}) = Wp / Ws
-function toprototype(Wp::Tuple{Real,Real}, Ws::Tuple{Real,Real}, ftype::Type{Bandpass})
+toprototype(Wp::Real, Ws::Real, ::Type{Lowpass}) = Ws / Wp
+toprototype(Wp::Real, Ws::Real, ::Type{Highpass}) = Wp / Ws
+function toprototype(Wp::Tuple{Real,Real}, Ws::Tuple{Real,Real}, ::Type{Bandpass})
     # bandpass filter must have two corner frequencies we're computing with
-    Wa = (Ws .^ 2 .- Wp[1] * Wp[2]) ./ (Ws .* (Wp[1] - Wp[2]))
-    minimum(abs.(Wa))
+    Wa = muladd.(-Wp[1], Wp[2], Ws .^ 2) ./ (Ws .* (Wp[1] - Wp[2]))
+    min(abs.(Wa)...)
 end
-toprototype(Wp::Tuple{Real,Real}, Ws::Tuple{Real,Real}, Rp::Real, Rs::Real, ftype::Type{Bandstop}) = butterworth_bsfmin(Wp, Ws, Rp, Rs)
-fromprototype(Wp::Real, Wscale::Real, ftype::Type{Lowpass}) = Wp * Wscale
-fromprototype(Wp::Real, Wscale::Real, ftype::Type{Highpass}) = Wp / Wscale
+toprototype(Wp::Tuple{Real,Real}, Ws::Tuple{Real,Real}, Rp::Real, Rs::Real, ::Type{Bandstop}) = butterworth_bsfmin(Wp, Ws, Rp, Rs)
+fromprototype(Wp::Real, Wscale::Real, ::Type{Lowpass}) = Wp * Wscale
+fromprototype(Wp::Real, Wscale::Real, ::Type{Highpass}) = Wp / Wscale
 
-function fromprototype(Wp::Tuple{Real,Real}, Wscale::Real, ftype::Type{Bandstop})
-    Wa = zeros(2)
+function fromprototype(Wp::Tuple{Real,Real}, Wscale::Real, ::Type{Bandstop})
     diff = Wp[2] - Wp[1]
     prod = Wp[2] * Wp[1]
-    Wa[1] = (diff + sqrt(diff^2 + 4 * (Wscale^2) * prod)) / (2 * Wscale)
-    Wa[2] = (diff - sqrt(diff^2 + 4 * (Wscale^2) * prod)) / (2 * Wscale)
-    sort!(abs.(Wa))
+    k = sqrt(muladd(4 * (Wscale^2), prod, diff^2))
+    den = 2 * Wscale
+    Wa = (diff + k) / den, (diff - k) / den
+    sort_W(abs.(Wa))
 end
 
-function fromprototype(Wp::Tuple{Real,Real}, Wscale::Real, ftype::Type{Bandpass})
-    Wsc = [-Wscale, Wscale]
-    Wa = -Wsc .* (Wp[2] - Wp[1]) ./ 2 .+ sqrt.(Wsc .^ 2 / 4 * (Wp[2] - Wp[1]) .^ 2 .+ Wp[1] * Wp[2])
-    sort!(abs.(Wa))
+function fromprototype(Wp::Tuple{Real,Real}, Wscale::Real, ::Type{Bandpass})
+    Wsc = (-Wscale, Wscale)
+    diff = Wp[2] - Wp[1]
+    prod = Wp[2] * Wp[1]
+    Wa = muladd.(.-Wsc, diff / 2, sqrt(muladd(Wscale^2 / 4, diff^2, prod)))
+    sort_W(abs.(Wa))
 end
 
 butterworth_order_estimate(Rp::Real, Rs::Real, warp::Real) = (log(db2pow(Rs) - 1) - log(db2pow(Rp) - 1)) / (2 * log(warp))
@@ -94,8 +96,8 @@ function elliptic_order_estimate(Rp::Real, Rs::Real, Wa::Real)
     k = Wa^-1
     (k^2 < 1) || throw(DomainError(k^2, "Selectivity parameter specifies too narrow of a transition width.")) # check if in-bounds (k₁ << k <~ 1)
     (1 - k₁^2 < 1) || throw(DomainError(1 - k₁^2, "Discrimination parameter specifies too deep of a stopband."))
-    K = tuple(ellipk(k^2), ellipk(1 - k^2)) # define the complementary moduli
-    K₁ = tuple(ellipk(k₁^2), ellipk(1 - k₁^2))
+    K = (ellipk(k^2), ellipk(1 - k^2)) # define the complementary moduli
+    K₁ = (ellipk(k₁^2), ellipk(1 - k₁^2))
 
     # other order approach would be using (k₁'/k₁) / (k′/k).
     (K[1] * K₁[2]) / (K[2] * K₁[1])
@@ -116,7 +118,7 @@ function brent(f, x1::T, x2::T) where {T<:AbstractFloat}
     k1 = zero(T)
 
     # first interval calculation.
-    m = a + g * (b - a)
+    m = muladd(g, (b - a), a)
     m1, m2 = m, m
     fm = f(m)
     fm1, fm2 = fm, fm
@@ -133,7 +135,7 @@ function brent(f, x1::T, x2::T) where {T<:AbstractFloat}
         if (abs(k1) > xt) # trial parabolic fit
             r = (m - m1) * (fm - fm2)
             q = (m - m2) * (fm - fm1)
-            p = (m - m2) * q - (m - m1) * r
+            p = muladd((m - m2), q, -(m - m1) * r)
             q = 2 * (q - r)
             if (q > 0)
                 p = -p
@@ -196,29 +198,29 @@ end
 #
 for filt in (:butterworth, :elliptic, :chebyshev)
     @eval begin
-        function $(Symbol(string(filt, "_bsfcost")))(Wx::Real, uselowband::Bool, Wp::Tuple{Real,Real}, Ws::Tuple{Real,Real}, Rp::Real, Rs::Real)
+        function $(Symbol(filt, :_bsfcost))(Wx::Real, uselowband::Bool, Wp::Tuple{Real,Real}, Ws::Tuple{Real,Real}, Rp::Real, Rs::Real)
             # override one of the passband edges with the test frequency.
-            Wpc = uselowband ? tuple(Wx, Wp[2]) : tuple(Wp[1], Wx)
+            Wpc = uselowband ? (Wx, Wp[2]) : (Wp[1], Wx)
 
             # get the new warp frequency.
-            warp = minimum(abs.((Ws .* (Wpc[1] - Wpc[2])) ./ (Ws .^ 2 .- (Wpc[1] * Wpc[2]))))
+            warp = min(abs.((Ws .* (Wpc[1] - Wpc[2])) ./ muladd.(-Wpc[1], Wpc[2], Ws .^ 2))...)
 
             # use the new frequency to determine the filter order.
-            $(Symbol(string(filt, "_order_estimate")))(Rp, Rs, warp)
+            $(Symbol(filt, :_order_estimate))(Rp, Rs, warp)
         end
 
-        function $(Symbol(string(filt, "_bsfmin")))(Wp::Tuple{Real,Real}, Ws::Tuple{Real,Real}, Rp::Real, Rs::Real)
+        function $(Symbol(filt, :_bsfmin))(Wp::Tuple{Real,Real}, Ws::Tuple{Real,Real}, Rp::Real, Rs::Real)
             # NOTE: the optimization function will adjust the corner frequencies in Wp, returning a new passband tuple.
             Δ = eps(typeof(Wp[1]))^(2 / 3)
-            C₁(w) = $(Symbol(string(filt, "_bsfcost")))(w, true, Wp, Ws, Rp, Rs)
+            C₁(w) = $(Symbol(filt, :_bsfcost))(w, true, Wp, Ws, Rp, Rs)
             p1 = brent(C₁, Wp[1], Ws[1] - Δ)
 
-            C₂(w) = $(Symbol(string(filt, "_bsfcost")))(w, false, tuple(p1, Wp[2]), Ws, Rp, Rs)
+            C₂(w) = $(Symbol(filt, :_bsfcost))(w, false, (p1, Wp[2]), Ws, Rp, Rs)
             p2 = brent(C₂, Ws[2] + Δ, Wp[2])
-            Wadj = tuple(p1, p2)
+            Wadj = (p1, p2)
 
-            Wa = (Ws .* (Wadj[1] - Wadj[2])) ./ (Ws .^ 2 .- (Wadj[1] * Wadj[2]))
-            minimum(abs.(Wa)), Wadj
+            Wa = (Ws .* (p1 - p2)) ./ muladd.(-p1, p2, Ws .^ 2)
+            min(abs.(Wa)...), Wadj
         end
     end
 end
@@ -237,8 +239,8 @@ domain is `:z`, interpretting the input frequencies as normalized from 0 to 1, w
 function buttord(Wp::Tuple{Real,Real}, Ws::Tuple{Real,Real}, Rp::Real, Rs::Real; domain::Symbol=:z)
 
     # make sure the band edges are in increasing order.
-    Wps = (Wp[1] > Wp[2]) ? tuple(Wp[2], Wp[1]) : tuple(Wp[1], Wp[2])
-    Wss = (Ws[1] > Ws[2]) ? tuple(Ws[2], Ws[1]) : tuple(Ws[1], Ws[2])
+    Wps = sort_W(Wp)
+    Wss = sort_W(Ws)
 
     # infer filter type based on ordering of edges.
     (Wps[1] < Wss[1]) != (Wps[2] > Wss[2]) && throw(ArgumentError("Pass and stopband edges must be ordered for Bandpass/Bandstop filters."))
@@ -350,7 +352,7 @@ for (fcn, est, filt) in ((:ellipord, :elliptic, "Elliptic (Cauer)"),
             end
             # Lowpass/Highpass prototype xform is same as Butterworth.
             wa = toprototype(Ωp, Ωs, ftype)
-            N = ceil(Int, $(Symbol(string(est, "_order_estimate")))(Rp, Rs, wa))
+            N = ceil(Int, $(Symbol(est, :_order_estimate))(Rp, Rs, wa))
             if (domain == :z)
                 ωn = (2 / π) * atan(Ωp)
             else
@@ -372,19 +374,19 @@ for (fcn, est, filt) in ((:ellipord, :elliptic, "Elliptic (Cauer)"),
         frequencies as normalized from 0 to 1, where 1 corresponds to π radians/sample.
         """
         function $fcn(Wp::Tuple{Real,Real}, Ws::Tuple{Real,Real}, Rp::Real, Rs::Real; domain::Symbol=:z)
-            Wps = (Wp[1] > Wp[2]) ? tuple(Wp[2], Wp[1]) : tuple(Wp[1], Wp[2])
-            Wss = (Ws[1] > Ws[2]) ? tuple(Ws[2], Ws[1]) : tuple(Ws[1], Ws[2])
+            Wps = sort_W(Wp)
+            Wss = sort_W(Ws)
             (Wps[1] < Wss[1]) != (Wps[2] > Wss[2]) && throw(ArgumentError("Pass and stopband edges must be ordered for Bandpass/Bandstop filters."))
             ftype = (Wps[1] < Wss[1]) ? Bandstop : Bandpass
             # pre-warp to analog if z-domain.
             (Ωp, Ωs) = (domain == :z) ? (tan.(π / 2 .* Wps), tan.(π / 2 .* Wss)) : (Wps, Wss)
             if (ftype == Bandpass)
-                Wa = (Ωs .^ 2 .- (Ωp[1] * Ωp[2])) ./ (Ωs .* (Ωp[1] - Ωp[2]))
+                Wa = muladd.(-Ωp[1], Ωp[2], Ωs .^ 2) ./ (Ωs .* (Ωp[1] - Ωp[2]))
                 Ωpadj = Ωp
             else
-                (Wa, Ωpadj) = $(Symbol(string(est, "_bsfmin")))(Ωp, Ωs, Rp, Rs) # check scipy.
+                (Wa, Ωpadj) = $(Symbol(est, :_bsfmin))(Ωp, Ωs, Rp, Rs) # check scipy.
             end
-            N = ceil(Int, $(Symbol(string(est, "_order_estimate")))(Rp, Rs, minimum(abs.(Wa))))
+            N = ceil(Int, $(Symbol(est, :_order_estimate))(Rp, Rs, min(abs.(Wa)...)))
             ωn = (domain == :z) ? Wps : Ωpadj
             N, ωn
         end
@@ -431,31 +433,33 @@ order and natural frequencies for an analog filter. The default domain is `:z`,
 frequencies as normalized from 0 to 1, where 1 corresponds to π radians/sample.
 """
 function cheb2ord(Wp::Tuple{Real,Real}, Ws::Tuple{Real,Real}, Rp::Real, Rs::Real; domain::Symbol=:z)
-    Wps = (Wp[1] > Wp[2]) ? tuple(Wp[2], Wp[1]) : tuple(Wp[1], Wp[2])
-    Wss = (Ws[1] > Ws[2]) ? tuple(Ws[2], Ws[1]) : tuple(Ws[1], Ws[2])
+    Wps = sort_W(Wp)
+    Wss = sort_W(Ws)
     (Wps[1] < Wss[1]) != (Wps[2] > Wss[2]) && throw(ArgumentError("Pass and stopband edges must be ordered for Bandpass/Bandstop filters."))
     ftype = (Wps[1] < Wss[1]) ? Bandstop : Bandpass
     (Ωp, Ωs) = (domain == :z) ? (tan.(π / 2 .* Wps), tan.(π / 2 .* Wss)) : (Wps, Wss)
     if (ftype == Bandpass)
-        Wa = (Ωs .^ 2 .- (Ωp[1] * Ωp[2])) ./ (Ωs .* (Ωp[1] - Ωp[2]))
+        prod = Ωp[1] * Ωp[2]
+        diff = Ωp[1] - Ωp[2]
+        Wa = muladd.(Ωs, Ωs, -prod) ./ (Ωs .* diff)
     else
         (Wa, Ωpadj) = chebyshev_bsfmin(Ωp, Ωs, Rp, Rs)
+        prod = Ωpadj[1] * Ωpadj[2]
+        diff = Ωpadj[1] - Ωpadj[2]
     end
-    N = ceil(Int, chebyshev_order_estimate(Rp, Rs, minimum(abs.(Wa))))
+    N = ceil(Int, chebyshev_order_estimate(Rp, Rs, min(abs.(Wa)...)))
 
     # new frequency for stopband spec.
     wnew = 1 / cosh(1 / N * acosh(√(db2pow(Rs) - 1) / √(db2pow(Rp) - 1)))
 
     # lowpass prototype to analog filter re-map.
-    Wna = zeros(2)
     if (ftype == Bandpass)
-        Wna[1] = (Ωp[1] - Ωp[2]) / (2 * wnew) + √((Ωp[2] - Ωp[1])^2 / (4 * wnew^2) + Ωp[1] * Ωp[2])
-        Wna[2] = (Ωp[1] * Ωp[2]) / Wna[1]
+        Wna1 = diff / (2 * wnew) + √(diff^2 / (4 * wnew^2) + prod)
     else
-        Wna[1] = ((Ωpadj[1] - Ωpadj[2]) * wnew) / 2 + √((Ωpadj[1] - Ωpadj[2])^2 * wnew^2 / 4 + (Ωpadj[1] * Ωpadj[2]))
-        Wna[2] = (Ωpadj[1] * Ωpadj[2]) / Wna[1]
+        Wna1 = (diff * wnew) / 2 + √(muladd(diff^2, wnew^2 / 4, prod))
     end
-    ωn = (domain == :z) ? tuple((2 / π) * atan(Wna[1]), (2 / π) * atan(Wna[2])) : tuple(Wna[1], Wna[2])
+    Wna2 = prod / Wna1
+    ωn = (domain == :z) ? ((2 / π) * atan(Wna1), (2 / π) * atan(Wna2)) : (Wna1, Wna2)
     N, ωn
 end
 
@@ -480,9 +484,9 @@ function remezord(Wp::Real, Ws::Real, Rp::Real, Rs::Real)
     (0 > Wp > 0.5) || (0 > Ws > 0.5) && throw(ArgumentError("Pass and stopband edges must be greater than DC and less than Nyquist."))
     L1, L2 = log10(Rp), log10(Rs)
     df = abs(Ws - Wp) # works in HPF case if passband/stopband edges are flipped
-    A = (5.309e-3 * L1^2) + (7.114e-2 * L1) - 0.4761
-    B = (2.66e-3 * L1^2) + (0.5941 * L1) + 0.4278
-    Kf = 0.51244 * (L1 - L2) + 11.01217
-    D = A * L2 - B
-    return ceil(Int, ((D - Kf * df^2) / df))
+    A = muladd(5.309e-3, L1^2, muladd(7.114e-2, L1, -0.4761))
+    B = muladd(2.66e-3, L1^2, muladd(0.5941, L1, 0.4278))
+    Kf = muladd(0.51244, (L1 - L2), 11.01217)
+    D = muladd(A, L2, -B)
+    return ceil(Int, (muladd(-Kf, df^2, D) / df))
 end
diff --git a/src/Filters/remez_fir.jl b/src/Filters/remez_fir.jl
@@ -88,7 +88,7 @@ C CODE BANNER
 # RemezFilterType:
 #    Type I and II symmetric linear phase: neg==0   (filter_type==bandpass)
 #    Type III and IV negative symmetric linear phase: neg==1   (filter_type==hilbert or differentiator)
-@enum RemezFilterType filter_type_bandpass=1 filter_type_differentiator=2 filter_type_hilbert=3
+@enum RemezFilterType filter_type_bandpass filter_type_differentiator filter_type_hilbert
 
 
 
@@ -212,10 +212,10 @@ function freq_eval(xf, x::AbstractVector, y::AbstractVector, ad::AbstractVector)
     d = 0.0
     p = 0.0
 
-    for j = 1:length(ad)
+    for j in eachindex(ad)
         c = ad[j] / (xf - x[j])
         d += c
-        p += c * y[j]
+        p = muladd(c, y[j], p)
     end
 
     p/d
@@ -439,8 +439,8 @@ function remez(numtaps::Integer, band_defs;
 
         #
         # Start next iteration
-        #        
-      @label L100
+        #
+    #   @label L100
         iext[nzz] = ngrid + 1
         niter += 1
         if niter > maxiter
@@ -449,7 +449,9 @@ function remez(numtaps::Integer, band_defs;
             break
         end
 
-        x[1:nz] = grid[iext[1:nz]]
+        for j = 1:nz
+            x[j] = grid[iext[j]]
+        end
 
         for j = 1 : nz
             ad[j] = lagrange_interp(j, nz, jet, x)
@@ -460,8 +462,8 @@ function remez(numtaps::Integer, band_defs;
         k = 1
         for j = 1 : nz
             l = iext[j]
-            dnum += ad[j] * des[l]
-            dden += k * ad[j] / wt[l]
+            dnum = muladd(ad[j], des[l], dnum)
+            dden = muladd(k, ad[j] / wt[l], dden)
             k = -k
         end
         dev = dnum / dden
@@ -495,6 +497,8 @@ function remez(numtaps::Integer, band_defs;
         nut = -nu
         j = 1
 
+        local comp
+
       @label L200
         j == nzz && (ynz = comp)   # equivalent to "if (j == nzz) ynz = comp; end"
         j >= nzz && @goto L300
@@ -604,13 +608,13 @@ function remez(numtaps::Integer, band_defs;
             iext[nzz-j] = iext[nz-j]
         end
         iext[1] = k1
-        @goto L100
+        continue    # @goto L100
       @label L350
         for j = 1:nz
             iext[j] = iext[j+1]
         end
 
-        @goto L100
+        continue    # @goto L100
       @label L370
 
 
@@ -669,9 +673,9 @@ function remez(numtaps::Integer, band_defs;
     for j = 1 : nfcns
         dtemp = 0.0
         for k = 1 : nm1
-            dtemp += a[k+1] * cospi(2 * (j-1) * delf * k)
+            dtemp = muladd(a[k+1], cospi(2 * (j-1) * delf * k), dtemp)
         end
-        alpha[j] = 2.0 * dtemp + a[1]
+        alpha[j] = 2dtemp + a[1]
     end
 
     for j = 2 : nfcns
@@ -680,7 +684,7 @@ function remez(numtaps::Integer, band_defs;
     alpha[1] *= delf
 
     if !full_grid
-        p[1] = 2.0*alpha[nfcns]*bb+alpha[nm1]
+        p[1] = muladd(2alpha[nfcns], bb, alpha[nm1])
         p[2] = 2.0*aa*alpha[nfcns]
         q[1] = alpha[nfcns-2]-alpha[nfcns]
         for j = 2 : nm1
@@ -693,12 +697,12 @@ function remez(numtaps::Integer, band_defs;
                 a[k] = p[k]
                 p[k] = 2.0 * bb * a[k]
             end
-            p[2] += a[1] * 2.0 *aa
+            p[2] = muladd(a[1], 2aa, p[2])
             for k = 1 : j-1
-                p[k] += q[k] + aa * a[k+1]
+                p[k] += muladd(aa, a[k+1], q[k])
             end
             for k = 3 : j+1
-                p[k] += aa * a[k-1]
+                p[k] = muladd(aa, a[k-1], p[k])
             end
 
             if j != nm1
@@ -732,7 +736,7 @@ function remez(numtaps::Integer, band_defs;
             for j = 2 : nm1
                 h[j] = 0.25 * (alpha[nz-j] + alpha[nfcns+2-j])
             end
-            h[nfcns] = 0.5*alpha[1] + 0.25*alpha[2]
+            h[nfcns] = muladd(0.5, alpha[1], 0.25 * alpha[2])
         end
     else
         if nodd
@@ -741,14 +745,14 @@ function remez(numtaps::Integer, band_defs;
             for j = 1 : nm1
                 h[j] = 0.25 * (alpha[nz-j] - alpha[nfcns+3-j])
             end
-            h[nfcns] = 0.5 * alpha[1] - 0.25 * alpha[3]
+            h[nfcns] = muladd(0.5, alpha[1], -0.25 * alpha[3])
             h[nz] = 0.0
         else
             h[1] = 0.25 * alpha[nfcns]
             for j = 2 : nm1
                 h[j] = 0.25 * (alpha[nz-j] - alpha[nfcns+2-j])
             end
-            h[nfcns] = 0.5 * alpha[1] - 0.25 * alpha[2]
+            h[nfcns] = muladd(0.5, alpha[1], -0.25 * alpha[2])
         end
     end
 

diff --git a/src/Filters/response.jl b/src/Filters/response.jl
@@ -39,13 +39,17 @@ _freq(filter::FilterCoefficients) = x::Number -> _freq(filter, x)
 
 _freq(filter::FilterCoefficients, x::Number) =
     _freq(convert(PolynomialRatio, filter), x)
-_freq(filter::PolynomialRatio, x::Number) = filter.b(x) ./ filter.a(x)
-_freq(filter::ZeroPoleGain, x::Number) =
-    filter.k * prod([x - z for z in filter.z]) / prod([x - p for p in filter.p])
+_freq(filter::PolynomialRatio, x::Number) = filter.b(x) / filter.a(x)
+
+_prod_freq(f, v::Vector{<:Union{Z,Biquad{D,Z}}}, ::Type{T}) where {D,T<:Number,Z<:Number} =
+    mapreduce(f, Base.mul_prod, v; init=one(promote_type(T, Z)))
+
+_freq(filter::ZeroPoleGain, x::T) where {T<:Number} =
+    filter.k * _prod_freq(z -> x - z, filter.z, T) / _prod_freq(p -> x - p, filter.p, T)
 _freq(filter::Biquad, x::Number) =
-    ((filter.b0*x + filter.b1)*x + filter.b2) / ((x + filter.a1)*x  + filter.a2)
-_freq(filter::SecondOrderSections, x::Number) =
-    filter.g * prod([_freq(b, x) for b in filter.biquads])
+    muladd(muladd(filter.b0, x, filter.b1), x, filter.b2) / muladd((x + filter.a1), x, filter.a2)
+_freq(filter::SecondOrderSections, x::T) where {T<:Number} =
+    filter.g * _prod_freq(b -> _freq(b, x), filter.biquads, T)
 
 
 """

diff --git a/src/Filters/stream_filt.jl b/src/Filters/stream_filt.jl
@@ -10,7 +10,7 @@ mutable struct FIRStandard{T} <: FIRKernel{T}
 end
 
 function FIRStandard(h::Vector)
-    h    = reverse(h, dims=1)
+    h    = reverse(h)
     hLen = length(h)
     FIRStandard(h, hLen)
 end
@@ -48,7 +48,7 @@ mutable struct FIRDecimator{T} <: FIRKernel{T}
 end
 
 function FIRDecimator(h::Vector, decimation::Integer)
-    h            = reverse(h, dims=1)
+    h            = reverse(h)
     hLen         = length(h)
     inputDeficit = 1
     FIRDecimator(h, hLen, decimation, inputDeficit)
@@ -268,7 +268,7 @@ end
 
 # For FIRFilter, set history vector to zeros of same type and required length
 function reset!(self::FIRFilter)
-    self.history = zeros(eltype(self.history), self.historyLen)
+    fill!(self.history, 0)
     reset!(self.kernel)
     self
 end
@@ -320,7 +320,7 @@ function outputlength(inputlength::Integer, ratio::Union{Integer,Rational}, init
     ceil(Int, outLen)
 end
 
-function outputlength(kernel::FIRStandard, inputlength::Integer)
+function outputlength(::FIRStandard, inputlength::Integer)
     inputlength
 end
 
@@ -357,7 +357,7 @@ function inputlength(outputlength::Int, ratio::Union{Integer,Rational}, initial
     floor(Int, inLen)
 end
 
-function inputlength(kernel::FIRStandard, outputlength::Integer)
+function inputlength(::FIRStandard, outputlength::Integer)
     outputlength
 end
 
@@ -431,16 +431,6 @@ function filt!(buffer::AbstractVector{Tb}, self::FIRFilter{FIRStandard{Th}}, x::
     return xLen
 end
 
-function filt(self::FIRFilter{FIRStandard{Th}}, x::AbstractVector{Tx}) where {Th,Tx}
-    bufLen         = outputlength(self, length(x))
-    buffer         = Vector{promote_type(Th,Tx)}(undef, bufLen)
-    samplesWritten = filt!(buffer, self, x)
-
-    samplesWritten == bufLen || resize!(buffer, samplesWritten)
-
-    return buffer
-end
-
 
 #
 # Interpolation
@@ -482,16 +472,6 @@ function filt!(buffer::AbstractVector{Tb}, self::FIRFilter{FIRInterpolator{Th}},
     return bufIdx
 end
 
-function filt(self::FIRFilter{FIRInterpolator{Th}}, x::AbstractVector{Tx}) where {Th,Tx}
-    bufLen         = outputlength(self, length(x))
-    buffer         = Vector{promote_type(Th,Tx)}(undef, bufLen)
-    samplesWritten = filt!(buffer, self, x)
-
-    samplesWritten == bufLen || resize!(buffer, samplesWritten)
-
-    return buffer
-end
-
 
 #
 # Rational resampling
@@ -538,15 +518,6 @@ function filt!(buffer::AbstractVector{Tb}, self::FIRFilter{FIRRational{Th}}, x::
     return bufIdx
 end
 
-function filt(self::FIRFilter{FIRRational{Th}}, x::AbstractVector{Tx}) where {Th,Tx}
-    bufLen         = outputlength(self, length(x))
-    buffer         = Vector{promote_type(Th,Tx)}(undef, bufLen)
-    samplesWritten = filt!(buffer, self, x)
-
-    samplesWritten == bufLen || resize!(buffer, samplesWritten)
-    return buffer
-end
-
 
 #
 # Decimation
@@ -590,24 +561,14 @@ function filt!(buffer::AbstractVector{Tb}, self::FIRFilter{FIRDecimator{Th}}, x:
     return bufIdx
 end
 
-function filt(self::FIRFilter{FIRDecimator{Th}}, x::AbstractVector{Tx}) where {Th,Tx}
-    bufLen         = outputlength(self, length(x))
-    buffer         = Vector{promote_type(Th,Tx)}(undef, bufLen)
-    samplesWritten = filt!(buffer, self, x)
-
-    samplesWritten == bufLen || resize!(buffer, samplesWritten)
-
-    return buffer
-end
-
 
 #
 # Arbitrary resampling
 #
 # Updates FIRArbitrary state. See Section 7.5.1 in [1].
 #   [1] uses a phase accumilator that increments by Δ (Nϕ/rate)
 
-function update(kernel::FIRArbitrary)
+function update!(kernel::FIRArbitrary)
     kernel.ϕAccumulator += kernel.Δ
 
     if kernel.ϕAccumulator > kernel.Nϕ
@@ -657,8 +618,8 @@ function filt!(
 
         # Used to have @inbounds. Restore @inbounds if buffer length
         # can be verified prior to access.
-        buffer[bufIdx] = yLower + yUpper * kernel.α
-        update(kernel)
+        buffer[bufIdx] = muladd(yUpper, kernel.α, yLower)
+        update!(kernel)
     end
 
     kernel.inputDeficit = kernel.xIdx - xLen
@@ -667,7 +628,7 @@ function filt!(
     return bufIdx
 end
 
-function filt(self::FIRFilter{FIRArbitrary{Th}}, x::AbstractVector{Tx}) where {Th,Tx}
+function filt(self::FIRFilter{Tk}, x::AbstractVector{Tx}) where {Th,Tx,Tk<:FIRKernel{Th}}
     bufLen         = outputlength(self, length(x))
     buffer         = Vector{promote_type(Th,Tx)}(undef, bufLen)
     samplesWritten = filt!(buffer, self, x)

diff --git a/src/dspbase.jl b/src/dspbase.jl
@@ -1,7 +1,5 @@
 # This file was formerly a part of Julia. License is MIT: https://julialang.org/license
 
-import Base: trailingsize
-
 const SMALL_FILT_CUTOFF = 58
 
 _zerosi(b,a,T) = zeros(promote_type(eltype(b), eltype(a), T), max(length(a), length(b))-1)
@@ -39,38 +37,40 @@ function filt!(out::AbstractArray, b::Union{AbstractVector, Number}, a::Union{Ab
     bs = length(b)
     sz = max(as, bs)
     silen = sz - 1
-    ncols = trailingsize(x,2)
+    ncols = size(x, 2)
 
     if size(si, 1) != silen
         throw(ArgumentError("initial state vector si must have max(length(a),length(b))-1 rows"))
-    end
-    if N > 1 && trailingsize(si,2) != ncols
-        throw(ArgumentError("initial state vector si must be a vector or have the same number of columns as x"))
+    elseif N > 1 && size(si, 2) != ncols
+        throw(ArgumentError("initial state si must be a vector or have the same number of columns as x"))
     end
 
-    size(x,1) == 0 && return out
-    sz == 1 && return mul!(out, x, b[1]/a[1]) # Simple scaling without memory
+    iszero(size(x, 1)) && return out
+    isone(sz) && return (k = b[1] / a[1]; Compat.@noinline mul!(out, x, k)) # Simple scaling without memory
 
     # Filter coefficient normalization
-    if a[1] != 1
+    if !isone(a[1])
         norml = a[1]
-        a = a ./ norml
-        b = b ./ norml
+        a = Compat.@noinline broadcast(/, a, norml)
+        b = Compat.@noinline broadcast(/, b, norml)
     end
     # Pad the coefficients with zeros if needed
     bs<sz   && (b = copyto!(zeros(eltype(b), sz), b))
     1<as<sz && (a = copyto!(zeros(eltype(a), sz), a))
 
-    initial_si = si
-    for col = 1:ncols
-        # Reset the filter state
-        si = initial_si[:, N > 1 ? col : 1]
-        if as > 1
-            _filt_iir!(out, b, a, x, si, col)
-        elseif bs <= SMALL_FILT_CUTOFF
-            _small_filt_fir!(out, b, x, si, col)
-        else
-            _filt_fir!(out, b, x, si, col)
+    if as == 1 && bs <= SMALL_FILT_CUTOFF
+        _small_filt_fir!(out, b, x, si, Val(bs))
+    else
+        initial_si = si
+        si = similar(si, axes(si, 1))
+        for col = 1:ncols
+            # Reset the filter state
+            copyto!(si, view(initial_si, :, N > 1 ? col : 1))
+            if as > 1
+                _filt_iir!(out, b, a, x, si, col)
+            else
+                _filt_fir!(out, b, x, si, col)
+            end
         end
     end
     return out
@@ -79,75 +79,72 @@ end
 # Transposed direct form II
 function _filt_iir!(out, b, a, x, si, col)
     silen = length(si)
-    @inbounds for i=1:size(x, 1)
-        xi = x[i,col]
+    @inbounds for i in axes(x, 1)
+        xi = x[i, col]
         val = muladd(xi, b[1], si[1])
+        out[i, col] = val
         for j=1:(silen-1)
             si[j] = muladd(val, -a[j+1], muladd(xi, b[j+1], si[j+1]))
         end
         si[silen] = muladd(xi, b[silen+1], -a[silen+1]*val)
-        out[i,col] = val
     end
 end
 
 # Transposed direct form II
 function _filt_fir!(out, b, x, si, col)
     silen = length(si)
-    @inbounds for i=1:size(x, 1)
-        xi = x[i,col]
-        val = muladd(xi, b[1], si[1])
+    @inbounds for i in axes(x, 1)
+        xi = x[i, col]
+        out[i, col] = muladd(xi, b[1], si[1])
         for j=1:(silen-1)
             si[j] = muladd(xi, b[j+1], si[j+1])
         end
-        si[silen] = b[silen+1]*xi
-        out[i,col] = val
+        si[silen] = b[silen+1] * xi
     end
 end
 
 #
 # filt implementation for FIR filters (faster than Base)
 #
 
-for n = 2:SMALL_FILT_CUTOFF
-    silen = n-1
-    si = [Symbol("si$i") for i = 1:silen]
-    # Transposed direct form II
-    @eval function _filt_fir!(out, b::NTuple{$n,T}, x, siarr, col) where {T}
-        offset = (col - 1) * size(x, 1)
-
-        $(Expr(:block, [:(@inbounds $(si[i]) = siarr[$i]) for i = 1:silen]...))
-        @inbounds for i=1:size(x, 1)
-            xi = x[i+offset]
-            val = muladd(xi, b[1], $(si[1]))
-            $(Expr(:block, [:($(si[j]) = muladd(xi, b[$(j+1)], $(si[j+1]))) for j = 1:(silen-1)]...))
-            $(si[silen]) = b[$(silen+1)]*xi
-            out[i+offset] = val
+# Transposed direct form II
+@generated function _filt_fir!(out, b::NTuple{N,T}, x, siarr, col) where {N,T}
+    silen = N - 1
+    si_end = Symbol(:si_, silen)
+    SMALL_FILT_VECT_CUTOFF = 18
+    si_check = N > SMALL_FILT_VECT_CUTOFF ? :(nothing) : :(@assert length(siarr) == $silen)
+
+    q = quote
+        $si_check
+        Base.@nextract $silen si siarr
+        for i in axes(x, 1)
+            xi = x[i, col]
+            val = muladd(xi, b[1], si_1)
+            Base.@nexprs $(silen-1) j -> (si_j = muladd(xi, b[j+1], si_{j+1}))
+            $si_end = b[N] * xi
+            out[i, col] = val
         end
     end
-end
 
-# Convert array filter tap input to tuple for small-filtering
-let chain = :(throw(ArgumentError("invalid tuple size")))
-    for n = SMALL_FILT_CUTOFF:-1:2
-        chain = quote
-            if length(h) == $n
-                _filt_fir!(
-                    out,
-                    ($([:(@inbounds(h[$i])) for i = 1:n]...),),
-                    x,
-                    si,
-                    col
-                )
-            else
-                $chain
-            end
+    if N > SMALL_FILT_VECT_CUTOFF
+        loop_args = q.args[6].args[2].args
+        for i in (2, 10)
+            loop_args[i] = :(@inbounds $(loop_args[i]))
         end
     end
+    q
+end
 
-    @eval function _small_filt_fir!(
-        out::AbstractArray, h::AbstractVector{T}, x::AbstractArray, si, col
-    ) where T
-        $chain
+# Convert array filter tap input to tuple for small-filtering
+function _small_filt_fir!(
+    out::AbstractArray, h::AbstractVector, x::AbstractArray,
+        si::AbstractArray{S,N}, ::Val{bs}) where {S,N,bs}
+
+    bs < 2 && throw(ArgumentError("invalid tuple size"))
+    b = ntuple(j -> @inbounds(h[j]), Val(bs))
+    for col in axes(x, 2)
+        v_si = view(si, :, N > 1 ? col : 1)
+        _filt_fir!(out, b, x, v_si, col)
     end
 end
 
@@ -310,7 +307,7 @@ end
     buff = similar(u, nffts)
 
     p = plan_fft!(buff)
-    ip = plan_bfft!(buff)
+    ip = inv(p).p
 
     buff, buff, p, ip # Only one buffer for complex
 end
@@ -525,6 +522,7 @@ function unsafe_conv_kern_os!(out,
         lastfull > 1 ? [1:firstfull - 1, lastfull + 1 : nblock] : [1:nblock]
     end
     all_dims = 1:N
+    val_dims = ntuple(Val, Val(N))
     # Buffer to store ranges of indices for a single region of the perimeter
     perimeter_range = Vector{UnitRange{Int}}(undef, N)
 
@@ -540,15 +538,15 @@ function unsafe_conv_kern_os!(out,
     #                         2 | Edges of Cube
     #                         3 | Corners of Cube
     #
-    for n_edges in all_dims
+    for n_edges in val_dims
         unsafe_conv_kern_os_edge!(
             # These arrays and buffers will be mutated
             out,
             tdbuff,
             fdbuff,
             perimeter_range,
             # Number of edge dimensions to pad and convolve
-            Val(n_edges),
+            n_edges,
             # Data to be convolved
             u,
             filter_fd,
@@ -628,10 +626,11 @@ function _conv_kern_fft!(out, u, v, su, sv, outsize, nffts)
     upad = _zeropad(u, nffts)
     vpad = _zeropad(v, nffts)
     p! = plan_fft!(upad)
+    ip! = inv(p!)
     p! * upad # Operates in place on upad
     p! * vpad
     upad .*= vpad
-    ifft!(upad)
+    ip! * upad
     copyto!(out,
             CartesianIndices(out),
             upad,

diff --git a/test/FilterTestHelpers.jl b/test/FilterTestHelpers.jl
@@ -63,7 +63,7 @@ function zpkfilter_accuracy(f1, f2, accurate_f; relerr=1, compare_gain_at=nothin
     z2, p2 = sort(f2.z, lt=lt), sort(f2.p, lt=lt)
     accurate_z, accurate_p = sort(accurate_f.z, lt=lt), sort(accurate_f.p, lt=lt)
     if !isempty(z1) || !isempty(z2) || !isempty(accurate_z)
-        if eps != nothing
+        if eps !== nothing
             @test ≈(z1, accurate_z, atol=eps)
             @test ≈(z2, accurate_z, atol=eps)
         else
@@ -72,7 +72,7 @@ function zpkfilter_accuracy(f1, f2, accurate_f; relerr=1, compare_gain_at=nothin
         end
         accuracy_check(loss(z1, accurate_z), loss(z2, accurate_z), "z", relerr)
     end
-    if eps != nothing
+    if eps !== nothing
         @test ≈(p1, accurate_p, atol=eps)
         @test ≈(p2, accurate_p, atol=eps)
         @test ≈(f1.k, accurate_f.k, atol=eps)

diff --git a/test/dsp.jl b/test/dsp.jl
@@ -179,16 +179,16 @@ end
     end
 
     @testset "Overlap-Save" begin
-        to_sizetuple = (diml, N) -> ntuple(_ -> diml, N)
-        function os_test_data(eltype, diml, N)
+        to_sizetuple(diml, N) = ntuple(_ -> diml, N)
+        function os_test_data(::Type{T}, diml, N) where T
             s = to_sizetuple(diml, N)
-            arr = rand(eltype, s)
+            arr = rand(T, s)
             s, arr
         end
-        function test_os(eltype, nu, nv, N, nfft)
+        function test_os(::Type{T}, nu, nv, N, nfft) where T
             nffts = to_sizetuple(nfft, N)
-            su, u = os_test_data(eltype, nu, N)
-            sv, v = os_test_data(eltype, nv, N)
+            su, u = os_test_data(T, nu, N)
+            sv, v = os_test_data(T, nv, N)
             sout = su .+ sv .- 1
             out = _conv_similar(u, sout, axes(u), axes(v))
             unsafe_conv_kern_os!(out, u, v, su, sv, sout, nffts)
@@ -202,13 +202,9 @@ end
         nlarge = 128
 
         regular_nsmall = [12, 128]
-        for numdim in Ns
-            for elt in eltypes
-                for nsmall in regular_nsmall
-                    nfft = optimalfftfiltlength(nsmall, nlarge)
-                    test_os(elt, nlarge, nsmall, Val{numdim}(), nfft)
-                end
-            end
+        for numdim in Ns, elt in eltypes, nsmall in regular_nsmall
+            nfft = optimalfftfiltlength(nsmall, nlarge)
+            test_os(elt, nlarge, nsmall, Val{numdim}(), nfft)
         end
 
         # small = 12, fft = 256 exercises output being smaller than the normal
@@ -266,6 +262,8 @@ end
 
     # Issue #288
     @test xcorr(off_a, off_b, padmode = :longest) == OffsetVector(vcat(0, exp), -3:1)
+
+    @test_throws ArgumentError xcorr([1], [2]; padmode=:bug)
 end
 
 @testset "deconv" begin

diff --git a/test/filt_order.jl b/test/filt_order.jl
@@ -50,13 +50,13 @@ using DSP, Test
     # [n, Wn] = buttord(Wp, Ws, Rp, Rs)
     #
 
-    (nbp, Wnbp) = buttord(tuple(100 / 500, 200 / 500), tuple(50 / 500, 250 / 500), 3, 40, domain=:z)
+    (nbp, Wnbp) = buttord((100 / 500, 200 / 500), (50 / 500, 250 / 500), 3, 40, domain=:z)
     @test nbp == 8
     @test Wnbp[1] ≈ 0.195101359239
     @test Wnbp[2] ≈ 0.408043633382
 
     # s-domain
-    (nbps, Wnbps) = buttord(tuple(100 / 500, 200 / 500), tuple(50 / 500, 250 / 500), 3, 40, domain=:s)
+    (nbps, Wnbps) = buttord((100 / 500, 200 / 500), (50 / 500, 250 / 500), 3, 40, domain=:s)
     @test nbps == 9
     @test Wnbps[1] ≈ 0.198730150231
     @test Wnbps[2] ≈ 0.402555927759
@@ -70,13 +70,13 @@ using DSP, Test
 
     # this test may be more sensitive...MATLAB's implementation of bounded minimization
     # will yield different results in comparison to Optim.jl.
-    (nbs, Wnbs) = buttord(tuple(3200 / 22050, 7800 / 22050), tuple(4800 / 22050, 5600 / 22050), 2, 60, domain=:z)
+    (nbs, Wnbs) = buttord((3200 / 22050, 7800 / 22050), (4800 / 22050, 5600 / 22050), 2, 60, domain=:z)
     @test nbs == 5
     @test ≈(Wnbs[1], 0.172660908966, rtol=Δ)
     @test ≈(Wnbs[2], 0.314956388749, rtol=Δ)
 
     # s-domain
-    (nbss, Wnbss) = buttord(tuple(3200 / 22050, 7800 / 22050), tuple(4800 / 22050, 5600 / 22050), 2, 60, domain=:s)
+    (nbss, Wnbss) = buttord((3200 / 22050, 7800 / 22050), (4800 / 22050, 5600 / 22050), 2, 60, domain=:s)
     @test ≈(Wnbss[1], 0.173677826752, rtol=Δ)
     @test ≈(Wnbss[2], 0.318267164272, rtol=Δ)
 
@@ -88,8 +88,8 @@ end
 @testset "ellipord" begin
 
     Rp, Rs = 3, 40
-    Wp = tuple(0.2, 0.7)
-    Ws = tuple(0.1, 0.8)
+    Wp = (0.2, 0.7)
+    Ws = (0.1, 0.8)
 
     # Lowpass
     (nl, Wnl) = ellipord(0.1, 0.2, Rp, Rs, domain=:z)
@@ -127,8 +127,8 @@ end
 
 @testset "cheb1ord" begin
     Rp, Rs = 2, 70
-    Wp = tuple(0.2, 0.5)
-    Ws = tuple(0.1, 0.6)
+    Wp = (0.2, 0.5)
+    Ws = (0.1, 0.6)
 
     # Lowpass
     (nl, Wnl) = cheb1ord(0.1, 0.21, Rp, Rs, domain=:z)
@@ -165,8 +165,8 @@ end
 
 @testset "cheb2ord" begin
     Rp, Rs = 1.2, 80
-    Wp = tuple(0.22, 0.51)
-    Ws = tuple(0.14, 0.63)
+    Wp = (0.22, 0.51)
+    Ws = (0.14, 0.63)
 
     # Lowpass
     (nl, Wnl) = cheb2ord(0.1, 0.21, Rp, Rs, domain=:z)

diff --git a/test/filt_stream.jl b/test/filt_stream.jl
@@ -68,7 +68,7 @@ end
 # Single rate filtering
 #
 
-function test_singlerate(h, x)
+function test_singlerate(h::AbstractVector{T}, x::AbstractVector) where T
     xLen       = length(x)
     hLen       = length(h)
     pivotPoint = min(rand(50:150), div(xLen, 4))
@@ -81,8 +81,8 @@ function test_singlerate(h, x)
     @printfifinteractive( "___] | | \\| |__] |___ |___    |  \\ |  |  |  |___\n" )
     @printfifinteractive( "\nTesting single-rate fitering, h is %s, x is %s. xLen = %d, hLen = %d", string(eltype(h)), string(eltype(x)), xLen, hLen )
 
-    @printfifinteractive( "\n\tBase.filt\n\t\t")
-    @timeifinteractive naiveResult = filt(h, 1.0, x)
+    @printfifinteractive( "\n\tfilt\n\t\t")
+    @timeifinteractive naiveResult = filt(h, one(T), x)
 
     @printfifinteractive( "\n\tDSP.filt( h, x, 1//1 )\n\t\t" )
     @timeifinteractive statelesResult = DSP.filt( h, x )