From 79fc288505d0bf435473337bb5d1ee64ebe58980 Mon Sep 17 00:00:00 2001
From: Galen Lynch <galen.lynch@gmail.com>
Date: Tue, 4 Jun 2019 16:53:22 -0400
Subject: [PATCH 01/19] Convert arguments to FIRArbitrary to expected types
 (#304)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The parameters `Nϕ` and `rate` were not converted to the expected types prior to
constructing `FIRArbitrary` objects.

Fixes #302
---
 src/Filters/stream_filt.jl | 19 +++++++++++++++++--
 test/resample.jl           | 10 ++++++++++
 2 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/src/Filters/stream_filt.jl b/src/Filters/stream_filt.jl
index bbddc79a8..617649a04 100644
--- a/src/Filters/stream_filt.jl
+++ b/src/Filters/stream_filt.jl
@@ -105,7 +105,9 @@ mutable struct FIRArbitrary{T} <: FIRKernel{T}
     hLen::Int
 end
 
-function FIRArbitrary(h::Vector, rate::Real, Nϕ::Integer)
+function FIRArbitrary(h::Vector, rate_in::Real, Nϕ_in::Integer)
+    rate         = convert(Float64, rate_in)
+    Nϕ           = convert(Int, Nϕ_in)
     dh           = [diff(h); zero(eltype(h))]
     pfb          = taps2pfb(h,  Nϕ)
     dpfb         = taps2pfb(dh, Nϕ)
@@ -117,7 +119,20 @@ function FIRArbitrary(h::Vector, rate::Real, Nϕ::Integer)
     inputDeficit = 1
     xIdx         = 1
     hLen         = length(h)
-    FIRArbitrary(rate, pfb, dpfb, Nϕ, tapsPerϕ, ϕAccumulator, ϕIdx, α, Δ, inputDeficit, xIdx, hLen)
+    FIRArbitrary(
+        rate,
+        pfb,
+        dpfb,
+        Nϕ,
+        tapsPerϕ,
+        ϕAccumulator,
+        ϕIdx,
+        α,
+        Δ,
+        inputDeficit,
+        xIdx,
+        hLen
+    )
 end
 
 
diff --git a/test/resample.jl b/test/resample.jl
index 2a297518e..4faf601a5 100644
--- a/test/resample.jl
+++ b/test/resample.jl
@@ -59,6 +59,16 @@ end
     idxUpper = idxLower*2
     yDelta   = abs.(y[idxLower:idxUpper].-yy[idxLower:idxUpper])
     @test all(map(delta -> abs(delta) < 0.005, yDelta))
+
+    # Test Float32 ratio (#302)
+    f32_ratio = convert(Float32, ratio)
+    f32_y     = resample(x, f32_ratio)
+    ty        = range(0, stop =cycles, length =yLen)
+    yy        = sinpi.(2*ty)
+    idxLower  = round(Int, yLen/3)
+    idxUpper  = idxLower*2
+    yDelta    = abs.(f32_y[idxLower:idxUpper].-yy[idxLower:idxUpper])
+    @test all(map(delta -> abs(delta) < 0.005, yDelta))
 end
 
 @testset "resample_filter" begin

From c3c5b029e404d20d8f9ad4428906239d414335f2 Mon Sep 17 00:00:00 2001
From: Galen Lynch <galen.lynch@gmail.com>
Date: Tue, 4 Jun 2019 19:32:33 -0400
Subject: [PATCH 02/19] Allow cross correlations to work with arbitrary offsets
 and padding (#305)

* Allow cross correlations to work with arbitrary offsets and padding

Cross-correlation of input arrays with arbitrary indexing did not work in
conjunction with padding, because the padding did not preserve the indexing.
Adding a zero-padding function that preserves indexing, `_zeropad_keep_offset`,
fixes this problem. The `_zeropad` function works the same as before, and
returns an array with base-1 indexing. I have added `_zeropad!` methods that
also work with arbitrary indexing for the padded array, which are dispatched
based on the axes of the of padded array.

Fixes #288
---
 src/dspbase.jl | 62 ++++++++++++++++++++++++++++++++++++++++++++------
 test/dsp.jl    |  3 +++
 2 files changed, 58 insertions(+), 7 deletions(-)

diff --git a/src/dspbase.jl b/src/dspbase.jl
index 5032b73cc..4ecadd3b4 100644
--- a/src/dspbase.jl
+++ b/src/dspbase.jl
@@ -170,20 +170,37 @@ function deconv(b::StridedVector{T}, a::StridedVector{T}) where T
     filt(b, a, x)
 end
 
-# padded must start at index 1, but u can have arbitrary offset
-function _zeropad!(padded::AbstractVector, u::AbstractVector)
+function _zeropad!(
+    padded::AbstractVector, u::AbstractVector, ::Tuple{Base.OneTo{Int}}
+)
     datasize = length(u)
     # Use axes to accommodate arrays that do not start at index 1
     padsize = length(padded)
     data_first_i = first(axes(u, 1))
     copyto!(padded, 1, u, data_first_i, datasize)
     padded[1 + datasize:padsize] .= 0
+
     padded
 end
 
-# padded must start at index 1, but u can have arbitrary offset
-function _zeropad!(padded::AbstractArray{<:Any, N},
-                   u::AbstractArray{<:Any, N}) where N
+function _zeropad!(
+    padded::AbstractVector, u::AbstractVector, pad_axes
+)
+    datasize = length(u)
+    # Use axes to accommodate arrays that do not start at index 1
+    data_first_i = first(axes(u, 1))
+    pad_first_i = first(pad_axes[1])
+    copyto!(padded, pad_first_i, u, data_first_i, datasize)
+    padded[pad_first_i + datasize : last(pad_axes[1])] .= 0
+
+    padded
+end
+
+function _zeropad!(
+    padded::AbstractArray{<:Any, N},
+    u::AbstractArray{<:Any, N},
+    ::NTuple{<:Any, Base.OneTo{Int}}
+) where N
     # Copy the data to the beginning of the padded array
     fill!(padded, zero(eltype(padded)))
     pad_data_ranges = UnitRange.(1, size(u))
@@ -191,10 +208,41 @@ function _zeropad!(padded::AbstractArray{<:Any, N},
 
     padded
 end
+
+function _zeropad!(
+    padded::AbstractArray{<:Any, N},
+    u::AbstractArray{<:Any, N},
+    pad_axes
+) where N
+    # Copy the data to the beginning of the padded array
+    fill!(padded, zero(eltype(padded)))
+    pad_first_i = first.(pad_axes)
+    pad_data_ranges = UnitRange.(pad_first_i, pad_first_i .+ size(u) .- 1)
+    copyto!(padded, CartesianIndices(pad_data_ranges), u, CartesianIndices(u))
+
+    padded
+end
+
+_zeropad!(padded, u) = _zeropad!(padded, u, axes(padded))
+
 function _zeropad(u, padded_size)
     _zeropad!(similar(u, padded_size), u)
 end
 
+function _zeropad_keep_offset(u, padded_size, ::NTuple{<:Any, Base.OneTo{Int}})
+    _zeropad(u, padded_size)
+end
+
+function _zeropad_keep_offset(u, padded_size, axes_u)
+    ax_starts = first.(axes_u)
+    new_axes = UnitRange.(ax_starts, ax_starts .+ padded_size .- 1)
+    _zeropad!(similar(u, new_axes), u, new_axes)
+end
+
+_zeropad_keep_offset(u, padded_size) = _zeropad_keep_offset(
+    u, padded_size, axes(u)
+)
+
 function _conv(
     u::AbstractArray{T, N}, v::AbstractArray{T, N}, paddims
 ) where {T<:Real, N}
@@ -357,9 +405,9 @@ function xcorr(
     padmode = check_padmode_kwarg(padmode, su, sv)
     if padmode == :longest
         if su < sv
-            u = _zeropad(u, sv)
+            u = _zeropad_keep_offset(u, sv)
         elseif sv < su
-            v = _zeropad(v, su)
+            v = _zeropad_keep_offset(v, su)
         end
         conv(u, dsp_reverse(conj(v), axes(v)))
     elseif padmode == :none
diff --git a/test/dsp.jl b/test/dsp.jl
index c6bef9b8e..4522fe0ec 100644
--- a/test/dsp.jl
+++ b/test/dsp.jl
@@ -205,6 +205,9 @@ end
     off_a = OffsetVector(a, -1:1)
     off_b = OffsetVector(b, 0:1)
     @test xcorr(off_a, off_b, padmode = :none) == OffsetVector(exp, -2:1)
+
+    # Issue #288
+    @test xcorr(off_a, off_b, padmode = :longest) == OffsetVector(vcat(0, exp), -3:1)
 end
 
 @testset "deconv" begin

From e07dc102bcc883bdbee99257ebf62eef1f6e643c Mon Sep 17 00:00:00 2001
From: Alex Arslan <ararslan@comcast.net>
Date: Sat, 10 Aug 2019 10:27:31 -0700
Subject: [PATCH 03/19] Drop Julia 0.7 and Compat dependency (#314)

We can also delete the REQUIRE files since they aren't used.
---
 .gitignore   | 2 ++
 .travis.yml  | 1 -
 Project.toml | 4 +---
 REQUIRE      | 6 ------
 src/DSP.jl   | 3 ---
 test/REQUIRE | 1 -
 6 files changed, 3 insertions(+), 14 deletions(-)
 delete mode 100644 REQUIRE
 delete mode 100644 test/REQUIRE

diff --git a/.gitignore b/.gitignore
index 05461b3b2..fedd299ab 100644
--- a/.gitignore
+++ b/.gitignore
@@ -29,3 +29,5 @@
 /usr
 
 /docs/build
+
+Manifest.toml
diff --git a/.travis.yml b/.travis.yml
index b909c3557..ca08076be 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,7 +2,6 @@ language: julia
 os:
   - linux
 julia:
-  - 0.7
   - 1.0
   - 1.1
   - nightly
diff --git a/Project.toml b/Project.toml
index d914c404b..d6526c658 100644
--- a/Project.toml
+++ b/Project.toml
@@ -3,7 +3,6 @@ uuid = "717857b8-e6f2-59f4-9121-6e50c889abd2"
 version = "0.5.2"
 
 [deps]
-Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"
 FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Polynomials = "f27b6e38-b328-58d1-80ce-0feddd5e7a45"
@@ -13,9 +12,8 @@ SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 
 [compat]
-Compat = ">= 1.3.0"
 Polynomials = ">= 0.1.0"
-julia = "0.7,1"
+julia = "1"
 
 [extras]
 DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
diff --git a/REQUIRE b/REQUIRE
deleted file mode 100644
index dcf50b192..000000000
--- a/REQUIRE
+++ /dev/null
@@ -1,6 +0,0 @@
-julia 0.7
-Polynomials 0.1.0
-Reexport
-SpecialFunctions
-FFTW
-Compat 1.3
diff --git a/src/DSP.jl b/src/DSP.jl
index 581a30a7a..57e68ba58 100644
--- a/src/DSP.jl
+++ b/src/DSP.jl
@@ -1,10 +1,7 @@
-__precompile__()
-
 module DSP
 
 using FFTW
 using LinearAlgebra: mul!, rmul!
-using Compat: range
 
 export conv, conv2, deconv, filt, filt!, xcorr
 
diff --git a/test/REQUIRE b/test/REQUIRE
deleted file mode 100644
index 4d5b425cf..000000000
--- a/test/REQUIRE
+++ /dev/null
@@ -1 +0,0 @@
-OffsetArrays
\ No newline at end of file

From 1ca3c74582a4ea6082609993539b1b8ee2db5ea0 Mon Sep 17 00:00:00 2001
From: Alex Arslan <ararslan@comcast.net>
Date: Mon, 12 Aug 2019 15:30:49 -0700
Subject: [PATCH 04/19] Set version to 0.6.0 (#315)

In preparation for a new release.
---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index d6526c658..268398633 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "DSP"
 uuid = "717857b8-e6f2-59f4-9121-6e50c889abd2"
-version = "0.5.2"
+version = "0.6.0"
 
 [deps]
 FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"

From ce1e15c9b703302c39133db60835bbfc31ec3b0e Mon Sep 17 00:00:00 2001
From: Alex Arslan <ararslan@comcast.net>
Date: Sat, 9 Nov 2019 16:38:48 -0800
Subject: [PATCH 05/19] Accommodate AbstractFFTs PR 26 (#322)

Bound FFTW to 1.1 to avoid conflicting export of Frequencies et al., set
verion to 0.6.1.
---
 Project.toml        |  8 ++++++--
 docs/src/util.md    |  2 --
 src/deprecated.jl   |  4 ++++
 src/periodograms.jl |  7 ++++---
 src/util.jl         | 47 ---------------------------------------------
 test/runtests.jl    |  1 +
 test/util.jl        | 21 --------------------
 7 files changed, 15 insertions(+), 75 deletions(-)

diff --git a/Project.toml b/Project.toml
index 268398633..e2c62fb41 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "DSP"
 uuid = "717857b8-e6f2-59f4-9121-6e50c889abd2"
-version = "0.6.0"
+version = "0.6.1"
 
 [deps]
 FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
@@ -12,7 +12,11 @@ SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 
 [compat]
-Polynomials = ">= 0.1.0"
+FFTW = "1.1"
+OffsetArrays = "0.11"
+Polynomials = "0.6"
+Reexport = "0.2"
+SpecialFunctions = "0.8"
 julia = "1"
 
 [extras]
diff --git a/docs/src/util.md b/docs/src/util.md
index 2f41a8fc6..d7e74def7 100644
--- a/docs/src/util.md
+++ b/docs/src/util.md
@@ -9,8 +9,6 @@ end
 unwrap
 unwrap!
 hilbert
-fftfreq
-rfftfreq
 nextfastfft
 pow2db
 amp2db
diff --git a/src/deprecated.jl b/src/deprecated.jl
index bea953778..c3ddc2aa5 100644
--- a/src/deprecated.jl
+++ b/src/deprecated.jl
@@ -4,4 +4,8 @@ Base.@deprecate_binding TFFilter PolynomialRatio
 Base.@deprecate_binding BiquadFilter Biquad
 Base.@deprecate_binding SOSFilter SecondOrderSections
 
+Base.@deprecate Frequencies(nreal::Int, n::Int, multiplier::Float64) FFTW.Frequencies(nreal, n, multiplier)
+Base.@deprecate fftfreq(n::Int, fs::Real=1) FFTW.fftfreq(n, fs)
+Base.@deprecate rfftfreq(n::Int, fs::Real=1) FFTW.rfftfreq(n, fs)
+
 @deprecate conv2 conv
diff --git a/src/periodograms.jl b/src/periodograms.jl
index f4dad9e13..beed3baa2 100644
--- a/src/periodograms.jl
+++ b/src/periodograms.jl
@@ -6,6 +6,7 @@ using ..Util, ..Windows
 export arraysplit, nextfastfft, periodogram, welch_pgram, mt_pgram,
        spectrogram, power, freq, stft
 using FFTW
+import FFTW: Frequencies, fftfreq, rfftfreq
 
 ## ARRAY SPLITTER
 
@@ -218,11 +219,11 @@ See also: [`fftfreq`](@ref), [`rfftfreq`](@ref)
 freq(p::TFR) = p.freq
 freq(p::Periodogram2) = (p.freq1, p.freq2)
 FFTW.fftshift(p::Periodogram{T,F}) where {T,F<:Frequencies} =
-    Periodogram(p.freq.nreal == p.freq.n ? p.power : fftshift(p.power), fftshift(p.freq))
+    Periodogram(p.freq.n_nonnegative == p.freq.n ? p.power : fftshift(p.power), fftshift(p.freq))
 FFTW.fftshift(p::Periodogram{T,F}) where {T,F<:AbstractRange} = p
 # 2-d
 FFTW.fftshift(p::Periodogram2{T,F1,F2}) where {T,F1<:Frequencies,F2<:Frequencies} =
-    Periodogram2(p.freq1.nreal == p.freq1.n ? fftshift(p.power,2) : fftshift(p.power), fftshift(p.freq1), fftshift(p.freq2))
+    Periodogram2(p.freq1.n_nonnegative == p.freq1.n ? fftshift(p.power,2) : fftshift(p.power), fftshift(p.freq1), fftshift(p.freq2))
 FFTW.fftshift(p::Periodogram2{T,F1,F2}) where {T,F1<:AbstractRange,F2<:Frequencies} =
     Periodogram2(fftshift(p.power,2), p.freq1, fftshift(p.freq2))
 FFTW.fftshift(p::Periodogram2{T,F1,F2}) where {T,F1<:AbstractRange,F2<:AbstractRange} = p
@@ -442,7 +443,7 @@ struct Spectrogram{T,F<:Union{Frequencies,AbstractRange}} <: TFR{T}
     time::FloatRange{Float64}
 end
 FFTW.fftshift(p::Spectrogram{T,F}) where {T,F<:Frequencies} =
-    Spectrogram(p.freq.nreal == p.freq.n ? p.power : fftshift(p.power, 1), fftshift(p.freq), p.time)
+    Spectrogram(p.freq.n_nonnegative == p.freq.n ? p.power : fftshift(p.power, 1), fftshift(p.freq), p.time)
 FFTW.fftshift(p::Spectrogram{T,F}) where {T,F<:AbstractRange} = p
 
 """
diff --git a/src/util.jl b/src/util.jl
index 21a3b6a30..b62a61a25 100644
--- a/src/util.jl
+++ b/src/util.jl
@@ -108,53 +108,6 @@ fftabs2type(::Type{Complex{T}}) where {T<:FFTW.fftwReal} = T
 fftabs2type(::Type{T}) where {T<:FFTW.fftwReal} = T
 fftabs2type(::Type{T}) where {T<:Union{Real,Complex}} = Float64
 
-## FREQUENCY VECTOR
-
-struct Frequencies <: AbstractVector{Float64}
-    nreal::Int
-    n::Int
-    multiplier::Float64
-end
-
-unsafe_getindex(x::Frequencies, i::Int) =
-    (i-1+ifelse(i <= x.nreal, 0, -x.n))*x.multiplier
-function Base.getindex(x::Frequencies, i::Int)
-    (i >= 1 && i <= x.n) || throw(BoundsError())
-    unsafe_getindex(x, i)
-end
-if isdefined(Base, :iterate)
-    function Base.iterate(x::Frequencies, i::Int=1)
-        i > x.n ? nothing : (unsafe_getindex(x,i), i + 1)
-    end
-else
-    Base.start(x::Frequencies) = 1
-    Base.next(x::Frequencies, i::Int) = (unsafe_getindex(x, i), i+1)
-    Base.done(x::Frequencies, i::Int) = i > x.n
-end
-Base.size(x::Frequencies) = (x.n,)
-Base.step(x::Frequencies) = x.multiplier
-
-"""
-    fftfreq(n, fs=1)
-
-Return discrete fourier transform sample frequencies. The returned
-Frequencies object is an AbstractVector containing the frequency
-bin centers at every sample point. `fs` is the sample rate of the
-input signal.
-"""
-fftfreq(n::Int, fs::Real=1) = Frequencies(((n-1) >> 1)+1, n, fs/n)
-
-"""
-    rfftfreq(n, fs=1)
-
-Return discrete fourier transform sample frequencies for use with
-`rfft`. The returned Frequencies object is an AbstractVector
-containing the frequency bin centers at every sample point. `fs`
-is the sample rate of the input signal.
-"""
-rfftfreq(n::Int, fs::Real=1) = Frequencies((n >> 1)+1, (n >> 1)+1, fs/n)
-FFTW.fftshift(x::Frequencies) = (x.nreal-x.n:x.nreal-1)*x.multiplier
-
 # Get next fast FFT size for a given signal length
 const FAST_FFT_SIZES = [2, 3, 5, 7]
 """
diff --git a/test/runtests.jl b/test/runtests.jl
index d418e2750..74af5b3fb 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,4 +1,5 @@
 using  DSP, FFTW, Test
+import DSP: Frequencies, fftfreq, rfftfreq
 
 using Random: seed!
 
diff --git a/test/util.jl b/test/util.jl
index 44884a468..73af99f72 100644
--- a/test/util.jl
+++ b/test/util.jl
@@ -50,27 +50,6 @@ using Statistics: mean
 end
 
 @testset "fft helpers" begin
-    ## FFTFREQ
-    @test fftfreq(1) ≈ [0.]
-    @test fftfreq(2) ≈ [0., -1/2]
-    @test fftfreq(2, 1/2) ≈ [0., -1/4]
-    @test fftfreq(3) ≈ [0., 1/3, -1/3]
-    @test fftfreq(3, 1/2) ≈ [0., 1/6, -1/6]
-    @test fftfreq(6) ≈ [0., 1/6, 1/3, -1/2, -1/3, -1/6]
-    @test fftfreq(7) ≈ [0., 1/7, 2/7, 3/7, -3/7, -2/7, -1/7]
-
-    @test rfftfreq(1) ≈ [0.]
-    @test rfftfreq(2) ≈ [0., 1/2]
-    @test rfftfreq(2, 1/2) ≈ [0., 1/4]
-    @test rfftfreq(3) ≈ [0., 1/3]
-    @test rfftfreq(3, 1/2) ≈ [0., 1/6]
-    @test rfftfreq(6) ≈ [0., 1/6, 1/3, 1/2]
-    @test rfftfreq(7) ≈ [0., 1/7, 2/7, 3/7]
-
-    for n = 1:7
-        @test fftshift(fftfreq(n)) ≈ fftshift([fftfreq(n);])
-    end
-
     @test meanfreq(sin.(2*π*10*(0:1e-3:10*π)),1e3) ≈ 10.0 rtol=1e-3
 
     # nextfastfft

From 38622b7600f8e4fd62334244881cc16f6e08e5f0 Mon Sep 17 00:00:00 2001
From: Brian Hawkins <Brian.P.Hawkins@jpl.nasa.gov>
Date: Thu, 5 Sep 2019 16:24:34 -0700
Subject: [PATCH 06/19] Fix filtfilt so it works for real coefficents and
 complex data.

---
 src/Filters/filt.jl |  4 ++--
 test/filt.jl        | 13 +++++++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/src/Filters/filt.jl b/src/Filters/filt.jl
index dfe20c85b..153048f8b 100644
--- a/src/Filters/filt.jl
+++ b/src/Filters/filt.jl
@@ -237,9 +237,9 @@ end
 # Zero phase digital filtering by processing data in forward and reverse direction
 function iir_filtfilt(b::AbstractVector, a::AbstractVector, x::AbstractArray)
     zi = filt_stepstate(b, a)
-    zitmp = copy(zi)
     pad_length = 3 * (max(length(a), length(b)) - 1)
     t = Base.promote_eltype(b, a, x)
+    zitmp = similar(zi, t)
     extrapolated = Vector{t}(undef, size(x, 1)+pad_length*2)
     out = similar(x, t)
 
@@ -312,9 +312,9 @@ end
 # Zero phase digital filtering for second order sections
 function filtfilt(f::SecondOrderSections{T,G}, x::AbstractArray{S}) where {T,G,S}
     zi = filt_stepstate(f)
-    zitmp = similar(zi)
     pad_length = 6 * length(f.biquads)
     t = Base.promote_type(T, G, S)
+    zitmp = similar(zi, t)
     extrapolated = Vector{t}(undef, size(x, 1)+pad_length*2)
     out = similar(x, t)
 
diff --git a/test/filt.jl b/test/filt.jl
index 1f237cb78..e04b360bc 100644
--- a/test/filt.jl
+++ b/test/filt.jl
@@ -152,6 +152,19 @@ end
     @test x2_matlab ≈ filtfilt(b, a, x)
 end
 
+# Make sure above doesn't crash for real coeffs & complex data.
+@testset "filtfilt 1D Complex" begin
+    x2_matlab = readdlm(joinpath(dirname(@__FILE__), "data", "filtfilt_output.txt"),'\t')
+
+    b = [ 0.00327922,  0.01639608,  0.03279216,  0.03279216,  0.01639608,  0.00327922]
+    a = [ 1.        , -2.47441617,  2.81100631, -1.70377224,  0.54443269, -0.07231567]
+    x  = readdlm(joinpath(dirname(@__FILE__), "data", "spectrogram_x.txt"),'\t')
+
+    y = x .+ 1im .* randn(size(x, 1))
+
+    @test x2_matlab ≈ real.(filtfilt(b, a, y))
+end
+
 #######################################
 #
 # Test 2d filtfilt against matlab results

From 1bf3061b7257fe9efb2711004efa8ed2cddb878c Mon Sep 17 00:00:00 2001
From: Galen Lynch <galen.lynch@gmail.com>
Date: Sun, 24 Nov 2019 09:21:55 -0500
Subject: [PATCH 07/19] Fix CI for building docs

Docs were failing because the Documenter.jl version used was uncontrolled, and
the interface for specifying HTML output has changed in recent version of
Documenter.jl. I have specified which version of Documenter.jl to use as the
current version, and updated our usage of `makedocs` accordingly.
---
 docs/Project.toml | 3 +++
 docs/make.jl      | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/docs/Project.toml b/docs/Project.toml
index cc17d9b3a..65d072b4e 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -1,3 +1,6 @@
 [deps]
 DSP = "717857b8-e6f2-59f4-9121-6e50c889abd2"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
+
+[compat]
+Documenter = "0.24"
diff --git a/docs/make.jl b/docs/make.jl
index d58ef4396..409adec6a 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -2,7 +2,7 @@ using Documenter, DSP
 
 makedocs(
     modules = [DSP],
-    format = :html,
+    format = Documenter.HTML(),
     sitename = "DSP.jl",
     pages = Any[
         "Contents" => "contents.md",

From 9e1cb9062a1eb7967693add8812b34923c7b2832 Mon Sep 17 00:00:00 2001
From: Galen Lynch <galen.lynch@gmail.com>
Date: Sat, 23 Nov 2019 10:17:03 -0500
Subject: [PATCH 08/19] Remove polyfit from Util submodule

`polyfit` is exported from, but not defined by, the Util submodule. This commit
removes the export.

Fixes #319
---
 src/util.jl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/util.jl b/src/util.jl
index b62a61a25..c08803f6d 100644
--- a/src/util.jl
+++ b/src/util.jl
@@ -23,7 +23,6 @@ export  hilbert,
         rmsfft,
         meanfreq,
         unsafe_dot,
-        polyfit,
         shiftin!,
         finddelay,
         shiftsignal,

From 9ecb495ebe87e42dc5d527484d262042666ff1a3 Mon Sep 17 00:00:00 2001
From: Galen Lynch <galen.lynch@gmail.com>
Date: Sun, 24 Nov 2019 11:44:48 -0500
Subject: [PATCH 09/19] Add `using DSP` to all doctests

Some doctests were failing because `using DSP` was not specifically included in
the example. I have added a setup step in doc/make.jl that includes the DSP
module in all doctests, so that examples that do not explicitly include `using
DSP` will still be able to use DSP functions.
---
 docs/make.jl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/make.jl b/docs/make.jl
index 409adec6a..7a7ecb52e 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -1,5 +1,7 @@
 using Documenter, DSP
 
+DocMeta.setdocmeta!(DSP, :DocTestSetup, :(using DSP); recursive=true)
+
 makedocs(
     modules = [DSP],
     format = Documenter.HTML(),

From bfaeaf4302cb104f3f4e6515275b96dda00a5cbb Mon Sep 17 00:00:00 2001
From: Galen Lynch <galen.lynch@gmail.com>
Date: Sun, 24 Nov 2019 11:47:49 -0500
Subject: [PATCH 10/19] Add Julia 1.2 to the CI matrix

---
 .travis.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.travis.yml b/.travis.yml
index ca08076be..d862fc82c 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,6 +4,7 @@ os:
 julia:
   - 1.0
   - 1.1
+  - 1.2
   - nightly
 matrix:
   allow_failures:

From 8edec5c75a9461869c324ff6db4eefd7fd789a35 Mon Sep 17 00:00:00 2001
From: Galen Lynch <galen.lynch@gmail.com>
Date: Sun, 24 Nov 2019 15:46:15 -0500
Subject: [PATCH 11/19] Remove references to fftfreq in documentation

These are no longer in DSP.jl, but links to their documentation were still
present in the documentation of DSP.jl.
---
 docs/src/util.md    | 15 ++++++---------
 src/periodograms.jl |  2 --
 2 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/docs/src/util.md b/docs/src/util.md
index d7e74def7..c3a75dc88 100644
--- a/docs/src/util.md
+++ b/docs/src/util.md
@@ -1,9 +1,10 @@
 # `Util` - utility functions
-```@meta
-DocTestSetup = quote
-    using DSP
-end
-```
+
+!!! note
+    As of version 0.6.1 of DSP.jl, `fftfreq` and `rfftfreq` have been moved from
+    DSP.jl to [AbstractFFTs.jl](https://github.com/JuliaMath/AbstractFFTs.jl)
+    version 0.5 and above. You can also access these functions through
+    [FFTW.jl](https://github.com/JuliaMath/FFTW.jl) version 1.1 and above.
 
 ```@docs
 unwrap
@@ -23,7 +24,3 @@ shiftsignal!
 alignsignals
 alignsignals!
 ```
-
-```@meta
-DocTestSetup = nothing
-```
diff --git a/src/periodograms.jl b/src/periodograms.jl
index beed3baa2..b33d4718e 100644
--- a/src/periodograms.jl
+++ b/src/periodograms.jl
@@ -213,8 +213,6 @@ Spectrogram object.
 
 Returns a tuple of frequency bin centers for a given Periodogram2
 object.
-
-See also: [`fftfreq`](@ref), [`rfftfreq`](@ref)
 """
 freq(p::TFR) = p.freq
 freq(p::Periodogram2) = (p.freq1, p.freq2)

From d7e0d8d15008a79928d50a86d014a645e47caac8 Mon Sep 17 00:00:00 2001
From: Galen Lynch <galen.lynch@gmail.com>
Date: Mon, 25 Nov 2019 16:59:53 -0500
Subject: [PATCH 12/19] Create CompatHelper.yml

---
 .github/workflows/CompatHelper.yml | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 .github/workflows/CompatHelper.yml

diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml
new file mode 100644
index 000000000..68dbe39cd
--- /dev/null
+++ b/.github/workflows/CompatHelper.yml
@@ -0,0 +1,24 @@
+name: CompatHelper
+
+on:
+  schedule:
+    - cron: '00 * * * *'
+
+jobs:
+  CompatHelper:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        julia-version: [1.2.0]
+        julia-arch: [x86]
+        os: [ubuntu-latest]
+    steps:
+      - uses: julia-actions/setup-julia@latest
+        with:
+          version: ${{ matrix.julia-version }}
+      - name: Pkg.add("CompatHelper")
+        run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
+      - name: CompatHelper.main()
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: julia -e 'using CompatHelper; CompatHelper.main()'

From 07a13a1a991e11a049ef96caef736f9bbaf75001 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Wed, 27 Nov 2019 05:10:31 +0000
Subject: [PATCH 13/19] CompatHelper: bump compat for "SpecialFunctions" to
 "0.9"

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index e2c62fb41..ab099d162 100644
--- a/Project.toml
+++ b/Project.toml
@@ -16,7 +16,7 @@ FFTW = "1.1"
 OffsetArrays = "0.11"
 Polynomials = "0.6"
 Reexport = "0.2"
-SpecialFunctions = "0.8"
+SpecialFunctions = "0.8, 0.9"
 julia = "1"
 
 [extras]

From 05a58452dd33483dfcf2905a3a030ba7a122a0ec Mon Sep 17 00:00:00 2001
From: Galen Lynch <galen.lynch@gmail.com>
Date: Wed, 27 Nov 2019 06:44:31 -0500
Subject: [PATCH 14/19] Create TagBot.yml

---
 .github/workflows/TagBot.yml | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 .github/workflows/TagBot.yml

diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml
new file mode 100644
index 000000000..150f0c4e9
--- /dev/null
+++ b/.github/workflows/TagBot.yml
@@ -0,0 +1,12 @@
+name: TagBot
+on:
+  schedule:
+    - cron: 0 * * * *
+jobs:
+  TagBot:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v1
+      - uses: JuliaRegistries/TagBot@v1
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}

From 7328081b1fb3b1a1e8035a99cf5eec4da66a93ad Mon Sep 17 00:00:00 2001
From: Galen Lynch <galen.lynch@gmail.com>
Date: Wed, 27 Nov 2019 22:32:34 -0500
Subject: [PATCH 15/19] Add Julia 1.3 to CI (#331)

Add Julia 1.3 to CI, and also use version "1" to use latest tagged 1.x version of Julia.
---
 .travis.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.travis.yml b/.travis.yml
index d862fc82c..dfe843061 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -5,6 +5,8 @@ julia:
   - 1.0
   - 1.1
   - 1.2
+  - 1.3
+  - 1
   - nightly
 matrix:
   allow_failures:

From 63a639c5d97aff783a91c1521211b3273a5a299c Mon Sep 17 00:00:00 2001
From: Galen Lynch <galen.lynch@gmail.com>
Date: Wed, 27 Nov 2019 22:27:36 -0500
Subject: [PATCH 16/19] Bump version

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index ab099d162..ddc828fde 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "DSP"
 uuid = "717857b8-e6f2-59f4-9121-6e50c889abd2"
-version = "0.6.1"
+version = "0.6.2"
 
 [deps]
 FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"

From fa33e6c4a9ddaf3f2afba5ebe4f3421ccda28516 Mon Sep 17 00:00:00 2001
From: Galen Lynch <galen.lynch@gmail.com>
Date: Fri, 12 Apr 2019 19:01:29 -0400
Subject: [PATCH 17/19] Overlap-save convolution

I have added an overlap-save algorithm to convolution, which is substantially
faster (~2x) for large convolution problems. It is also substantially faster
than other packages like ImageFiltering.jl. For smaller convolutions, it is
about the same or a little bit slower than standard fft convolution. The cutoff
between overlap-save and simple fft convolution is pretty arbitrary at the
moment, so it would be nice to add some benchmarking to allow DSP.jl to choose
between the two algorithms depending on the size of the input. It would also be
nice to add spatial/time domain filtering for small convolutions, as it would
almost certainly be faster than fft convolutions. This is relevant to #224.
---
 Project.toml           |   2 +
 src/DSP.jl             |   1 +
 src/Filters/Filters.jl |   2 +-
 src/Filters/filt.jl    |  26 --
 src/dspbase.jl         | 557 +++++++++++++++++++++++++++++++++++------
 src/util.jl            |   4 +-
 test/dsp.jl            |  54 ++++
 7 files changed, 536 insertions(+), 110 deletions(-)

diff --git a/Project.toml b/Project.toml
index ddc828fde..ca99f7c7f 100644
--- a/Project.toml
+++ b/Project.toml
@@ -10,6 +10,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
 
 [compat]
 FFTW = "1.1"
@@ -18,6 +19,7 @@ Polynomials = "0.6"
 Reexport = "0.2"
 SpecialFunctions = "0.8, 0.9"
 julia = "1"
+IterTools = "1"
 
 [extras]
 DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
diff --git a/src/DSP.jl b/src/DSP.jl
index 57e68ba58..da0eb5075 100644
--- a/src/DSP.jl
+++ b/src/DSP.jl
@@ -2,6 +2,7 @@ module DSP
 
 using FFTW
 using LinearAlgebra: mul!, rmul!
+using IterTools: subsets
 
 export conv, conv2, deconv, filt, filt!, xcorr
 
diff --git a/src/Filters/Filters.jl b/src/Filters/Filters.jl
index 8bdec3c40..98805f371 100644
--- a/src/Filters/Filters.jl
+++ b/src/Filters/Filters.jl
@@ -4,7 +4,7 @@ using Polynomials, ..Util
 import Base: *
 using LinearAlgebra: I, mul!, rmul!
 using Statistics: middle
-import ..DSP: filt, filt!, SMALL_FILT_CUTOFF
+import ..DSP: filt, filt!, optimalfftfiltlength, os_fft_complexity, SMALL_FILT_CUTOFF
 using FFTW
 
 include("coefficients.jl")
diff --git a/src/Filters/filt.jl b/src/Filters/filt.jl
index 153048f8b..f032efb22 100644
--- a/src/Filters/filt.jl
+++ b/src/Filters/filt.jl
@@ -414,32 +414,6 @@ filt(h::AbstractArray, x::AbstractArray) =
 # fftfilt and filt
 #
 
-# Rough estimate of number of multiplications per output sample
-os_fft_complexity(nfft, nb) =  (nfft * log2(nfft) + nfft) / (nfft - nb + 1)
-
-# Determine optimal length of the FFT for fftfilt
-function optimalfftfiltlength(nb, nx)
-    first_pow2 = ceil(Int, log2(nb))
-    last_pow2 = ceil(Int, log2(nx + nb - 1))
-    last_complexity = os_fft_complexity(2 ^ first_pow2, nb)
-    pow2 = first_pow2 + 1
-    while pow2 <= last_pow2
-        new_complexity = os_fft_complexity(2 ^ pow2, nb)
-        new_complexity > last_complexity && break
-        last_complexity = new_complexity
-        pow2 += 1
-    end
-    nfft = pow2 > last_pow2 ? 2 ^ last_pow2 : 2 ^ (pow2 - 1)
-
-    L = nfft - nb + 1
-    if L > nx
-        # If L > nx, better to find next fast power
-        nfft = nextfastfft(nx + nb - 1)
-    end
-
-    nfft
-end
-
 """
     fftfilt(h, x)
 
diff --git a/src/dspbase.jl b/src/dspbase.jl
index 4ecadd3b4..c73efc14c 100644
--- a/src/dspbase.jl
+++ b/src/dspbase.jl
@@ -1,6 +1,6 @@
 # This file was formerly a part of Julia. License is MIT: https://julialang.org/license
 
-import Base.trailingsize
+import Base: trailingsize
 import LinearAlgebra.BLAS
 
 const SMALL_FILT_CUTOFF = 58
@@ -170,138 +170,533 @@ function deconv(b::StridedVector{T}, a::StridedVector{T}) where T
     filt(b, a, x)
 end
 
-function _zeropad!(
-    padded::AbstractVector, u::AbstractVector, ::Tuple{Base.OneTo{Int}}
+
+"""
+    _zeropad!(padded::AbstractVector,
+              u::AbstractVector,
+              data_dest::Tuple = (1,),
+              data_region = CartesianIndices(u),
+              ::Tuple{Base.OneTo{Int}})
+
+Place the portion of `u` specified by `data_region` into `padded` starting at
+location `data_dest`, and set the rest of `padded` to zero. This will mutate
+`padded`.
+
+This method handles the case where `padded` uses 1-based indexing, but `u` can
+have arbitrary axes.
+"""
+@inline function _zeropad!(
+    padded::AbstractVector,
+    u::AbstractVector,
+    padded_axes = axes(padded),
+    data_dest::Tuple = (first(padded_axes[1]),),
+    data_region = CartesianIndices(u),
 )
-    datasize = length(u)
+    datasize = length(data_region)
     # Use axes to accommodate arrays that do not start at index 1
-    padsize = length(padded)
-    data_first_i = first(axes(u, 1))
-    copyto!(padded, 1, u, data_first_i, datasize)
-    padded[1 + datasize:padsize] .= 0
+    data_first_i = first(data_region)[1]
+    dest_first_i = data_dest[1]
+    copyto!(padded, dest_first_i, u, data_first_i, datasize)
+    padded[first(padded_axes[1]):dest_first_i - 1] .= 0
+    padded[dest_first_i + datasize : end] .= 0
 
     padded
 end
 
-function _zeropad!(
-    padded::AbstractVector, u::AbstractVector, pad_axes
+@inline function _zeropad!(
+    padded::AbstractArray,
+    u::AbstractArray,
+    padded_axes = axes(padded),
+    data_dest::Tuple = first.(padded_axes),
+    data_region = CartesianIndices(u),
 )
-    datasize = length(u)
-    # Use axes to accommodate arrays that do not start at index 1
-    data_first_i = first(axes(u, 1))
-    pad_first_i = first(pad_axes[1])
-    copyto!(padded, pad_first_i, u, data_first_i, datasize)
-    padded[pad_first_i + datasize : last(pad_axes[1])] .= 0
+    fill!(padded, zero(eltype(padded)))
+    dest_axes = UnitRange.(data_dest, data_dest .+ size(data_region) .- 1)
+    dest_region = CartesianIndices(dest_axes)
+    copyto!(padded, dest_region, u, data_region)
 
     padded
 end
 
-function _zeropad!(
-    padded::AbstractArray{<:Any, N},
-    u::AbstractArray{<:Any, N},
-    ::NTuple{<:Any, Base.OneTo{Int}}
-) where N
-    # Copy the data to the beginning of the padded array
-    fill!(padded, zero(eltype(padded)))
-    pad_data_ranges = UnitRange.(1, size(u))
-    copyto!(padded, CartesianIndices(pad_data_ranges), u, CartesianIndices(u))
+"""
+    _zeropad(u, padded_size, [data_dest, data_region])
 
-    padded
+Creates and returns a new base-1 index array of size `padded_size`, with the
+section of `u` specified by `data_region` copied into the region of the new
+ array as specified by `data_dest`. All other values will be initialized to
+ zero.
+
+If either `data_dest` or `data_region` is not specified, then the defaults
+described in [`_zeropad!`](@ref) will be used.
+"""
+function _zeropad(u, padded_size, args...)
+    padded = similar(u, padded_size)
+    _zeropad!(padded, u, axes(padded), args...)
 end
 
-function _zeropad!(
-    padded::AbstractArray{<:Any, N},
-    u::AbstractArray{<:Any, N},
-    pad_axes
-) where N
-    # Copy the data to the beginning of the padded array
-    fill!(padded, zero(eltype(padded)))
-    pad_first_i = first.(pad_axes)
-    pad_data_ranges = UnitRange.(pad_first_i, pad_first_i .+ size(u) .- 1)
-    copyto!(padded, CartesianIndices(pad_data_ranges), u, CartesianIndices(u))
+function _zeropad_keep_offset(u, padded_size, u_axes, args...)
+    ax_starts = first.(u_axes)
+    new_axes = UnitRange.(ax_starts, ax_starts .+ padded_size .- 1)
+    _zeropad!(similar(u, new_axes), u, args...)
+end
 
-    padded
+function _zeropad_keep_offset(
+    u, padded_size, ::NTuple{<:Any, Base.OneTo{Int}}, args...
+)
+    _zeropad(u, padded_size, args...)
+end
+
+"""
+    _zeropad_keep_offset(u, padded_size, [data_dest, dat_region])
+
+Like [`_zeropad`](@ref), but retains the first index of `u` when creating a new
+array.
+"""
+function _zeropad_keep_offset(u, padded_size, args...)
+    _zeropad_keep_offset(u, padded_size, axes(u), args...)
+end
+
+"""
+Estimate the number of floating point multiplications per output sample for an
+overlap-save algorithm with fft size `nfft`, and filter size `nb`.
+"""
+os_fft_complexity(nfft, nb) =  (nfft * log2(nfft) + nfft) / (nfft - nb + 1)
+
+"""
+Determine the length of FFT that minimizes the number of multiplications per
+output sample for an overlap-save convolution of vectors of size `nb` and `nx`.
+"""
+function optimalfftfiltlength(nb, nx)
+    nfull = nb + nx - 1
+
+    # Step through possible nffts and find the nfft that minimizes complexity
+    # Assumes that complexity is convex
+    first_pow2 = ceil(Int, log2(nb))
+    prev_pow2 = ceil(Int, log2(nfull))
+    prev_complexity = os_fft_complexity(2 ^ first_pow2, nb)
+    pow2 = first_pow2 + 1
+    while pow2 <= prev_pow2
+        new_complexity = os_fft_complexity(2 ^ pow2, nb)
+        new_complexity > prev_complexity && break
+        prev_complexity = new_complexity
+        pow2 += 1
+    end
+    nfft = pow2 > prev_pow2 ? 2 ^ prev_pow2 : 2 ^ (pow2 - 1)
+
+    # L is the number of usable samples produced by each block
+    L = nfft - nb + 1
+    if L > nx
+        # If L > nx, better to find next fast power
+        nfft = nextfastfft(nfull)
+    end
+
+    nfft
 end
 
-_zeropad!(padded, u) = _zeropad!(padded, u, axes(padded))
+"""
+Prepare buffers and FFTW plans for convolution. The two buffers, tdbuff and
+fdbuff may be an alias of each other, because complex convolution only requires
+one buffer. The plans are mutating where possible, and the inverse plan is
+unnormalized.
+"""
+@inline function os_prepare_conv(u::AbstractArray{T, N},
+                                 nffts) where {T<:Real, N}
+    tdbuff = similar(u, nffts)
+    bufsize = ntuple(i -> i == 1 ? nffts[i] >> 1 + 1 : nffts[i], N)
+    fdbuff = similar(u, Complex{T}, NTuple{N, Int}(bufsize))
 
-function _zeropad(u, padded_size)
-    _zeropad!(similar(u, padded_size), u)
+    p = plan_rfft(tdbuff)
+    ip = plan_brfft(fdbuff, nffts[1])
+
+    tdbuff, fdbuff, p, ip
 end
 
-function _zeropad_keep_offset(u, padded_size, ::NTuple{<:Any, Base.OneTo{Int}})
-    _zeropad(u, padded_size)
+@inline function os_prepare_conv(u::AbstractArray{<:Complex}, nffts)
+    buff = similar(u, nffts)
+
+    p = plan_fft!(buff)
+    ip = plan_bfft!(buff)
+
+    buff, buff, p, ip # Only one buffer for complex
 end
 
-function _zeropad_keep_offset(u, padded_size, axes_u)
-    ax_starts = first.(axes_u)
-    new_axes = UnitRange.(ax_starts, ax_starts .+ padded_size .- 1)
-    _zeropad!(similar(u, new_axes), u, new_axes)
+"""
+Transform the smaller convolution input to frequency domain, and return it in a
+new array. However, the contents of `buff` may be modified.
+"""
+@inline function os_filter_transform!(buff::AbstractArray{<:Real}, p)
+    p * buff
 end
 
-_zeropad_keep_offset(u, padded_size) = _zeropad_keep_offset(
-    u, padded_size, axes(u)
-)
+@inline function os_filter_transform!(buff::AbstractArray{<:Complex}, p!)
+    copy(p! * buff) # p operates in place on buff
+end
+
+"""
+Take a block of data, and convolve it with the smaller convolution input. This
+may modify the contents of `tdbuff` and `fdbuff`, and the result will be in
+`tdbuff`.
+"""
+@inline function os_conv_block!(tdbuff::AbstractArray{<:Real},
+                                fdbuff::AbstractArray,
+                                filter_fd,
+                                p,
+                                ip)
+    mul!(fdbuff, p, tdbuff)
+    fdbuff .*= filter_fd
+    mul!(tdbuff, ip, fdbuff)
+end
+
+"Like the real version, but only operates on one buffer"
+@inline function os_conv_block!(buff::AbstractArray{<:Complex},
+                                ::AbstractArray, # Only one buffer for complex
+                                filter_fd,
+                                p!,
+                                ip!)
+    p! * buff # p! operates in place on buff
+    buff .*= filter_fd
+    ip! * buff # ip! operates in place on buff
+end
+
+# Used by `unsafe_conv_kern_os!` to handle blocks of input data that need to be padded.
+#
+# For a given number of edge dimensions, convolve all regions along the
+# perimeter that have that number of edge dimensions
+#
+# For a 3d cube, if n_edges = 1, this means all faces. If n_edges = 2, then
+# all edges. Finally, if n_edges = 3, then all corners.
+#
+# This needs to be a separate function for subsets to generate tuple elements,
+# which is only the case if the number of edges is a Val{n} type. Iterating over
+# the number of edges with Val{n} is inherently type unstable, so this function
+# boundary allows dispatch to make efficient code for each number of edge
+# dimensions.
+function unsafe_conv_kern_os_edge!(
+    # These arrays and buffers will be mutated
+    out::AbstractArray{<:Any, N},
+    tdbuff,
+    fdbuff,
+    perimeter_range,
+    # Number of edge dimensions to pad and convolve
+    n_edges::Val,
+    # Data to be convolved
+    u,
+    filter_fd,
+    # FFTW plans
+    p,
+    ip,
+    # Sizes, ranges, and other pre-calculated constants
+    #
+    ## ranges listing center and edge blocks
+    edge_ranges,
+    center_block_ranges,
+    ## size and axis information
+    all_dims, # 1:N
+    su,
+    u_start,
+    sv,
+    nffts,
+    out_start,
+    out_stop,
+    save_blocksize,
+    sout_deficit, # How many output samples are missing if nffts > sout
+    tdbuff_axes,
+) where N
+    # Iterate over all possible combinations of edge dimensions for a number of
+    # edges
+    #
+    # For a 3d cube with n_edges = 1, this will specify the top and bottom faces
+    # (edge_dims = (1,)), then the left and right faces (edge_dims = (2,)), then
+    # the front and back faces (edge_dims = (3,))
+    for edge_dims in subsets(all_dims, n_edges)
+        # Specify a region on the perimeter by combining an edge block index for
+        # each dimension on an edge, and the central blocks for dimensions not
+        # on an edge.
+        #
+        # First make all entries equal to the center blocks:
+        @inbounds copyto!(perimeter_range, 1, center_block_ranges, 1, N)
+
+        # For the dimensions chosen to be on an edge (edge_dims), get the
+        # ranges of the blocks that would need to be padded (lie on an edge)
+        # in that dimension.
+        #
+        # There can be one to two such ranges for each dimension, because with
+        # some inputs sizes the whole convolution is just one range
+        # (one edge block), or the padding will be needed at both the leading
+        # and trailing side of that dimension
+        selected_edge_ranges = getindex.(Ref(edge_ranges), edge_dims)
+
+        # Visit each combination of edge ranges for the edge dimensions chosen.
+        # For a 3d cube with n_edges = 1 and edge_dims = (1,), this will visit
+        # the top face, and then the bottom face.
+        for perimeter_edge_ranges in Iterators.ProductIterator(selected_edge_ranges)
+            # The center region for non-edge dimensions has been specified above,
+            # so finish specifying the region of the perimeter for this edge
+            # block
+            @inbounds for (i, dim) in enumerate(edge_dims)
+                perimeter_range[dim] = perimeter_edge_ranges[i]
+            end
+
+            # Region of block indices, not data indices!
+            block_region = CartesianIndices(
+                NTuple{N, UnitRange{Int}}(perimeter_range)
+            )
+            @inbounds for block_pos in block_region
+                # Figure out which portion of the input data should be transformed
+
+                block_idx = convert(NTuple{N, Int}, block_pos)
+                ## data_offset is NOT the beginning of the region that will be
+                ## convolved, but is instead the beginning of the unaliased data.
+                data_offset = save_blocksize .* (block_idx .- 1)
+                ## How many zeros will need to be added before the data
+                pad_before = max.(0, sv .- data_offset .- 1)
+                data_ideal_stop = data_offset .+ save_blocksize
+                ## How many zeros will need to be added after the data
+                pad_after = max.(0, data_ideal_stop .- su)
+
+                ## Data indices, not block indices
+                data_region = CartesianIndices(
+                    UnitRange.(
+                        u_start .+ data_offset .- sv .+ pad_before .+ 1,
+                        u_start .+ data_ideal_stop .- pad_after .- 1
+                    )
+                )
+
+                # Convolve portion of input
+
+                _zeropad!(tdbuff, u, tdbuff_axes, pad_before .+ 1, data_region)
+                os_conv_block!(tdbuff, fdbuff, filter_fd, p, ip)
+
+                # Save convolved result to output
+
+                block_out_stop = min.(
+                    out_start .+ data_offset .+ save_blocksize .- 1,
+                    out_stop
+                )
+                block_out_region = CartesianIndices(
+                    UnitRange.(out_start .+ data_offset, block_out_stop)
+                )
+                ## If the input could not fill tdbuff, account for that before
+                ## copying the convolution result to the output
+                u_deficit = max.(0, pad_after .- sv .+ 1)
+                valid_buff_region = CartesianIndices(
+                    UnitRange.(sv, nffts .- u_deficit .- sout_deficit)
+                )
+                copyto!(out, block_out_region, tdbuff, valid_buff_region)
+            end
+        end
+    end
+end
 
-function _conv(
-    u::AbstractArray{T, N}, v::AbstractArray{T, N}, paddims
-) where {T<:Real, N}
-    padded = _zeropad(u, paddims)
+# Assumes u is larger than, or the same size as, v
+function unsafe_conv_kern_os!(out,
+                        u::AbstractArray{<:Any, N},
+                        v,
+                        su,
+                        sv,
+                        sout,
+                        nffts) where N
+    u_start = first.(axes(u))
+    out_axes = axes(out)
+    out_start = first.(out_axes)
+    out_stop = last.(out_axes)
+    ideal_save_blocksize = nffts .- sv .+ 1
+    # Number of samples that are "missing" if the valid portion of the
+    # convolution is smaller than the output
+    sout_deficit = max.(0, ideal_save_blocksize .- sout)
+    # Size of the valid portion of the convolution result
+    save_blocksize = ideal_save_blocksize .- sout_deficit
+    nblocks = cld.(sout, save_blocksize)
+
+    # Pre-allocation
+    tdbuff, fdbuff, p, ip = os_prepare_conv(u, nffts)
+    tdbuff_axes = axes(tdbuff)
+
+    # Transform the smaller filter
+    _zeropad!(tdbuff, v)
+    filter_fd = os_filter_transform!(tdbuff, p)
+    filter_fd .*= 1 / prod(nffts) # Normalize once for brfft
+
+    last_full_blocks = fld.(su, save_blocksize)
+    # block indices for center blocks, which need no padding
+    center_block_ranges = UnitRange.(2, last_full_blocks)
+    # block index ranges for blocks that need to be padded
+    # Corresponds to the leading and trailing side of a dimension, or if there
+    # are no center blocks, corresponds to the whole dimension
+    edge_ranges = map(nblocks, last_full_blocks) do nblock, lastfull
+        lastfull > 1 ? [1:1, lastfull + 1 : nblock] : [1:nblock]
+    end
+    all_dims = 1:N
+    # Buffer to store ranges of indices for a single region of the perimeter
+    perimeter_range = Vector{UnitRange{Int}}(undef, N)
+
+    # Convolve all blocks that require padding.
+    #
+    # This is accomplished by dividing the perimeter of the volume into
+    # subsections, where the division is done by the number of edge dimensions.
+    # For a 3d cube, this convolves the perimeter in the following order:
+    #
+    # Number of Edge Dimensions | Convolved Region
+    # --------------------------+-----------------
+    #                         1 | Faces of Cube
+    #                         2 | Edges of Cube
+    #                         3 | Corners of Cube
+    #
+    for n_edges in all_dims
+        unsafe_conv_kern_os_edge!(
+            # These arrays and buffers will be mutated
+            out,
+            tdbuff,
+            fdbuff,
+            perimeter_range,
+            # Number of edge dimensions to pad and convolve
+            Val(n_edges),
+            # Data to be convolved
+            u,
+            filter_fd,
+            # FFTW plans
+            p,
+            ip,
+            # Sizes, ranges, and other pre-calculated constants
+            #
+            ## ranges listing center and edge blocks
+            edge_ranges,
+            center_block_ranges,
+            ## size and axis information
+            all_dims, # 1:N
+            su,
+            u_start,
+            sv,
+            nffts,
+            out_start,
+            out_stop,
+            save_blocksize,
+            sout_deficit,
+            tdbuff_axes) # How many output samples are missing if nffts > sout
+    end
+
+    tdbuff_region = CartesianIndices(tdbuff)
+    # Portion of buffer with valid result of convolution
+    valid_buff_region = CartesianIndices(UnitRange.(sv, nffts))
+    # Iterate over block indices (not data indices) that do not need to be padded
+    @inbounds for block_pos in CartesianIndices(center_block_ranges)
+        # Calculate portion of data to transform
+
+        block_idx = convert(NTuple{N, Int}, block_pos)
+        ## data_offset is NOT the beginning of the region that will be
+        ## convolved, but is instead the beginning of the unaliased data.
+        data_offset = save_blocksize .* (block_idx .- 1)
+        data_stop = data_offset .+ save_blocksize
+        data_region = CartesianIndices(
+            UnitRange.(u_start .+ data_offset .- sv .+ 1, u_start .+ data_stop .- 1)
+        )
+
+        # Convolve this portion of the data
+
+        copyto!(tdbuff, tdbuff_region, u, data_region)
+        os_conv_block!(tdbuff, fdbuff, filter_fd, p, ip)
+
+        # Save convolved result to output
+
+        block_out_region = CartesianIndices(
+            UnitRange.(data_offset .+ out_start, data_stop .+ out_start .- 1)
+        )
+        copyto!(out, block_out_region, tdbuff, valid_buff_region)
+    end
+
+    out
+end
+
+function _conv_kern_fft!(out,
+                         u::AbstractArray{T, N},
+                         v::AbstractArray{T, N},
+                         su,
+                         sv,
+                         outsize,
+                         nffts) where {T<:Real, N}
+    padded = _zeropad(u, nffts)
     p = plan_rfft(padded)
     uf = p * padded
     _zeropad!(padded, v)
     vf = p * padded
     uf .*= vf
-    irfft(uf, paddims[1])
+    raw_out = irfft(uf, nffts[1])
+    copyto!(out,
+            CartesianIndices(out),
+            raw_out,
+            CartesianIndices(UnitRange.(1, outsize)))
 end
-function _conv(u, v, paddims)
-    upad = _zeropad(u, paddims)
-    vpad = _zeropad(v, paddims)
+function _conv_kern_fft!(out, u, v, su, sv, outsize, nffts)
+    upad = _zeropad(u, nffts)
+    vpad = _zeropad(v, nffts)
     p! = plan_fft!(upad)
     p! * upad # Operates in place on upad
     p! * vpad
     upad .*= vpad
     ifft!(upad)
+    copyto!(out,
+            CartesianIndices(out),
+            upad,
+            CartesianIndices(UnitRange.(1, outsize)))
 end
 
-function _conv_clip!(
-    y::AbstractVector,
-    minpad,
-    ::NTuple{<:Any, Base.OneTo{Int}},
-    ::NTuple{<:Any, Base.OneTo{Int}}
-)
-    sizehint!(resize!(y, minpad[1]), minpad[1])
-end
-function _conv_clip!(
-    y::AbstractArray,
-    minpad,
-    ::NTuple{<:Any, Base.OneTo{Int}},
-    ::NTuple{<:Any, Base.OneTo{Int}}
-)
-    y[CartesianIndices(minpad)]
+# v should be smaller than u for good performance
+function _conv_fft!(out, u, v, su, sv, outsize)
+    os_nffts = map(optimalfftfiltlength, sv, su)
+    if any(os_nffts .< outsize)
+        unsafe_conv_kern_os!(out, u, v, su, sv, outsize, os_nffts)
+    else
+        nffts = nextfastfft(outsize)
+        _conv_kern_fft!(out, u, v, su, sv, outsize, nffts)
+    end
 end
+
+
 # For arrays with weird offsets
-function _conv_clip!(y::AbstractArray, minpad, axesu, axesv)
+function _conv_similar(u, outsize, axesu, axesv)
     out_offsets = first.(axesu) .+ first.(axesv)
-    out_axes = UnitRange.(out_offsets, out_offsets .+ minpad .- 1)
-    out = similar(y, out_axes)
-    copyto!(out, CartesianIndices(out), y, CartesianIndices(UnitRange.(1, minpad)))
+    out_axes = UnitRange.(out_offsets, out_offsets .+ outsize .- 1)
+    similar(u, out_axes)
+end
+function _conv_similar(
+    u, outsize, ::NTuple{<:Any, Base.OneTo{Int}}, ::NTuple{<:Any, Base.OneTo{Int}}
+)
+    similar(u, outsize)
+end
+_conv_similar(u, v, outsize) = _conv_similar(u, outsize, axes(u), axes(v))
+
+# Does convolution, will not switch argument order
+function _conv!(out, u, v, su, sv, outsize)
+    # TODO: Add spatial / time domain algorithm
+    _conv_fft!(out, u, v, su, sv, outsize)
 end
 
+# Does convolution, will not switch argument order
+function _conv(u, v, su, sv)
+    outsize = su .+ sv .- 1
+    out = _conv_similar(u, v, outsize)
+    _conv!(out, u, v, su, sv, outsize)
+end
+
+# May switch argument order
 """
     conv(u,v)
 
-Convolution of two arrays. Uses FFT algorithm.
+Convolution of two arrays. Uses either FFT convolution or overlap-save,
+depending on the size of the input. `u` and `v` can be  N-dimensional arrays,
+with arbitrary indexing offsets, but their axes must be a `UnitRange`.
 """
 function conv(u::AbstractArray{T, N},
               v::AbstractArray{T, N}) where {T<:BLAS.BlasFloat, N}
     su = size(u)
     sv = size(v)
-    minpad = su .+ sv .- 1
-    padsize = map(n -> n > 1024 ? nextprod([2,3,5], n) : nextpow(2, n), minpad)
-    y = _conv(u, v, padsize)
-    _conv_clip!(y, minpad, axes(u), axes(v))
+    if prod(su) >= prod(sv)
+        _conv(u, v, su, sv)
+    else
+        _conv(v, u, sv, su)
+    end
 end
+
 function conv(u::AbstractArray{<:BLAS.BlasFloat, N},
               v::AbstractArray{<:BLAS.BlasFloat, N}) where N
     fu, fv = promote(u, v)
diff --git a/src/util.jl b/src/util.jl
index c08803f6d..1fffd6f01 100644
--- a/src/util.jl
+++ b/src/util.jl
@@ -118,8 +118,8 @@ computes Fourier transforms of input that is a product of these
 sizes faster than for input of other sizes.
 """
 nextfastfft(n) = nextprod(FAST_FFT_SIZES, n)
-nextfastfft(n1, n2...) = tuple(nextfastfft(n1), nextfastfft(n2...)...)
-nextfastfft(n::Tuple) = nextfastfft(n...)
+nextfastfft(ns::Tuple) = nextfastfft.(ns)
+nextfastfft(ns...) = nextfastfft(ns)
 
 
 ## COMMON DSP TOOLS
diff --git a/test/dsp.jl b/test/dsp.jl
index 4522fe0ec..2f8600154 100644
--- a/test/dsp.jl
+++ b/test/dsp.jl
@@ -2,6 +2,8 @@
 # TODO: parameterize conv tests
 using Test, DSP, OffsetArrays
 import DSP: filt, filt!, deconv, conv, xcorr
+using DSP: optimalfftfiltlength, unsafe_conv_kern_os!, _conv_kern_fft!, _conv_similar,
+    nextfastfft
 
 
 
@@ -34,6 +36,8 @@ end
 
 @testset "conv" begin
 
+    @test optimalfftfiltlength(3, 1) == 3 # Should be at least the first input
+
     @testset "conv-1D" begin
         # Convolution
         a = [1, 2, 1, 2]
@@ -169,6 +173,56 @@ end
         fb = convert(Array{Float64}, b)
         @test conv(a, fb) ≈ convert(Array{Float64}, exp)
     end
+
+    @testset "Overlap-Save" begin
+        to_sizetuple = (diml, N) -> ntuple(_ -> diml, N)
+        function os_test_data(eltype, diml, N)
+            s = to_sizetuple(diml, N)
+            arr = rand(eltype, s)
+            s, arr
+        end
+        function test_os(eltype, nu, nv, N, nfft)
+            nffts = to_sizetuple(nfft, N)
+            su, u = os_test_data(eltype, nu, N)
+            sv, v = os_test_data(eltype, nv, N)
+            sout = su .+ sv .- 1
+            out = _conv_similar(u, sout, axes(u), axes(v))
+            unsafe_conv_kern_os!(out, u, v, su, sv, sout, nffts)
+            os_out = copy(out)
+            fft_nfft = nextfastfft(sout)
+            _conv_kern_fft!(out, u, v, su, sv, sout, fft_nfft)
+            @test out ≈ os_out
+        end
+        Ns = [1, 2, 3]
+        eltypes = [Float32, Float64, Complex{Float64}]
+        nlarge = 128
+
+        regular_nsmall = [12, 128]
+        for numdim in Ns
+            for elt in eltypes
+                for nsmall in regular_nsmall
+                    nfft = optimalfftfiltlength(nlarge, nsmall)
+                    test_os(elt, nlarge, nsmall, Val{numdim}(), nfft)
+                end
+            end
+        end
+
+        # small = 12, fft = 256 exercises output being smaller than the normal
+        # valid region of OS convolution block
+        #
+        # small = 13, fft = 32 makes sout evenly divisble by nfft - sv + 1
+        # small = 12, fft = 32 does not
+        adversarial_nsmall_nfft = [(12, 256), (13, 32), (12, 32)]
+        for (nsmall, nfft) in adversarial_nsmall_nfft
+            test_os(Float64, nlarge, nsmall, Val{1}(), nfft)
+        end
+
+        # Should not bug:
+        conv(zeros(4, 7, 1), zeros(3, 3, 3))
+
+        # three blocks need to be padded in the following case:
+        test_os(Float64, 25, 4, Val{1}(), 16)
+    end
 end
 
 @testset "xcorr" begin

From 9dd643d70f5cbd952abe95b1258eaa16a065b733 Mon Sep 17 00:00:00 2001
From: Galen Lynch <galen.lynch@gmail.com>
Date: Mon, 18 Nov 2019 10:07:35 -0500
Subject: [PATCH 18/19] Simplify correction of using a larger FFT than
 necessary in conv

This commit simplifies the correction of using a larger fft than necessary while
doing convolution, as per @HDictus' suggestion. This can occur in
`optimalfftfiltlength`, which searches `nfft` over powers of two to minimized
the total complexity of the convolution. However, when `nfft` is larger than the
full convolution, it makes sense to consider other fast powers. This condition
was previously detected based on the output of a fft convolution being larger
than the input. However, in this same condition, `nfft` is also larger than the
full convolution size, `nfull`. Because `nfft` and `nfull` are already computed,
it is faster to not calculate the output size to detect this condition.
---
 src/dspbase.jl | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/dspbase.jl b/src/dspbase.jl
index c73efc14c..546103533 100644
--- a/src/dspbase.jl
+++ b/src/dspbase.jl
@@ -283,10 +283,8 @@ function optimalfftfiltlength(nb, nx)
     end
     nfft = pow2 > prev_pow2 ? 2 ^ prev_pow2 : 2 ^ (pow2 - 1)
 
-    # L is the number of usable samples produced by each block
-    L = nfft - nb + 1
-    if L > nx
-        # If L > nx, better to find next fast power
+    if nfft > nfull
+        # If nfft > nfull, then it's better to find next fast power
         nfft = nextfastfft(nfull)
     end
 

From f6c051d562dd7422c52f9628fccf391c6413723b Mon Sep 17 00:00:00 2001
From: Galen Lynch <galen.lynch@gmail.com>
Date: Thu, 2 Jan 2020 18:46:22 -0500
Subject: [PATCH 19/19] Address overlap-save review comments

Some old comments by @ssfrr had not been addressed, mostly relating to
documentation matching the code. One more substantial comment was making the
overlap save algorithm work in situations where `nffts < 2 * sv -1`, in which
case more than the first "block" will require padding. I have addressed these
comments with this commit. Interestingly, this will also eliminate situations
where no blocks require padding (e.g. if `nv = 1` in a dimension).

In addition, I have changed some variable names in `optimalfftfiltlength` for
clarity, and also changed some tests which were using the old interface, which
used to have `nb` and `nx` reversed.
---
 src/dspbase.jl | 33 +++++++++++++++++----------------
 test/dsp.jl    |  4 ++--
 2 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/src/dspbase.jl b/src/dspbase.jl
index 546103533..2b0d9aac7 100644
--- a/src/dspbase.jl
+++ b/src/dspbase.jl
@@ -174,16 +174,14 @@ end
 """
     _zeropad!(padded::AbstractVector,
               u::AbstractVector,
+              padded_axes = axes(padded),
               data_dest::Tuple = (1,),
-              data_region = CartesianIndices(u),
-              ::Tuple{Base.OneTo{Int}})
+              data_region = CartesianIndices(u))
 
-Place the portion of `u` specified by `data_region` into `padded` starting at
-location `data_dest`, and set the rest of `padded` to zero. This will mutate
-`padded`.
+Place the portion of `u` specified by `data_region` into `padded`, starting at
+location `data_dest`. Sets the rest of `padded` to zero. This will mutate
+`padded`. `padded_axes` must correspond to the axes of `padded`.
 
-This method handles the case where `padded` uses 1-based indexing, but `u` can
-have arbitrary axes.
 """
 @inline function _zeropad!(
     padded::AbstractVector,
@@ -272,16 +270,16 @@ function optimalfftfiltlength(nb, nx)
     # Step through possible nffts and find the nfft that minimizes complexity
     # Assumes that complexity is convex
     first_pow2 = ceil(Int, log2(nb))
-    prev_pow2 = ceil(Int, log2(nfull))
+    max_pow2 = ceil(Int, log2(nfull))
     prev_complexity = os_fft_complexity(2 ^ first_pow2, nb)
     pow2 = first_pow2 + 1
-    while pow2 <= prev_pow2
+    while pow2 <= max_pow2
         new_complexity = os_fft_complexity(2 ^ pow2, nb)
         new_complexity > prev_complexity && break
         prev_complexity = new_complexity
         pow2 += 1
     end
-    nfft = pow2 > prev_pow2 ? 2 ^ prev_pow2 : 2 ^ (pow2 - 1)
+    nfft = pow2 > max_pow2 ? 2 ^ max_pow2 : 2 ^ (pow2 - 1)
 
     if nfft > nfull
         # If nfft > nfull, then it's better to find next fast power
@@ -487,6 +485,7 @@ function unsafe_conv_kern_os_edge!(
 end
 
 # Assumes u is larger than, or the same size as, v
+# nfft should be greater than or equal to 2*sv-1
 function unsafe_conv_kern_os!(out,
                         u::AbstractArray{<:Any, N},
                         v,
@@ -499,8 +498,8 @@ function unsafe_conv_kern_os!(out,
     out_start = first.(out_axes)
     out_stop = last.(out_axes)
     ideal_save_blocksize = nffts .- sv .+ 1
-    # Number of samples that are "missing" if the valid portion of the
-    # convolution is smaller than the output
+    # Number of samples that are "missing" if the output is smaller than the
+    # valid portion of the convolution
     sout_deficit = max.(0, ideal_save_blocksize .- sout)
     # Size of the valid portion of the convolution result
     save_blocksize = ideal_save_blocksize .- sout_deficit
@@ -515,14 +514,16 @@ function unsafe_conv_kern_os!(out,
     filter_fd = os_filter_transform!(tdbuff, p)
     filter_fd .*= 1 / prod(nffts) # Normalize once for brfft
 
-    last_full_blocks = fld.(su, save_blocksize)
     # block indices for center blocks, which need no padding
-    center_block_ranges = UnitRange.(2, last_full_blocks)
+    first_center_blocks = cld.(sv .- 1, save_blocksize) .+ 1
+    last_center_blocks = fld.(su, save_blocksize)
+    center_block_ranges = UnitRange.(first_center_blocks, last_center_blocks)
+
     # block index ranges for blocks that need to be padded
     # Corresponds to the leading and trailing side of a dimension, or if there
     # are no center blocks, corresponds to the whole dimension
-    edge_ranges = map(nblocks, last_full_blocks) do nblock, lastfull
-        lastfull > 1 ? [1:1, lastfull + 1 : nblock] : [1:nblock]
+    edge_ranges = map(nblocks, first_center_blocks, last_center_blocks) do nblock, firstfull, lastfull
+        lastfull > 1 ? [1:firstfull - 1, lastfull + 1 : nblock] : [1:nblock]
     end
     all_dims = 1:N
     # Buffer to store ranges of indices for a single region of the perimeter
diff --git a/test/dsp.jl b/test/dsp.jl
index 2f8600154..5fd8d7a19 100644
--- a/test/dsp.jl
+++ b/test/dsp.jl
@@ -36,7 +36,7 @@ end
 
 @testset "conv" begin
 
-    @test optimalfftfiltlength(3, 1) == 3 # Should be at least the first input
+    @test optimalfftfiltlength(1, 3) == 1 # Should be at least the first input
 
     @testset "conv-1D" begin
         # Convolution
@@ -201,7 +201,7 @@ end
         for numdim in Ns
             for elt in eltypes
                 for nsmall in regular_nsmall
-                    nfft = optimalfftfiltlength(nlarge, nsmall)
+                    nfft = optimalfftfiltlength(nsmall, nlarge)
                     test_os(elt, nlarge, nsmall, Val{numdim}(), nfft)
                 end
             end