Skip to content

Commit

Permalink
Re-enable SIMD
Browse files Browse the repository at this point in the history
Use a simpler SIMD algorithm than the one from ScanByte. This will give worse
performance, but still better than not using SIMD.
Initial benchmarks suggest a 2x performance increase in the best case.
  • Loading branch information
jakobnissen committed Oct 18, 2024
1 parent ee6f41b commit 3ab134c
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 49 deletions.
4 changes: 3 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@ version = "1.0.4"

[deps]
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
SIMD = "fdea26ae-647d-5447-a871-4b548cad5224"
TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"

[compat]
julia = "1.6"
PrecompileTools = "1"
SIMD = "3.6.0"
TranscodingStreams = "0.9, 0.10, 0.11"
julia = "1.6"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Expand Down
1 change: 1 addition & 0 deletions src/Automa.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
module Automa

using TranscodingStreams: TranscodingStreams, TranscodingStream, NoopStream
using SIMD: SIMD, Vec, vload

include("byteset.jl")

Expand Down
82 changes: 34 additions & 48 deletions src/codegen.jl
Original file line number Diff line number Diff line change
Expand Up @@ -472,28 +472,12 @@ function generate_goto_code(ctx::CodeGenContext, machine::Machine, actions::Dict
# This can be effectively SIMDd
# If such an edge is detected, we treat it specially with code here, and leave the
# non-SIMDable edges for below

# SIMD code temporarily disabled.
simd, non_simd = peel_simd_edge(s)
if simd !== nothing
push!(non_simd, (simd, s))
end
simd = nothing
simd_code = :()

#=
simd_code = if simd !== nothing
quote
$(generate_simd_loop(ctx, simd.labels))
if $(ctx.vars.p) > $(ctx.vars.p_end)
$(ctx.vars.cs) = $(s.state)
@goto exit
end
end
generate_simd_loop(ctx, simd.labels, s.state)
else
:()
end
=#

# If no inputs match, then we set cs = -cs to signal error, and go to exit
default = :($(ctx.vars.cs) = $(-s.state); @goto exit)
Expand Down Expand Up @@ -544,6 +528,7 @@ function generate_goto_code(ctx::CodeGenContext, machine::Machine, actions::Dict

# This is an overview of the final code structure
return quote
GC.@preserve $(ctx.vars.data) begin
$(ctx.vars.mem) = $(SizedMemory)($(ctx.vars.data))
if $(ctx.vars.p) > $(ctx.vars.p_end)
@goto exit
Expand All @@ -555,52 +540,53 @@ function generate_goto_code(ctx::CodeGenContext, machine::Machine, actions::Dict
$(eof_action_code)
$(ctx.vars.cs) = 0
end
end # GC.preserve
end
end


"""
    append_code!(block::Expr, code::Expr)

Append every argument of `code` to the end of `block.args`, mutating `block`
in place, and return `block`. `code` itself is left unmodified.

Both `block` and `code` are asserted to have head `:block`.
"""
function append_code!(block::Expr, code::Expr)
    # Internal invariant: only `:block` expressions may be merged.
    @assert block.head == :block
    @assert code.head == :block
    target = block.args
    extra = code.args
    append!(target, extra)
    return block
end

# Note: This function has been carefully crafted to produce (nearly) optimal
# assembly code for AVX2-capable CPUs. Change with great care.

# Temporarily disabled because I've come to the realization that Julia does not
# yet make it possible to robustly check what CPU instructions the user has available
# See related issue
#=
function generate_simd_loop(ctx::CodeGenContext, bs::ByteSet)
# ScanByte finds first byte in a byteset. We want to find first
# byte NOT in this byteset, as this is where we can no longer skip ahead to
byteset = ~ScanByte.ByteSet(bs)
bsym = gensym()
quote
# We wrap this in an Automa function, because otherwise the generated code
# would have a reference to ScanByte, which the user may not have imported.
# But they surely have imported Automa.
$bsym = Automa.loop_simd(
$(ctx.vars.mem).ptr + $(ctx.vars.p) - 1,
($(ctx.vars.p_end) - $(ctx.vars.p) + 1) % UInt,
Val($byteset)
)
$(ctx.vars.p) = if $bsym === nothing
$(ctx.vars.p_end) + 1
function generate_simd_loop(ctx::CodeGenContext, bs::ByteSet, state::Int)
vsym = gensym()
rsym = gensym()
block = Expr(:block)
for range in range_encode(bs)
expr = if length(range) == 1
quote
$(rsym) |= ($(vsym) == $(first(range)))
end
else
$(ctx.vars.p) + $bsym - 1
quote
$(rsym) |= ($(vsym) >= $(first(range))) & ($(vsym) <= $(last(range)))
end
end
append_code!(block, expr)
end
quote
while $(ctx.vars.p) + 30 < $(ctx.vars.p_end)
$(vsym) = @inbounds $(SIMD.vload)($(Vec{32, UInt8}), $(ctx.vars.mem).ptr + $(ctx.vars.p) - 1, nothing, Val(false), Val(false))
$(rsym) = $(Vec{32, Bool})(false)
$(block)
all($(rsym)) || break
$(ctx.vars.p) += 32
end
while true
if $(ctx.vars.p) > $(ctx.vars.p_end)
$(ctx.vars.cs) = $(state)
@goto exit
end
$(ctx.vars.byte) = @inbounds getindex($(ctx.vars.mem), $(ctx.vars.p))
($(generate_membership_code(ctx.vars.byte, bs))) || break
$(ctx.vars.p) += 1
end
end
end

# Necessary wrapper function, see comment in `generate_simd_loop`
@inline function loop_simd(ptr::Ptr, len::UInt, valbs::Val)
ScanByte.memchr(ptr, len, valbs)
end
=#

# Make if/else statements for each state that is an acceptable end state, and execute
# the actions attached with ending in this state.
function generate_eof_action_code(ctx::CodeGenContext, machine::Machine, actions::Dict{Symbol,Expr})
Expand Down

0 comments on commit 3ab134c

Please sign in to comment.