Skip to content

Commit

Permalink
Remove '\u0000' from input when sanitizing null input
Browse files: browse the repository at this point in the history
  • Loading branch information
such committed Oct 28, 2024
1 parent f2bcf86 commit 20d0065
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 54 deletions.
111 changes: 57 additions & 54 deletions lib/rack/utf8_sanitizer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,13 @@ module Rack
class UTF8Sanitizer
StringIO = ::StringIO

# Matches a raw NUL byte anywhere in the input.
NULL_BYTE_REGEX = /\x00/.freeze

# Matches the six literal characters backslash, u, 0, 0, 0, 0 (the escaped
# spelling of NUL), not the NUL byte itself.
NULL_BYTE_STRING_REGEX = /\\u0000/.freeze

# Raised by the :exception strategy when null-byte sanitizing is enabled and
# the input contains a NUL byte or its escaped spelling.
class NullByteInString < StandardError
end

# options[:sanitizable_content_types] Array
# options[:additional_content_types] Array
def initialize(app, options={})
def initialize(app, options = {})
@app = app
@strategy = build_strategy(options)
@sanitizable_content_types = options[:sanitizable_content_types]
Expand All @@ -27,55 +28,54 @@ def call(env)
begin
env = sanitize(env)
rescue EOFError
return [400, { "Content-Type" => "text/plain" }, ["Bad Request"]]
return [400, { 'Content-Type' => 'text/plain' }, ['Bad Request']]
end
@app.call(env)
end

# Built-in strategies for handling input that is not acceptable as UTF-8.
# Each strategy is called with the input string and a +sanitize_null_bytes+
# keyword and returns the string to use.
#
# Both strategies transcode the receiver in place (+force_encoding+ and
# +encode!+ mutate it), so the caller must pass a string it may modify.
DEFAULT_STRATEGIES = {
  # Replaces invalid or undefined bytes with the encoder's replacement
  # character. With +sanitize_null_bytes+ it also strips NUL bytes and every
  # occurrence of the text matched by NULL_BYTE_STRING_REGEX.
  replace: lambda do |input, sanitize_null_bytes: false|
    input
      .force_encoding(Encoding::ASCII_8BIT)
      .encode!(Encoding::UTF_8,
               invalid: :replace,
               undef: :replace)
    if sanitize_null_bytes
      input = input.gsub(NULL_BYTE_REGEX, '')
      # A single pass is not enough: deleting an inner match can splice the
      # surrounding characters into a new match, so repeat until none remain.
      input = input.gsub(NULL_BYTE_STRING_REGEX, '') while NULL_BYTE_STRING_REGEX.match?(input)
    end
    input
  end,
  # Transcodes strictly, letting +encode!+ raise on bytes it cannot convert.
  # With +sanitize_null_bytes+ it raises NullByteInString when the result
  # contains a NUL byte or the text matched by NULL_BYTE_STRING_REGEX.
  exception: lambda do |input, sanitize_null_bytes: false|
    input
      .force_encoding(Encoding::ASCII_8BIT)
      .encode!(Encoding::UTF_8)
    if sanitize_null_bytes && (NULL_BYTE_REGEX.match?(input) || NULL_BYTE_STRING_REGEX.match?(input))
      raise NullByteInString
    end

    input
  end
}.freeze

# Env keys whose values are sanitized as URI-encoded strings.
# https://github.com/rack/rack/blob/main/SPEC.rdoc
URI_FIELDS = %w[
  SCRIPT_NAME
  REQUEST_PATH REQUEST_URI PATH_INFO
  QUERY_STRING
  HTTP_REFERER
  ORIGINAL_FULLPATH
  ORIGINAL_SCRIPT_NAME
  SERVER_NAME
].map(&:freeze).freeze

# Media types whose request bodies are candidates for sanitizing.
# NOTE(review): presumably the default for options[:sanitizable_content_types];
# confirm against the part of initialize not visible here.
SANITIZABLE_CONTENT_TYPES = %w[
  text/plain
  application/x-www-form-urlencoded
  application/json
  text/javascript
].map(&:freeze).freeze

# Media types whose bodies are additionally treated as URI-encoded.
URI_ENCODED_CONTENT_TYPES = %w[
  application/x-www-form-urlencoded
].map(&:freeze).freeze

# Prefix of the env keys that carry request headers.
HTTP_ = 'HTTP_'.freeze

Expand All @@ -86,13 +86,11 @@ def sanitize(env)
next if skip?(key)

if URI_FIELDS.include?(key)
env[key] = transfer_frozen(value,
sanitize_uri_encoded_string(value))
env[key] = transfer_frozen(value, sanitize_uri_encoded_string(value))
elsif key.to_s.start_with?(HTTP_)
# Just sanitize the headers and leave them in UTF-8. There is
# no reason to have UTF-8 in headers, but if it's valid, let it be.
env[key] = transfer_frozen(value,
sanitize_string(value))
env[key] = transfer_frozen(value, sanitize_string(value))
end
end
end
Expand All @@ -117,19 +115,19 @@ def build_strategy(options)
# Replaces env['rack.input'] with a sanitized version of the request body.
#
# Does nothing when the request's media type is not one of
# @sanitizable_content_types, when a charset other than UTF-8 is declared,
# or when the env carries no 'rack.input'. The body is read and wrapped
# exactly once. CONTENT_LENGTH is rewritten to the sanitized size only if
# it was already set.
def sanitize_rack_input(env)
  request = Rack::Request.new(env)
  content_type = request.media_type
  return unless @sanitizable_content_types.any? { |type| content_type == type }

  charset = request.content_charset
  return if charset && charset.downcase != 'utf-8'

  # Guard before reading: there is nothing to sanitize without a body stream.
  return unless env['rack.input']

  uri_encoded = URI_ENCODED_CONTENT_TYPES.any? { |type| content_type == type }
  sanitized_input = sanitize_io(env['rack.input'], uri_encoded, env['CONTENT_LENGTH']&.to_i)

  env['rack.input'] = sanitized_input
  env['CONTENT_LENGTH'] &&= sanitized_input.size.to_s
end

# Modeled after Rack::RewindableInput
Expand Down Expand Up @@ -169,14 +167,14 @@ def close

def sanitize_io(io, uri_encoded = false, content_length = nil)
input = if content_length && content_length >= 0
io.read(content_length)
else
io.read
end
io.read(content_length)
else
io.read
end
sanitized_input = sanitize_string(strip_byte_order_mark(input))
if uri_encoded
sanitized_input = sanitize_uri_encoded_string(sanitized_input).
force_encoding(Encoding::UTF_8)
sanitized_input = sanitize_uri_encoded_string(sanitized_input)
.force_encoding(Encoding::UTF_8)
end
sanitized_input = transfer_frozen(input, sanitized_input)
SanitizedRackInput.new(io, StringIO.new(sanitized_input))
Expand All @@ -191,9 +189,9 @@ def sanitize_cookies(env)
return unless env['HTTP_COOKIE']

env['HTTP_COOKIE'] = env['HTTP_COOKIE']
.split(/[;,] */n)
.map { |cookie| sanitize_uri_encoded_string(cookie) }
.join('; ')
.split(/[;,] */n)
.map { |cookie| sanitize_uri_encoded_string(cookie) }
.join('; ')
end

# URI.encode/decode expect the input to be in ASCII-8BIT.
Expand All @@ -206,19 +204,22 @@ def sanitize_cookies(env)
# The result is guaranteed to be UTF-8-safe.
def sanitize_uri_encoded_string(input)
  # nil passes straight through; anything else is decoded, then re-encoded.
  return if input.nil?

  reencode_string(decode_string(input))
end

# Sanitizes an already-decoded value and passes the result through
# escape_unreserved, returning what that call returns.
def reencode_string(decoded_value)
  escape_unreserved(sanitize_string(decoded_value))
end

# Sanitizes the input, relabels the result as ASCII-8BIT (binary) and passes
# it to unescape_unreserved, returning what that call returns.
def decode_string(input)
  unescape_unreserved(
    sanitize_string(input).force_encoding(Encoding::ASCII_8BIT)
  )
end

# This regexp matches all 'unreserved' characters from RFC3986 (2.3),
Expand All @@ -233,7 +234,7 @@ def decode_string(input)
# enough for our task.
def unescape_unreserved(input)
input.gsub(/%([a-f\d]{2})/i) do |encoded|
decoded = $1.hex.chr
decoded = ::Regexp.last_match(1).hex.chr

decodable_regex = @sanitize_null_bytes ? UNRESERVED_OR_UTF8_OR_NULL : UNRESERVED_OR_UTF8
if decoded =~ decodable_regex
Expand All @@ -250,7 +251,7 @@ def unescape_unreserved(input)
# `unescape_unreserved` invocation.
#
# See also URI::REGEXP::PATTERN::{UNRESERVED,RESERVED}.
# Matches any single character outside the URI characters listed in the
# class below (letters, digits, the listed punctuation, and '%').
UNSAFE = %r{[^\-_.!~*'()a-zA-Z\d;/?:@&=+$,\[\]%]}.freeze

# Performs the reverse function of `unescape_unreserved`. Unlike
# the previous function, we can reuse the logic in URI#encode
Expand All @@ -262,7 +263,8 @@ def sanitize_string(input)
if input.is_a? String
input = input.dup.force_encoding(Encoding::UTF_8)

if input.valid_encoding? && !(@sanitize_null_bytes && input =~ NULL_BYTE_REGEX)
if input.valid_encoding? &&
!(@sanitize_null_bytes && (NULL_BYTE_REGEX.match?(input) || NULL_BYTE_STRING_REGEX.match?(input)))
input
else
@strategy.call(input, sanitize_null_bytes: @sanitize_null_bytes)
Expand All @@ -285,6 +287,7 @@ def transfer_frozen(from, to)

# Returns the input without its leading UTF8_BOM; input that does not start
# with the mark is returned unchanged.
def strip_byte_order_mark(input)
  if input.start_with?(UTF8_BOM)
    input.byteslice(UTF8_BOM_SIZE..-1)
  else
    input
  end
end
end
Expand Down
12 changes: 12 additions & 0 deletions test/test_utf8_sanitizer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,18 @@ def read
end
end

# Covers the escaped spelling of NUL: with sanitize_null_bytes enabled, the
# literal text \u0000 in a form body is expected to be dropped, alongside
# the usual replacement of invalid bytes.
it "optionally sanitizes null bytes plain string with the replace strategy" do
@app = Rack::UTF8Sanitizer.new(-> env { env }, sanitize_null_bytes: true)
# The single-quoted '\u0000' is six literal characters (backslash, u, 0, 0,
# 0, 0), not a NUL byte. "\xED" on its own is an invalid UTF-8 sequence.
input = "foo=bla\xED&quux=bar" + '\u0000'
@rack_input = StringIO.new input

sanitize_form_data do |sanitized_input|
sanitized_input.encoding.should == Encoding::UTF_8
sanitized_input.should.be.valid_encoding
# Expected: the invalid byte shows up as %EF%BF%BD (percent-encoded U+FFFD)
# and the trailing \u0000 text is gone.
sanitized_input.should == "foo=bla%EF%BF%BD&quux=bar"
end
end

it "optionally sanitizes encoded null bytes with the replace strategy" do
@app = Rack::UTF8Sanitizer.new(-> env { env }, sanitize_null_bytes: true)
input = "foo=bla%ED&quux=bar%00"
Expand Down

0 comments on commit 20d0065

Please sign in to comment.