diff --git a/lib/rack/utf8_sanitizer.rb b/lib/rack/utf8_sanitizer.rb index e36185d..cf57578 100644 --- a/lib/rack/utf8_sanitizer.rb +++ b/lib/rack/utf8_sanitizer.rb @@ -8,12 +8,13 @@ module Rack class UTF8Sanitizer StringIO = ::StringIO NULL_BYTE_REGEX = /\x00/.freeze + NULL_BYTE_STRING_REGEX = Regexp.new('\\\u0000').freeze class NullByteInString < StandardError; end # options[:sanitizable_content_types] Array # options[:additional_content_types] Array - def initialize(app, options={}) + def initialize(app, options = {}) @app = app @strategy = build_strategy(options) @sanitizable_content_types = options[:sanitizable_content_types] @@ -27,55 +28,54 @@ def call(env) begin env = sanitize(env) rescue EOFError - return [400, { "Content-Type" => "text/plain" }, ["Bad Request"]] + return [400, { 'Content-Type' => 'text/plain' }, ['Bad Request']] end @app.call(env) end DEFAULT_STRATEGIES = { replace: lambda do |input, sanitize_null_bytes: false| - input. - force_encoding(Encoding::ASCII_8BIT). - encode!(Encoding::UTF_8, - invalid: :replace, - undef: :replace) - if sanitize_null_bytes - input = input.gsub(NULL_BYTE_REGEX, "") - end + input + .force_encoding(Encoding::ASCII_8BIT) + .encode!(Encoding::UTF_8, + invalid: :replace, + undef: :replace) + input = input.gsub(NULL_BYTE_REGEX, '').gsub(NULL_BYTE_STRING_REGEX, '') if sanitize_null_bytes input end, exception: lambda do |input, sanitize_null_bytes: false| - input. - force_encoding(Encoding::ASCII_8BIT). - encode!(Encoding::UTF_8) - if sanitize_null_bytes && NULL_BYTE_REGEX.match?(input) + input + .force_encoding(Encoding::ASCII_8BIT) + .encode!(Encoding::UTF_8) + if sanitize_null_bytes && (NULL_BYTE_REGEX.match?(input) || NULL_BYTE_STRING_REGEX.match?(input)) raise NullByteInString end + input end }.freeze # https://github.com/rack/rack/blob/main/SPEC.rdoc - URI_FIELDS = %w( - SCRIPT_NAME - REQUEST_PATH REQUEST_URI PATH_INFO - QUERY_STRING - HTTP_REFERER - ORIGINAL_FULLPATH - ORIGINAL_SCRIPT_NAME - SERVER_NAME - ).map(&:freeze).freeze - - SANITIZABLE_CONTENT_TYPES = %w( + URI_FIELDS = %w[ + SCRIPT_NAME + REQUEST_PATH REQUEST_URI PATH_INFO + QUERY_STRING + HTTP_REFERER + ORIGINAL_FULLPATH + ORIGINAL_SCRIPT_NAME + SERVER_NAME + ].map(&:freeze).freeze + + SANITIZABLE_CONTENT_TYPES = %w[ text/plain application/x-www-form-urlencoded application/json text/javascript - ).map(&:freeze).freeze + ].map(&:freeze).freeze - URI_ENCODED_CONTENT_TYPES = %w( + URI_ENCODED_CONTENT_TYPES = %w[ application/x-www-form-urlencoded - ).map(&:freeze).freeze + ].map(&:freeze).freeze HTTP_ = 'HTTP_'.freeze @@ -86,13 +86,11 @@ def sanitize(env) next if skip?(key) if URI_FIELDS.include?(key) - env[key] = transfer_frozen(value, - sanitize_uri_encoded_string(value)) + env[key] = transfer_frozen(value, sanitize_uri_encoded_string(value)) elsif key.to_s.start_with?(HTTP_) # Just sanitize the headers and leave them in UTF-8. There is # no reason to have UTF-8 in headers, but if it's valid, let it be. - env[key] = transfer_frozen(value, - sanitize_string(value)) + env[key] = transfer_frozen(value, sanitize_string(value)) end end end @@ -117,19 +115,19 @@ def build_strategy(options) def sanitize_rack_input(env) request = Rack::Request.new(env) content_type = request.media_type - return unless @sanitizable_content_types.any? {|type| content_type == type } + return unless @sanitizable_content_types.any? { |type| content_type == type } charset = request.content_charset return if charset && charset.downcase != 'utf-8' - uri_encoded = URI_ENCODED_CONTENT_TYPES.any? {|type| content_type == type} + uri_encoded = URI_ENCODED_CONTENT_TYPES.any? { |type| content_type == type } - if env['rack.input'] - sanitized_input = sanitize_io(env['rack.input'], uri_encoded, env['CONTENT_LENGTH']&.to_i) + return unless env['rack.input'] - env['rack.input'] = sanitized_input - env['CONTENT_LENGTH'] &&= sanitized_input.size.to_s - end + sanitized_input = sanitize_io(env['rack.input'], uri_encoded, env['CONTENT_LENGTH']&.to_i) + + env['rack.input'] = sanitized_input + env['CONTENT_LENGTH'] &&= sanitized_input.size.to_s end # Modeled after Rack::RewindableInput @@ -169,14 +167,14 @@ def close def sanitize_io(io, uri_encoded = false, content_length = nil) input = if content_length && content_length >= 0 - io.read(content_length) - else - io.read - end + io.read(content_length) + else + io.read + end sanitized_input = sanitize_string(strip_byte_order_mark(input)) if uri_encoded - sanitized_input = sanitize_uri_encoded_string(sanitized_input). - force_encoding(Encoding::UTF_8) + sanitized_input = sanitize_uri_encoded_string(sanitized_input) + .force_encoding(Encoding::UTF_8) end sanitized_input = transfer_frozen(input, sanitized_input) SanitizedRackInput.new(io, StringIO.new(sanitized_input)) @@ -191,9 +189,9 @@ def sanitize_cookies(env) return unless env['HTTP_COOKIE'] env['HTTP_COOKIE'] = env['HTTP_COOKIE'] - .split(/[;,] */n) - .map { |cookie| sanitize_uri_encoded_string(cookie) } - .join('; ') + .split(/[;,] */n) + .map { |cookie| sanitize_uri_encoded_string(cookie) } + .join('; ') end # URI.encode/decode expect the input to be in ASCII-8BIT. @@ -206,19 +204,22 @@ def sanitize_cookies(env) # The result is guaranteed to be UTF-8-safe. def sanitize_uri_encoded_string(input) return input if input.nil? + decoded_value = decode_string(input) reencode_string(decoded_value) end def reencode_string(decoded_value) escape_unreserved( - sanitize_string(decoded_value)) + sanitize_string(decoded_value) + ) end def decode_string(input) unescape_unreserved( - sanitize_string(input). - force_encoding(Encoding::ASCII_8BIT)) + sanitize_string(input) + .force_encoding(Encoding::ASCII_8BIT) + ) end # This regexp matches all 'unreserved' characters from RFC3986 (2.3), @@ -233,7 +234,7 @@ def decode_string(input) # enough for our task. def unescape_unreserved(input) input.gsub(/%([a-f\d]{2})/i) do |encoded| - decoded = $1.hex.chr + decoded = ::Regexp.last_match(1).hex.chr decodable_regex = @sanitize_null_bytes ? UNRESERVED_OR_UTF8_OR_NULL : UNRESERVED_OR_UTF8 if decoded =~ decodable_regex @@ -250,7 +251,7 @@ def unescape_unreserved(input) # `unescape_unreserved` invocation. # # See also URI::REGEXP::PATTERN::{UNRESERVED,RESERVED}. - UNSAFE = /[^\-_.!~*'()a-zA-Z\d;\/?:@&=+$,\[\]%]/ + UNSAFE = %r{[^\-_.!~*'()a-zA-Z\d;/?:@&=+$,\[\]%]}.freeze # Performs the reverse function of `unescape_unreserved`. Unlike # the previous function, we can reuse the logic in URI#encode @@ -262,7 +263,8 @@ def sanitize_string(input) if input.is_a? String input = input.dup.force_encoding(Encoding::UTF_8) - if input.valid_encoding? && !(@sanitize_null_bytes && input =~ NULL_BYTE_REGEX) + if input.valid_encoding? && + !(@sanitize_null_bytes && (NULL_BYTE_REGEX.match?(input) || NULL_BYTE_STRING_REGEX.match?(input))) input else @strategy.call(input, sanitize_null_bytes: @sanitize_null_bytes) @@ -285,6 +287,7 @@ def transfer_frozen(from, to) def strip_byte_order_mark(input) return input unless input.start_with?(UTF8_BOM) + input.byteslice(UTF8_BOM_SIZE..-1) end end diff --git a/test/test_utf8_sanitizer.rb b/test/test_utf8_sanitizer.rb index e3f4f87..0490fdd 100644 --- a/test/test_utf8_sanitizer.rb +++ b/test/test_utf8_sanitizer.rb @@ -395,6 +395,18 @@ def read end end + it "optionally sanitizes null bytes plain string with the replace strategy" do + @app = Rack::UTF8Sanitizer.new(-> env { env }, sanitize_null_bytes: true) + input = "foo=bla\xED&quux=bar" + '\u0000' + @rack_input = StringIO.new input + + sanitize_form_data do |sanitized_input| + sanitized_input.encoding.should == Encoding::UTF_8 + sanitized_input.should.be.valid_encoding + sanitized_input.should == "foo=bla%EF%BF%BD&quux=bar" + end + end + it "optionally sanitizes encoded null bytes with the replace strategy" do @app = Rack::UTF8Sanitizer.new(-> env { env }, sanitize_null_bytes: true) input = "foo=bla%ED&quux=bar%00"