From 402512e1a59a46bdc20321fea7b52cc7e7b78976 Mon Sep 17 00:00:00 2001 From: Hibariya Date: Mon, 18 Dec 2023 09:17:25 +0000 Subject: [PATCH 1/2] Skip if the charset is non-utf-8 --- lib/rack/utf8_sanitizer.rb | 5 +++++ test/test_utf8_sanitizer.rb | 24 ++++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/lib/rack/utf8_sanitizer.rb b/lib/rack/utf8_sanitizer.rb index eea3f12..5e50424 100644 --- a/lib/rack/utf8_sanitizer.rb +++ b/lib/rack/utf8_sanitizer.rb @@ -2,6 +2,7 @@ require 'uri' require 'stringio' +require 'rack/request' module Rack class UTF8Sanitizer @@ -126,6 +127,10 @@ def sanitize_rack_input(env) end end return unless @sanitizable_content_types.any? {|type| content_type == type } + + charset = Rack::Request.new(env).content_charset + return if charset && charset.downcase != 'utf-8' + uri_encoded = URI_ENCODED_CONTENT_TYPES.any? {|type| content_type == type} if env['rack.input'] diff --git a/test/test_utf8_sanitizer.rb b/test/test_utf8_sanitizer.rb index f0240c2..cb055c8 100644 --- a/test/test_utf8_sanitizer.rb +++ b/test/test_utf8_sanitizer.rb @@ -252,6 +252,18 @@ def read end end + it "sanitizes the rack body if the charset is present and utf-8" do + input = "name=#{CGI.escape("まつもと")}" + @rack_input = StringIO.new input + + env = request_env.update('CONTENT_TYPE' => "application/x-www-form-urlencoded; charset=utf-8") + sanitize_form_data(env) do |sanitized_input| + sanitized_input.encoding.should == Encoding::UTF_8 + sanitized_input.should.be.valid_encoding + sanitized_input.should == input + end + end + it "strip UTF-8 BOM from StringIO rack.input" do input = %(\xef\xbb\xbf{"Hello": "World"}) @rack_input = StringIO.new input @@ -327,6 +339,18 @@ def read end end + it "does not sanitize the rack body if the charset is present and not utf-8" do + input = "name=".encode("Shift_JIS") + CGI.escape("まつもと".encode("Shift_JIS", "UTF-8")) + @rack_input = StringIO.new input + + env = request_env.update('CONTENT_TYPE' => "application/x-www-form-urlencoded; charset=Shift_JIS") + sanitize_form_data(env) do |sanitized_input| + sanitized_input.encoding.should == Encoding::SHIFT_JIS + sanitized_input.should.be.valid_encoding + sanitized_input.should == input + end + end + it "adjusts content-length when replacing input" do input = "foo=bla&quux=bar\xED" @rack_input = StringIO.new input From e8111c5c6fc9ba9dca92e4cba86fa37f8f1b5d75 Mon Sep 17 00:00:00 2001 From: Hibariya Date: Mon, 18 Dec 2023 10:03:27 +0000 Subject: [PATCH 2/2] Rack::Request#media_type is available now --- lib/rack/utf8_sanitizer.rb | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/lib/rack/utf8_sanitizer.rb b/lib/rack/utf8_sanitizer.rb index 5e50424..cc5a3d1 100644 --- a/lib/rack/utf8_sanitizer.rb +++ b/lib/rack/utf8_sanitizer.rb @@ -116,19 +116,11 @@ def build_strategy(options) end def sanitize_rack_input(env) - # https://github.com/rack/rack/blob/master/lib/rack/request.rb#L42 - # Logic borrowed from Rack::Request#media_type,#media_type_params,#content_charset - # Ignoring charset in content type. - if content_type = env['CONTENT_TYPE'] - content_type = content_type.split(/[;,]/, 2).first - if content_type - content_type.strip! - content_type.downcase! - end - end + request = Rack::Request.new(env) + content_type = request.media_type return unless @sanitizable_content_types.any? {|type| content_type == type } - charset = Rack::Request.new(env).content_charset + charset = request.content_charset return if charset && charset.downcase != 'utf-8' uri_encoded = URI_ENCODED_CONTENT_TYPES.any? {|type| content_type == type}