From 2def80568b61b794ee65ade8a866eca7b6ff7e5f Mon Sep 17 00:00:00 2001 From: "Eloy Lafuente (stronk7)" Date: Sun, 6 Mar 2022 16:30:20 +0100 Subject: [PATCH 1/3] MDL-74097 core_text: Ensure that transliteration always happens Before the patch, transliteration was only happening when the encoding of the string was utf-8. For other encodings only a simpler conversion (iconv) to ascii was done. For some reason, iconv()'s own transliteration abilities are not consistent between systems (it depends on the libraries installed, locales and other bits). So now we always convert the string to utf-8, in order to transliterate it. And finally, also perform an iconv to cover some characters that transliterate doesn't handle well. Also, remove a block of code that does nothing (previously it was executing some code, but now it just sets and restores the error level for nothing). --- lib/classes/text.php | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/lib/classes/text.php b/lib/classes/text.php index f95d2b55c51fb..9402863a7d54d 100644 --- a/lib/classes/text.php +++ b/lib/classes/text.php @@ -158,21 +158,13 @@ public static function convert($text, $fromCS, $toCS='utf-8') { } if ($toCS === 'ascii') { - // Try to normalize the conversion a bit. - $text = self::specialtoascii($text, $fromCS); + // Try to normalize the conversion a bit if the target is ascii. + return self::specialtoascii($text, $fromCS); } // Prevent any error notices, do not use //IGNORE so that we get // consistent result if iconv fails. - $result = @iconv($fromCS, $toCS.'//TRANSLIT', $text); - - if ($result === false or $result === '') { - // Note: iconv is prone to return empty string when invalid char encountered, or false if encoding unsupported. 
- $oldlevel = error_reporting(E_PARSE); - error_reporting($oldlevel); - } - - return $result; + return @iconv($fromCS, $toCS.'//TRANSLIT', $text); } /** @@ -341,10 +333,14 @@ public static function specialtoascii($text, $charset='utf-8') { $charset = self::parse_charset($charset); $oldlevel = error_reporting(E_PARSE); - if ($charset == 'utf-8') { - $text = transliterator_transliterate('Any-Latin; Latin-ASCII', (string) $text); + // Always convert to utf-8, so transliteration can do its work always. + if ($charset !== 'utf-8') { + $text = iconv($charset, 'utf-8'.'//TRANSLIT', $text); } - $result = iconv($charset, 'ASCII//TRANSLIT//IGNORE', (string) $text); + $text = transliterator_transliterate('Any-Latin; Latin-ASCII', (string) $text); + + // Still, apply iconv because some chars are not handled by transliterate. + $result = iconv('utf-8', 'ASCII//TRANSLIT//IGNORE', (string) $text); error_reporting($oldlevel); return $result; From 3255c42bf47a6e2ca338df561c4613899e75c955 Mon Sep 17 00:00:00 2001 From: "Eloy Lafuente (stronk7)" Date: Sun, 6 Mar 2022 16:33:49 +0100 Subject: [PATCH 2/3] MDL-74097 core_text: Add coverage info to better see what's missing --- lib/tests/text_test.php | 47 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/lib/tests/text_test.php b/lib/tests/text_test.php index 36e4923ec9c76..8e795816bb7e8 100644 --- a/lib/tests/text_test.php +++ b/lib/tests/text_test.php @@ -33,11 +33,15 @@ * @category phpunit * @copyright 2010 Petr Skoda (http://skodak.org) * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later + * @coversDefaultClass \core_text + * */ class text_test extends advanced_testcase { /** * Tests the static parse charset method. + * + * @covers ::parse_charset() */ public function test_parse_charset() { $this->assertSame('windows-1250', core_text::parse_charset('Cp1250')); @@ -47,6 +51,8 @@ public function test_parse_charset() { /** * Tests the static convert method. 
+ * + * @covers ::convert() */ public function test_convert() { $utf8 = "Žluťoučký koníček"; @@ -103,6 +109,8 @@ public function test_convert() { /** * Tests the static sub string method. + * + * @covers ::substr() */ public function test_substr() { $str = "Žluťoučký koníček"; @@ -145,6 +153,8 @@ public function test_substr() { /** * Tests the static string length method. + * + * @covers ::strlen() */ public function test_strlen() { $str = "Žluťoučký koníček"; @@ -184,6 +194,8 @@ public function test_strlen() { /** * Test unicode safe string truncation. + * + * @covers ::str_max_bytes() */ public function test_str_max_bytes() { // These are all 3 byte characters, so this is a 12-byte string. @@ -234,6 +246,8 @@ public function test_str_max_bytes() { /** * Tests the static strtolower method. + * + * @covers ::strtolower() */ public function test_strtolower() { $str = "Žluťoučký koníček"; @@ -270,6 +284,8 @@ public function test_strtolower() { /** * Tests the static strtoupper. + * + * @covers ::strtoupper() */ public function test_strtoupper() { $str = "Žluťoučký koníček"; @@ -303,6 +319,8 @@ public function test_strtoupper() { /** * Test the strrev method. + * + * @covers ::strrev() */ public function test_strrev() { $strings = array( @@ -323,6 +341,8 @@ public function test_strrev() { /** * Tests the static strpos method. + * + * @covers ::strpos() */ public function test_strpos() { $str = "Žluťoučký koníček"; @@ -331,6 +351,8 @@ public function test_strpos() { /** * Tests the static strrpos. + * + * @covers ::strrpos() */ public function test_strrpos() { $str = "Žluťoučký koníček"; @@ -339,6 +361,8 @@ public function test_strrpos() { /** * Tests the static specialtoascii method. + * + * @covers ::specialtoascii() */ public function test_specialtoascii() { $str = "Žluťoučký koníček"; @@ -349,7 +373,9 @@ public function test_specialtoascii() { /** * Tests the static encode_mimeheader method. 
- * This also tests method moodle_phpmailer::encodeHeader that calls core_text::encode_mimeheader + * + * @covers ::encode_mimeheader() + * @covers \moodle_phpmailer::encodeHeader() */ public function test_encode_mimeheader() { global $CFG; @@ -388,6 +414,8 @@ public function test_encode_mimeheader() { /** * Tests the static entities_to_utf8 method. + * + * @covers ::entities_to_utf8() */ public function test_entities_to_utf8() { $str = "Žluťoučký koníček©"&<>§«"; @@ -396,6 +424,8 @@ public function test_entities_to_utf8() { /** * Tests the static utf8_to_entities method. + * + * @covers ::utf8_to_entities() */ public function test_utf8_to_entities() { $str = "Žluťoučký koníček©"&<>§«"; @@ -409,6 +439,8 @@ public function test_utf8_to_entities() { /** * Tests the static trim_utf8_bom method. + * + * @covers ::trim_utf8_bom() */ public function test_trim_utf8_bom() { $bom = "\xef\xbb\xbf"; @@ -418,6 +450,8 @@ public function test_trim_utf8_bom() { /** * Tests the static remove_unicode_non_characters method. + * + * @covers ::remove_unicode_non_characters() */ public function test_remove_unicode_non_characters() { // Confirm that texts which don't contain these characters are unchanged. @@ -439,6 +473,8 @@ public function test_remove_unicode_non_characters() { /** * Tests the static get_encodings method. + * + * @covers ::get_encodings() */ public function test_get_encodings() { $encodings = core_text::get_encodings(); @@ -449,6 +485,8 @@ public function test_get_encodings() { /** * Tests the static code2utf8 method. + * + * @covers ::code2utf8() */ public function test_code2utf8() { $this->assertSame('Ž', core_text::code2utf8(381)); @@ -456,6 +494,8 @@ public function test_code2utf8() { /** * Tests the static utf8ord method. + * + * @covers ::utf8ord() */ public function test_utf8ord() { $this->assertSame(ord(''), core_text::utf8ord('')); @@ -468,6 +508,8 @@ public function test_utf8ord() { /** * Tests the static strtotitle method. 
+ * + * @covers ::strtotitle() */ public function test_strtotitle() { $str = "žluťoučký koníček"; @@ -476,6 +518,8 @@ public function test_strtotitle() { /** * Test strrchr. + * + * @covers ::strrchr() */ public function test_strrchr() { $str = "Žluťoučký koníček"; @@ -491,6 +535,7 @@ public function test_strrchr() { * @dataProvider is_charset_supported_provider() * @param string $charset * @param bool $expected + * @covers ::is_charset_supported() */ public function test_is_charset_supported(string $charset, bool $expected) { $charset = core_text::parse_charset($charset); From 1af724e105a3a6589093394bc84a8cf6e470ef65 Mon Sep 17 00:00:00 2001 From: "Eloy Lafuente (stronk7)" Date: Sun, 6 Mar 2022 16:48:46 +0100 Subject: [PATCH 3/3] MDL-74097 core_text: Add some tests to cover all the logic --- lib/tests/text_test.php | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/lib/tests/text_test.php b/lib/tests/text_test.php index 8e795816bb7e8..e392865c6aa56 100644 --- a/lib/tests/text_test.php +++ b/lib/tests/text_test.php @@ -55,6 +55,7 @@ public function test_parse_charset() { * @covers ::convert() */ public function test_convert() { + $this->assertSame('', core_text::convert('', 'utf-8', 'utf-8')); $utf8 = "Žluťoučký koníček"; $iso2 = pack("H*", "ae6c75bb6f75e86bfd206b6f6eede8656b"); $win = pack("H*", "8e6c759d6f75e86bfd206b6f6eede8656b"); @@ -367,8 +368,20 @@ public function test_strrpos() { public function test_specialtoascii() { $str = "Žluťoučký koníček"; $this->assertSame('Zlutoucky konicek', core_text::specialtoascii($str)); + $utf8 = "Der eine stößt den Speer zum Mann"; + $iso1 = core_text::convert($utf8, 'utf-8', 'iso-8859-1'); $this->assertSame('Der eine stosst den Speer zum Mann', core_text::specialtoascii($utf8)); + $this->assertSame('Der eine stosst den Speer zum Mann', core_text::specialtoascii($iso1, 'iso-8859-1')); + + $str = 'àáâãäçèéêëìíîïñòóôõöùúûüýÿÀÁÂÃÄÇÈÉÊËÌÍÎÏÑÒÓÔÕÖÙÚÛÜÝ'; + 
$this->assertSame('aaaaaceeeeiiiinooooouuuuyyAAAAACEEEEIIIINOOOOOUUUUY', core_text::specialtoascii($str)); + + $utf8 = 'A æ Übérmensch på høyeste nivå! И я люблю PHP! есть. fi'; + $this->assertSame('A ae Ubermensch pa hoyeste niva! I a lublu PHP! est\'. fi', core_text::specialtoascii($utf8, 'utf8')); + + $utf8 = 'キャンパス Αλφαβητικός Κατάλογος Лорем ипсум долор сит амет'; + $this->assertSame('kyanpasu Alphabetikos Katalogos Lorem ipsum dolor sit amet', core_text::specialtoascii($utf8)); } /**