From 6e52a1237eaba36c76caecfcff44b7a55ce4ae74 Mon Sep 17 00:00:00 2001 From: PyYoshi Date: Tue, 11 Jun 2024 19:44:25 +0900 Subject: [PATCH] =?UTF-8?q?refactor:=20uchardet=E3=81=AE=E3=83=86=E3=82=B9?= =?UTF-8?q?=E3=83=88=E3=83=87=E3=83=BC=E3=82=BF=E3=82=92=E5=88=A9=E7=94=A8?= =?UTF-8?q?=E3=81=97=E3=81=9F=E3=83=86=E3=82=B9=E3=83=88=E3=81=B8=E5=A4=89?= =?UTF-8?q?=E6=9B=B4=E3=81=99=E3=82=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_1.py | 84 +++++++++++++++---------- tests/testdata/ar/iso-8859-6.txt | 3 - tests/testdata/ar/utf-8.txt | 3 - tests/testdata/ar/windows-1256.txt | 3 - tests/testdata/bg/windows-1251.txt | 3 - tests/testdata/cs/ibm852.txt | 4 -- tests/testdata/cs/iso-8859-2.txt | 4 -- tests/testdata/cs/maccentraleurope.txt | 4 -- tests/testdata/cs/utf-8.txt | 4 -- tests/testdata/cs/windows-1250.txt | 4 -- tests/testdata/da/iso-8859-1.txt | 7 --- tests/testdata/da/iso-8859-15.txt | 10 --- tests/testdata/da/utf-8.txt | 10 --- tests/testdata/da/windows-1252.txt | 10 --- tests/testdata/de/iso-8859-1.txt | 11 ---- tests/testdata/de/windows-1252.txt | 11 ---- tests/testdata/el/iso-8859-7.txt | 3 - tests/testdata/el/utf-8.txt | 3 - tests/testdata/el/windows-1253.txt | 5 -- tests/testdata/en/ascii.txt | 4 -- tests/testdata/eo/iso-8859-3.txt | 7 --- tests/testdata/es/iso-8859-1.txt | 5 -- tests/testdata/es/iso-8859-15.txt | 5 -- tests/testdata/es/utf-8.txt | 5 -- tests/testdata/es/windows-1252.txt | 5 -- tests/testdata/et/iso-8859-13.txt | 6 -- tests/testdata/et/iso-8859-15.txt | 6 -- tests/testdata/et/iso-8859-4.txt | 6 -- tests/testdata/et/utf-8.txt | 6 -- tests/testdata/et/windows-1252.txt | 6 -- tests/testdata/et/windows-1257.txt | 6 -- tests/testdata/fi/iso-8859-1.txt | 8 --- tests/testdata/fi/utf-8.txt | 8 --- tests/testdata/fr/iso-8859-1.txt | 5 -- tests/testdata/fr/iso-8859-15.txt | 16 ----- tests/testdata/fr/utf-16.be | Bin 1080 -> 0 bytes tests/testdata/fr/utf-32.le | Bin 1356 -> 0 bytes tests/testdata/fr/utf-8.txt | 14 ----- tests/testdata/fr/windows-1252.txt | 3 - tests/testdata/ga/iso-8859-1.txt | 6 -- tests/testdata/ga/utf-8.txt | 6 -- tests/testdata/ga/windows-1252.txt | 6 -- tests/testdata/he/iso-8859-8.txt | 2 - tests/testdata/he/utf-8.txt | 3 - tests/testdata/he/windows-1255.txt | 1 - tests/testdata/hr/ibm852.txt | 4 -- tests/testdata/hr/iso-8859-13.txt | 4 -- tests/testdata/hr/iso-8859-16.txt | 4 -- tests/testdata/hr/iso-8859-2.txt | 4 -- tests/testdata/hr/maccentraleurope.txt | 4 -- tests/testdata/hr/utf-8.txt | 4 -- tests/testdata/hr/windows-1250.txt | 4 -- tests/testdata/hu/iso-8859-2.txt | 3 - tests/testdata/hu/windows-1250.txt | 1 - tests/testdata/it/iso-8859-1.txt | 18 ------ tests/testdata/it/utf-8.txt | 18 ------ tests/testdata/ja/euc-jp.txt | 10 --- tests/testdata/ja/iso-2022-jp.txt | 8 --- tests/testdata/ja/shift_jis.txt | 1 - tests/testdata/ja/utf-16be.txt | Bin 1416 -> 0 bytes tests/testdata/ja/utf-16le.txt | Bin 1416 -> 0 bytes tests/testdata/ja/utf-8.txt | 9 --- tests/testdata/ko/iso-2022-kr.txt | 8 --- tests/testdata/ko/uhc.smi | 16 ----- tests/testdata/ko/utf-16.le | Bin 376 -> 0 bytes tests/testdata/ko/utf-32.be | Bin 752 -> 0 bytes tests/testdata/ko/utf-8.txt | 3 - tests/testdata/lt/iso-8859-10.txt | 3 - tests/testdata/lt/iso-8859-13.txt | 3 - tests/testdata/lt/iso-8859-4.txt | 3 - tests/testdata/lt/utf-8.txt | 3 - tests/testdata/lv/iso-8859-10.txt | 6 -- tests/testdata/lv/iso-8859-13.txt | 6 -- tests/testdata/lv/iso-8859-4.txt | 6 -- tests/testdata/lv/utf-8.txt | 6 -- tests/testdata/mt/iso-8859-3.txt | 4 -- tests/testdata/mt/utf-8.txt | 4 -- tests/testdata/pl/ibm852.txt | 3 - tests/testdata/pl/iso-8859-13.txt | 3 - tests/testdata/pl/iso-8859-16.txt | 3 - tests/testdata/pl/iso-8859-2.txt | 3 - tests/testdata/pl/maccentraleurope.txt | 3 - tests/testdata/pl/utf-8.txt | 3 - tests/testdata/pl/windows-1250.txt | 3 - tests/testdata/pt/iso-8859-1.txt | 6 -- tests/testdata/pt/utf-8.txt | 6 -- tests/testdata/ro/ibm852.txt | 9 --- tests/testdata/ro/iso-8859-16.txt | 9 --- tests/testdata/ro/utf-8.txt | 9 --- tests/testdata/ro/windows-1250.txt | 9 --- tests/testdata/ru/ibm855.txt | 5 -- tests/testdata/ru/ibm866.txt | 11 ---- tests/testdata/ru/iso-8859-5.txt | 3 - tests/testdata/ru/koi8-r.txt | 1 - tests/testdata/ru/maccyrillic.txt | 9 --- tests/testdata/ru/windows-1251.txt | 4 -- tests/testdata/sk/ibm852.txt | 3 - tests/testdata/sk/iso-8859-2.txt | 3 - tests/testdata/sk/maccentraleurope.txt | 3 - tests/testdata/sk/utf-8.txt | 3 - tests/testdata/sk/windows-1250.txt | 3 - tests/testdata/sl/ibm852.txt | 9 --- tests/testdata/sl/iso-8859-16.txt | 9 --- tests/testdata/sl/iso-8859-2.txt | 9 --- tests/testdata/sl/maccentraleurope.txt | 9 --- tests/testdata/sl/utf-8.txt | 9 --- tests/testdata/sl/windows-1250.txt | 9 --- tests/testdata/sv/iso-8859-1.txt | 10 --- tests/testdata/sv/utf-8.txt | 10 --- tests/testdata/sv/windows-1252.txt | 10 --- tests/testdata/th/iso-8859-11.txt | 5 -- tests/testdata/th/tis-620.txt | 5 -- tests/testdata/th/utf-8.txt | 1 - tests/testdata/tr/iso-8859-3.txt | 13 ---- tests/testdata/tr/iso-8859-9.txt | 13 ---- tests/testdata/vi/utf-8.txt | 4 -- tests/testdata/vi/viscii.txt | 4 -- tests/testdata/vi/windows-1258.txt | 4 -- tests/testdata/zh/big5.txt | 1 - tests/testdata/zh/euc-tw.txt | 1 - tests/testdata/zh/gb18030.txt | 1 - tests/testdata/zh/utf-8.txt | 1 - 122 files changed, 50 insertions(+), 702 deletions(-) delete mode 100644 tests/testdata/ar/iso-8859-6.txt delete mode 100644 tests/testdata/ar/utf-8.txt delete mode 100644 tests/testdata/ar/windows-1256.txt delete mode 100644 tests/testdata/bg/windows-1251.txt delete mode 100644 tests/testdata/cs/ibm852.txt delete mode 100644 tests/testdata/cs/iso-8859-2.txt delete mode 100644 tests/testdata/cs/maccentraleurope.txt delete mode 100644 tests/testdata/cs/utf-8.txt delete mode 100644 tests/testdata/cs/windows-1250.txt delete mode 100644 tests/testdata/da/iso-8859-1.txt delete mode 100644 tests/testdata/da/iso-8859-15.txt delete mode 100644 tests/testdata/da/utf-8.txt delete mode 100644 tests/testdata/da/windows-1252.txt delete mode 100644 tests/testdata/de/iso-8859-1.txt delete mode 100644 tests/testdata/de/windows-1252.txt delete mode 100644 tests/testdata/el/iso-8859-7.txt delete mode 100644 tests/testdata/el/utf-8.txt delete mode 100644 tests/testdata/el/windows-1253.txt delete mode 100644 tests/testdata/en/ascii.txt delete mode 100644 tests/testdata/eo/iso-8859-3.txt delete mode 100644 tests/testdata/es/iso-8859-1.txt delete mode 100644 tests/testdata/es/iso-8859-15.txt delete mode 100644 tests/testdata/es/utf-8.txt delete mode 100644 tests/testdata/es/windows-1252.txt delete mode 100644 tests/testdata/et/iso-8859-13.txt delete mode 100644 tests/testdata/et/iso-8859-15.txt delete mode 100644 tests/testdata/et/iso-8859-4.txt delete mode 100644 tests/testdata/et/utf-8.txt delete mode 100644 tests/testdata/et/windows-1252.txt delete mode 100644 tests/testdata/et/windows-1257.txt delete mode 100644 tests/testdata/fi/iso-8859-1.txt delete mode 100644 tests/testdata/fi/utf-8.txt delete mode 100644 tests/testdata/fr/iso-8859-1.txt delete mode 100644 tests/testdata/fr/iso-8859-15.txt delete mode 100644 tests/testdata/fr/utf-16.be delete mode 100644 tests/testdata/fr/utf-32.le delete mode 100644 tests/testdata/fr/utf-8.txt delete mode 100644 tests/testdata/fr/windows-1252.txt delete mode 100644 tests/testdata/ga/iso-8859-1.txt delete mode 100644 tests/testdata/ga/utf-8.txt delete mode 100644 tests/testdata/ga/windows-1252.txt delete mode 100644 tests/testdata/he/iso-8859-8.txt delete mode 100644 tests/testdata/he/utf-8.txt delete mode 100644 tests/testdata/he/windows-1255.txt delete mode 100644 tests/testdata/hr/ibm852.txt delete mode 100644 tests/testdata/hr/iso-8859-13.txt delete mode 100644 tests/testdata/hr/iso-8859-16.txt delete mode 100644 tests/testdata/hr/iso-8859-2.txt delete mode 100644 tests/testdata/hr/maccentraleurope.txt delete mode 100644 tests/testdata/hr/utf-8.txt delete mode 100644 tests/testdata/hr/windows-1250.txt delete mode 100644 tests/testdata/hu/iso-8859-2.txt delete mode 100644 tests/testdata/hu/windows-1250.txt delete mode 100644 tests/testdata/it/iso-8859-1.txt delete mode 100644 tests/testdata/it/utf-8.txt delete mode 100644 tests/testdata/ja/euc-jp.txt delete mode 100644 tests/testdata/ja/iso-2022-jp.txt delete mode 100644 tests/testdata/ja/shift_jis.txt delete mode 100644 tests/testdata/ja/utf-16be.txt delete mode 100644 tests/testdata/ja/utf-16le.txt delete mode 100644 tests/testdata/ja/utf-8.txt delete mode 100644 tests/testdata/ko/iso-2022-kr.txt delete mode 100644 tests/testdata/ko/uhc.smi delete mode 100644 tests/testdata/ko/utf-16.le delete mode 100644 tests/testdata/ko/utf-32.be delete mode 100644 tests/testdata/ko/utf-8.txt delete mode 100644 tests/testdata/lt/iso-8859-10.txt delete mode 100644 tests/testdata/lt/iso-8859-13.txt delete mode 100644 tests/testdata/lt/iso-8859-4.txt delete mode 100644 tests/testdata/lt/utf-8.txt delete mode 100644 tests/testdata/lv/iso-8859-10.txt delete mode 100644 tests/testdata/lv/iso-8859-13.txt delete mode 100644 tests/testdata/lv/iso-8859-4.txt delete mode 100644 tests/testdata/lv/utf-8.txt delete mode 100644 tests/testdata/mt/iso-8859-3.txt delete mode 100644 tests/testdata/mt/utf-8.txt delete mode 100644 tests/testdata/pl/ibm852.txt delete mode 100644 tests/testdata/pl/iso-8859-13.txt delete mode 100644 tests/testdata/pl/iso-8859-16.txt delete mode 100644 tests/testdata/pl/iso-8859-2.txt delete mode 100644 tests/testdata/pl/maccentraleurope.txt delete mode 100644 tests/testdata/pl/utf-8.txt delete mode 100644 tests/testdata/pl/windows-1250.txt delete mode 100644 tests/testdata/pt/iso-8859-1.txt delete mode 100644 tests/testdata/pt/utf-8.txt delete mode 100644 tests/testdata/ro/ibm852.txt delete mode 100644 tests/testdata/ro/iso-8859-16.txt delete mode 100644 tests/testdata/ro/utf-8.txt delete mode 100644 tests/testdata/ro/windows-1250.txt delete mode 100644 tests/testdata/ru/ibm855.txt delete mode 100644 tests/testdata/ru/ibm866.txt delete mode 100644 tests/testdata/ru/iso-8859-5.txt delete mode 100644 tests/testdata/ru/koi8-r.txt delete mode 100644 tests/testdata/ru/maccyrillic.txt delete mode 100644 tests/testdata/ru/windows-1251.txt delete mode 100644 tests/testdata/sk/ibm852.txt delete mode 100644 tests/testdata/sk/iso-8859-2.txt delete mode 100644 tests/testdata/sk/maccentraleurope.txt delete mode 100644 tests/testdata/sk/utf-8.txt delete mode 100644 tests/testdata/sk/windows-1250.txt delete mode 100644 tests/testdata/sl/ibm852.txt delete mode 100644 tests/testdata/sl/iso-8859-16.txt delete mode 100644 tests/testdata/sl/iso-8859-2.txt delete mode 100644 tests/testdata/sl/maccentraleurope.txt delete mode 100644 tests/testdata/sl/utf-8.txt delete mode 100644 tests/testdata/sl/windows-1250.txt delete mode 100644 tests/testdata/sv/iso-8859-1.txt delete mode 100644 tests/testdata/sv/utf-8.txt delete mode 100644 tests/testdata/sv/windows-1252.txt delete mode 100644 tests/testdata/th/iso-8859-11.txt delete mode 100644 tests/testdata/th/tis-620.txt delete mode 100644 tests/testdata/th/utf-8.txt delete mode 100644 tests/testdata/tr/iso-8859-3.txt delete mode 100644 tests/testdata/tr/iso-8859-9.txt delete mode 100644 tests/testdata/vi/utf-8.txt delete mode 100644 tests/testdata/vi/viscii.txt delete mode 100644 tests/testdata/vi/windows-1258.txt delete mode 100644 tests/testdata/zh/big5.txt delete mode 100644 tests/testdata/zh/euc-tw.txt delete mode 100644 tests/testdata/zh/gb18030.txt delete mode 100644 tests/testdata/zh/utf-8.txt diff --git a/tests/test_1.py b/tests/test_1.py index 0e99996..dd39bf4 100644 --- a/tests/test_1.py +++ b/tests/test_1.py @@ -4,50 +4,67 @@ import cchardet SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) - -SKIP_LIST = [ - "testdata/ja/utf-16le.txt", - "testdata/ja/utf-16be.txt", - "testdata/es/iso-8859-15.txt", - "testdata/da/iso-8859-1.txt", - "testdata/he/iso-8859-8.txt", +TESTDATA_DIR = os.path.join(SCRIPT_DIR, "..", "src", "ext", "uchardet", "test") + +SKIP_LIST_DETECT = [ + "zh/gb18030.txt", + + # These are tests known to fail (not supported or not efficient + # enough). We will have to take a closer look and fix these, but + # there is no need to break the whole `make test` right now, + # which may make actual regressions harder to notice. + "ja/utf-16le.txt", + "ja/utf-16be.txt", + "es/iso-8859-15.txt", + "da/iso-8859-1.txt", + "he/iso-8859-8.txt", ] # Python can"t decode encoding -SKIP_LIST_02 = [ - "testdata/vi/viscii.txt", - "testdata/zh/euc-tw.txt", +SKIP_LIST_DEC = [ + "ka/georgian-academy.txt", + "ka/georgian-ps.txt", + "vi/viscii.txt", + "zh/euc-tw.txt", ] -SKIP_LIST_02.extend(SKIP_LIST) - +SKIP_LIST_DEC.extend(SKIP_LIST_DETECT) class TestCChardet: def test_ascii(self): detected_encoding = cchardet.detect(b"abcdefghijklmnopqrstuvwxyz") - assert "ascii" == detected_encoding["encoding"].lower(), "Expected %s, but got %s" % ( + got_enc = None + if detected_encoding["encoding"] is not None: + got_enc = detected_encoding["encoding"].lower() + assert "ascii" == got_enc, "Expected %s, but got %s" % ( "ascii", - detected_encoding["encoding"].lower(), + got_enc, ) def test_detect(self): - testfiles = glob.glob(SCRIPT_DIR + "/testdata/*/*.txt") + testfiles = glob.glob(TESTDATA_DIR + "/*/*.txt") for testfile in testfiles: - if any(testfile.replace("\\", "/").endswith(skip) for skip in SKIP_LIST): + if any(testfile.replace("\\", "/").endswith(skip) for skip in SKIP_LIST_DETECT): print("Skip: %s" % testfile) continue base = os.path.basename(testfile) expected_charset = os.path.splitext(base)[0] + expected_charset = expected_charset.split(".")[0] + if expected_charset == "mac-centraleurope": + expected_charset = "maccentraleurope" with open(testfile, "rb") as f: msg = f.read() detected_encoding = cchardet.detect(msg) print("Test %s: %s" % (testfile, detected_encoding)) - assert detected_encoding["encoding"] is not None, ( + got_enc = None + if detected_encoding["encoding"] is not None: + got_enc = detected_encoding["encoding"].lower() + assert got_enc is not None, ( 'Expected %s, but got None for "%s"' % (expected_charset.lower(), testfile) ) - assert expected_charset.lower() == detected_encoding["encoding"].lower(), ( + assert expected_charset.lower() == got_enc, ( 'Expected %s, but got %s for "%s"' - % (expected_charset.lower(), detected_encoding["encoding"].lower(), testfile) + % (expected_charset.lower(), got_enc, testfile) ) def test_detector(self): @@ -64,9 +81,12 @@ def test_detector(self): line = f.readline() detector.close() detected_encoding = detector.result - assert "shift_jis" == detected_encoding["encoding"].lower(), "Expected %s, but got %s" % ( + got_enc = None + if detected_encoding["encoding"] is not None: + got_enc = detected_encoding["encoding"].lower() + assert "shift_jis" == got_enc, "Expected %s, but got %s" % ( "shift_jis", - detected_encoding["encoding"].lower(), + got_enc, ) def test_github_issue_20(self): @@ -82,15 +102,16 @@ def test_github_issue_20(self): detector.close() def test_decode(self): - testfiles = glob.glob(SCRIPT_DIR + "/testdata/*/*.txt") + testfiles = glob.glob(TESTDATA_DIR + "/*/*.txt") for testfile in testfiles: - if any(testfile.replace("\\", "/").endswith(skip) for skip in SKIP_LIST_02): + if any(testfile.replace("\\", "/").endswith(skip) for skip in SKIP_LIST_DEC): print("Skip: %s" % testfile) continue with open(testfile, "rb") as f: msg = f.read() detected_encoding = cchardet.detect(msg) + print("Test %s: %s" % (testfile, detected_encoding)) try: msg.decode(detected_encoding["encoding"]) except LookupError as e: @@ -103,15 +124,10 @@ def test_decode(self): def test_utf8_with_bom(self): sample = b"\xef\xbb\xbf" detected_encoding = cchardet.detect(sample) - assert "utf-8-sig" == detected_encoding["encoding"].lower(), "Expected %s, but got %s" % ( - "utf-8-sig", - detected_encoding["encoding"].lower(), - ) - - def test_null_bytes(self): - sample = b"ABC\x00\x80\x81" - detected_encoding = cchardet.detect(sample) - - assert detected_encoding["encoding"] is None, ( - "Expected None, but got %s" % (detected_encoding["encoding"]) + got_enc = None + if detected_encoding["encoding"] is not None: + got_enc = detected_encoding["encoding"].lower() + assert "utf-8" == got_enc, "Expected %s, but got %s" % ( + "utf-8", + got_enc, ) diff --git a/tests/testdata/ar/iso-8859-6.txt b/tests/testdata/ar/iso-8859-6.txt deleted file mode 100644 index 4831b6a..0000000 --- a/tests/testdata/ar/iso-8859-6.txt +++ /dev/null @@ -1,3 +0,0 @@ --1256 - . -. 8859-6. diff --git a/tests/testdata/ar/utf-8.txt b/tests/testdata/ar/utf-8.txt deleted file mode 100644 index 37e9a11..0000000 --- a/tests/testdata/ar/utf-8.txt +++ /dev/null @@ -1,3 +0,0 @@ -ويندوز-1256 هي صفحة كود تستخدم في كتابة اللغة العربية عموماً وبعض اللغات الشبيهة -التي تستخدم نفس الأبجدية مثل الأردو والفارسية والكوردية. وذلك تحت نظام مايكروسوفت -ويندوز. صفحة الكود هذه لا تتوافق مع الأيزو 8859-6. diff --git a/tests/testdata/ar/windows-1256.txt b/tests/testdata/ar/windows-1256.txt deleted file mode 100644 index 15e257b..0000000 --- a/tests/testdata/ar/windows-1256.txt +++ /dev/null @@ -1,3 +0,0 @@ --1256 - . -. 8859-6. diff --git a/tests/testdata/bg/windows-1251.txt b/tests/testdata/bg/windows-1251.txt deleted file mode 100644 index 550b0a8..0000000 --- a/tests/testdata/bg/windows-1251.txt +++ /dev/null @@ -1,3 +0,0 @@ -Windows-1251 8- () , , , . - -Windows-1251 KOI8-R ( KOI8-U) - ISO 8859-5, . , . diff --git a/tests/testdata/cs/ibm852.txt b/tests/testdata/cs/ibm852.txt deleted file mode 100644 index db9f094..0000000 --- a/tests/testdata/cs/ibm852.txt +++ /dev/null @@ -1,4 +0,0 @@ -Led堟ek n (Alcedo atthis) je prmrn 16,5 cm velk ptk z eledi -led堟kovitch (Alcedinidae). Je velmi vrazn zbarven s oranovou spodinou a -modrm hbetem, kdly a temenem. Vraznm znakem je tak jeho npadn dlouh -zapiatl zobk. Pro sv krsn zbarven je nazvn Ltajc drahokam. diff --git a/tests/testdata/cs/iso-8859-2.txt b/tests/testdata/cs/iso-8859-2.txt deleted file mode 100644 index 95976c4..0000000 --- a/tests/testdata/cs/iso-8859-2.txt +++ /dev/null @@ -1,4 +0,0 @@ -Ledek n (Alcedo atthis) je prmrn 16,5 cm velk ptk z eledi -ledkovitch (Alcedinidae). Je velmi vrazn zbarven s oranovou spodinou a -modrm hbetem, kdly a temenem. Vraznm znakem je tak jeho npadn dlouh -zapiatl zobk. Pro sv krsn zbarven je nazvn Ltajc drahokam. diff --git a/tests/testdata/cs/maccentraleurope.txt b/tests/testdata/cs/maccentraleurope.txt deleted file mode 100644 index 4af3ef8..0000000 --- a/tests/testdata/cs/maccentraleurope.txt +++ /dev/null @@ -1,4 +0,0 @@ -Ledˇek ޒn (Alcedo atthis) je prmrn 16,5 cm velk ptk z eledi -ledˇkovitch (Alcedinidae). Je velmi vrazn zbarven s oranovou spodinou a -modrm hbetem, kޒdly a temenem. Vraznm znakem je tak jeho npadn dlouh -zapiatl zobk. Pro sv krsn zbarven je nazvn Ltajc drahokam. diff --git a/tests/testdata/cs/utf-8.txt b/tests/testdata/cs/utf-8.txt deleted file mode 100644 index 9b61d5e..0000000 --- a/tests/testdata/cs/utf-8.txt +++ /dev/null @@ -1,4 +0,0 @@ -Ledňáček říční (Alcedo atthis) je průměrně 16,5 cm velký pták z čeledi -ledňáčkovitých (Alcedinidae). Je velmi výrazně zbarvený s oranžovou spodinou a -modrým hřbetem, křídly a temenem. Výrazným znakem je také jeho nápadně dlouhý -zašpičatělý zobák. Pro své krásné zbarvení je nazýván Létající drahokam. diff --git a/tests/testdata/cs/windows-1250.txt b/tests/testdata/cs/windows-1250.txt deleted file mode 100644 index 43ac463..0000000 --- a/tests/testdata/cs/windows-1250.txt +++ /dev/null @@ -1,4 +0,0 @@ -Ledek n (Alcedo atthis) je prmrn 16,5 cm velk ptk z eledi -ledkovitch (Alcedinidae). Je velmi vrazn zbarven s oranovou spodinou a -modrm hbetem, kdly a temenem. Vraznm znakem je tak jeho npadn dlouh -zapiatl zobk. Pro sv krsn zbarven je nazvn Ltajc drahokam. diff --git a/tests/testdata/da/iso-8859-1.txt b/tests/testdata/da/iso-8859-1.txt deleted file mode 100644 index f36d4ab..0000000 --- a/tests/testdata/da/iso-8859-1.txt +++ /dev/null @@ -1,7 +0,0 @@ -Dansk er et nord-germansk sprog af den stnordiske (kontinentale) gruppe, der -tales af ca. seks millioner mennesker. Det er strkt pvirket af plattysk. Dansk -tales ogs i Sydslesvig (i Flensborg ca. 20 %) samt p Frerne og Grnland [1]. -Dansk er tt forbundet med norsk. Fra et sprogvidenskabeligt synspunkt kan den -fremherskende form af norsk, bokml (og i endnu hjere grad riksml), betragtes -som dansk, i hvert fald hvad skriftsproget angr. Bde dansk, norsk og svensk er -skandinaviske sprog og minder meget om hinanden. diff --git a/tests/testdata/da/iso-8859-15.txt b/tests/testdata/da/iso-8859-15.txt deleted file mode 100644 index c400e0a..0000000 --- a/tests/testdata/da/iso-8859-15.txt +++ /dev/null @@ -1,10 +0,0 @@ -Eurosymbolet eller eurotegnet () anvendes som valutasymbol for mntenheden -euro. Symbolsk kombinerer det et E eller et grsk epsilon med de to parallelle -streger, man ofte ser i valutasymboler. - -Det vides ikke med sikkerhed, hvem eurosymbolet blev designet af. Nogle medier -hvder, det blev skabt af tidligere designer ved EF Arthur Eisenmenger, mens -andre pstr, det blev skabt af en lille gruppe ledet af Alain Billiet. Muligvis -er ingen af disse forklaringer korrekte, da Den Paneuropiske Union udsendte en -'1 euro'-medalje i 1972, hvorp man kan se et symbol, der i hj grad ligner det -nuvrende eurosymbol. diff --git a/tests/testdata/da/utf-8.txt b/tests/testdata/da/utf-8.txt deleted file mode 100644 index e5e0274..0000000 --- a/tests/testdata/da/utf-8.txt +++ /dev/null @@ -1,10 +0,0 @@ -Eurosymbolet eller eurotegnet (€) anvendes som valutasymbol for møntenheden -euro. Symbolsk kombinerer det et E eller et græsk epsilon med de to parallelle -streger, man ofte ser i valutasymboler. - -Det vides ikke med sikkerhed, hvem eurosymbolet blev designet af. Nogle medier -hævder, det blev skabt af tidligere designer ved EF Arthur Eisenmenger, mens -andre påstår, det blev skabt af en lille gruppe ledet af Alain Billiet. Muligvis -er ingen af disse forklaringer korrekte, da Den Paneuropæiske Union udsendte en -'1 euro'-medalje i 1972, hvorpå man kan se et symbol, der i høj grad ligner det -nuværende eurosymbol. diff --git a/tests/testdata/da/windows-1252.txt b/tests/testdata/da/windows-1252.txt deleted file mode 100644 index db8faf1..0000000 --- a/tests/testdata/da/windows-1252.txt +++ /dev/null @@ -1,10 +0,0 @@ -Eurosymbolet eller eurotegnet () anvendes som valutasymbol for mntenheden -euro. Symbolsk kombinerer det et E eller et grsk epsilon med de to parallelle -streger, man ofte ser i valutasymboler. - -Det vides ikke med sikkerhed, hvem eurosymbolet blev designet af. Nogle medier -hvder, det blev skabt af tidligere designer ved EF Arthur Eisenmenger, mens -andre pstr, det blev skabt af en lille gruppe ledet af Alain Billiet. Muligvis -er ingen af disse forklaringer korrekte, da Den Paneuropiske Union udsendte en -'1 euro'-medalje i 1972, hvorp man kan se et symbol, der i hj grad ligner det -nuvrende eurosymbol. diff --git a/tests/testdata/de/iso-8859-1.txt b/tests/testdata/de/iso-8859-1.txt deleted file mode 100644 index 726a6c8..0000000 --- a/tests/testdata/de/iso-8859-1.txt +++ /dev/null @@ -1,11 +0,0 @@ -ISO 8859-1, genauer ISO/IEC 8859-1, auch bekannt als Latin-1, ist ein von der -ISO zuletzt 1998 aktualisierter Standard fr die Informationstechnik zur -Zeichenkodierung mit acht Bit und der erste Teil der Normenfamilie ISO/IEC 8859. - -Die mit sieben Bit kodierbaren Zeichen entsprechen US-ASCII mit fhrendem -Nullbit. Zustzlich zu den 95 darstellbaren ASCII-Zeichen (2016-7E16) kodiert -ISO 8859-1 96 weitere (A016-FF16), also insgesamt 191 von theoretisch mglichen -256 (= 28). Den Positionen 0016-1F16 und 7F16-9F16 sind in ISO/IEC 8859 und -damit ISO/IEC 8859-1 keine Zeichen zugewiesen. Die von der IANA definierte -Bezeichnung ISO-8859-1 (mit Bindestrich) steht fr die Kombination der Zeichen -dieser Norm mit nicht darstellbaren Steuerzeichen gem ISO/IEC 6429. diff --git a/tests/testdata/de/windows-1252.txt b/tests/testdata/de/windows-1252.txt deleted file mode 100644 index 7c51f46..0000000 --- a/tests/testdata/de/windows-1252.txt +++ /dev/null @@ -1,11 +0,0 @@ -ISO 8859-1, genauer ISO/IEC 8859-1, auch bekannt als Latin-1, ist ein von der -ISO zuletzt 1998 aktualisierter Standard fr die Informationstechnik zur -Zeichenkodierung mit acht Bit und der erste Teil der Normenfamilie ISO/IEC 8859. - -Die mit sieben Bit kodierbaren Zeichen entsprechen US-ASCII mit fhrendem -Nullbit. Zustzlich zu den 95 darstellbaren ASCII-Zeichen (20167E16) kodiert -ISO 8859-1 96 weitere (A016FF16), also insgesamt 191 von theoretisch mglichen -256 (= 28). Den Positionen 00161F16 und 7F169F16 sind in ISO/IEC 8859 und -damit ISO/IEC 8859-1 keine Zeichen zugewiesen. Die von der IANA definierte -Bezeichnung ISO-8859-1 (mit Bindestrich) steht fr die Kombination der Zeichen -dieser Norm mit nicht darstellbaren Steuerzeichen gem ISO/IEC 6429. diff --git a/tests/testdata/el/iso-8859-7.txt b/tests/testdata/el/iso-8859-7.txt deleted file mode 100644 index 9b036c6..0000000 --- a/tests/testdata/el/iso-8859-7.txt +++ /dev/null @@ -1,3 +0,0 @@ - ISO 8859-7, , 8- , ISO 8859. . - - 1987 ELOT 928, 1986. 2003, , . diff --git a/tests/testdata/el/utf-8.txt b/tests/testdata/el/utf-8.txt deleted file mode 100644 index 688779f..0000000 --- a/tests/testdata/el/utf-8.txt +++ /dev/null @@ -1,3 +0,0 @@ -Το UTF-8 (8-bit Unicode Transformation Format) είναι ένα μη-απωλεστικό σχήμα κωδικοποίησης χαρακτήρων μεταβλητού μήκους για το πρότυπο Unicode που δημιουργήθηκε από τους Ken Thompson και Rob Pike. Χρησιμοποιεί ομάδες από byte για να αναπαραστήσει τα κωδικά σημεία του Unicode. Είναι ιδιαίτερα χρήσιμο για μετάδοση δεδομένων σε 8bit συστήματα ηλεκτρονικού ταχυδρομείου. - -Συγκεκριμένα χρησιμοποιεί ένα μέχρι τέσσερα byte ανά χαρακτήρα ανάλογα με το σύμβολο και το κωδικό του σημείο. Για παράδειγμα χρειάζεται μόνο ένα byte του UTF-8 για την κωδικοποίηση των 128 ASCII χαρακτήρες στο διάστημα του Unicode U+0000 μέχρι U+007F. diff --git a/tests/testdata/el/windows-1253.txt b/tests/testdata/el/windows-1253.txt deleted file mode 100644 index 3ccac7b..0000000 --- a/tests/testdata/el/windows-1253.txt +++ /dev/null @@ -1,5 +0,0 @@ -Windows-1253 - - Windows-1253. ( Windows-1253) "A2". ( ), , . - - diff --git a/tests/testdata/en/ascii.txt b/tests/testdata/en/ascii.txt deleted file mode 100644 index 8ffb485..0000000 --- a/tests/testdata/en/ascii.txt +++ /dev/null @@ -1,4 +0,0 @@ -This is an ASCII TEST. -We still want uchardet to detect it as ASCII, even with the presence of -an escape character:  -Or with the HZ encoding escape sequence: ~{ diff --git a/tests/testdata/eo/iso-8859-3.txt b/tests/testdata/eo/iso-8859-3.txt deleted file mode 100644 index 9f35d8a..0000000 --- a/tests/testdata/eo/iso-8859-3.txt +++ /dev/null @@ -1,7 +0,0 @@ -Esperanto (origine Lingvo Internacia) estas la plej disvastigita internacia -planlingvo.[3] La nomo venas de la kanomo "Dr-o Esperanto", sub kiu la juda -kuracisto Ludoviko Lazaro Zamenhofo en la jaro 1887 publikigis la bazon de la -lingvo. La unua versio, la rusa, ricevis la cenzuran permeson disvastii en la -26-a de julio; i tiun daton oni konsideras la naskitago de Esperanto[4][5]. Li -intencis krei facile lerneblan netralan lingvon, tagan por uzo en la -internacia komunikado, tamen ne anstataigi aliajn, naciajn lingvojn. diff --git a/tests/testdata/es/iso-8859-1.txt b/tests/testdata/es/iso-8859-1.txt deleted file mode 100644 index f910c39..0000000 --- a/tests/testdata/es/iso-8859-1.txt +++ /dev/null @@ -1,5 +0,0 @@ -El precio medio de la vivienda nueva es de 2212 EUR/m2, segn datos de la Sociedad -de Tasacin a 31 de diciembre de 2012.156 El precio de la vivienda, sin embargo, -vara ostensiblemente en funcin de las comunidades autnomas y las capitales de -provincia, encontrndose la de mayor valor en Catalua (3146 EUR/m), y en -contraposicin las de Extremadura y Murcia (1271 EUR/m) diff --git a/tests/testdata/es/iso-8859-15.txt b/tests/testdata/es/iso-8859-15.txt deleted file mode 100644 index bc24d88..0000000 --- a/tests/testdata/es/iso-8859-15.txt +++ /dev/null @@ -1,5 +0,0 @@ -El precio medio de la vivienda nueva es de 2212 /m2, segn datos de la Sociedad -de Tasacin a 31 de diciembre de 2012.156 El precio de la vivienda, sin embargo, -vara ostensiblemente en funcin de las comunidades autnomas y las capitales de -provincia, encontrndose la de mayor valor en Catalua (3146 /m), y en -contraposicin las de Extremadura y Murcia (1271 /m) diff --git a/tests/testdata/es/utf-8.txt b/tests/testdata/es/utf-8.txt deleted file mode 100644 index 78d2fa7..0000000 --- a/tests/testdata/es/utf-8.txt +++ /dev/null @@ -1,5 +0,0 @@ -El precio medio de la vivienda nueva es de 2212 €/m2, según datos de la Sociedad -de Tasación a 31 de diciembre de 2012.156 El precio de la vivienda, sin embargo, -varía ostensiblemente en función de las comunidades autónomas y las capitales de -provincia, encontrándose la de mayor valor en Cataluña (3146 €/m²), y en -contraposición las de Extremadura y Murcia (1271 €/m²) diff --git a/tests/testdata/es/windows-1252.txt b/tests/testdata/es/windows-1252.txt deleted file mode 100644 index 9bccea0..0000000 --- a/tests/testdata/es/windows-1252.txt +++ /dev/null @@ -1,5 +0,0 @@ -El precio medio de la vivienda nueva es de 2212 /m2, segn datos de la Sociedad -de Tasacin a 31 de diciembre de 2012.156 El precio de la vivienda, sin embargo, -vara ostensiblemente en funcin de las comunidades autnomas y las capitales de -provincia, encontrndose la de mayor valor en Catalua (3146 /m), y en -contraposicin las de Extremadura y Murcia (1271 /m) diff --git a/tests/testdata/et/iso-8859-13.txt b/tests/testdata/et/iso-8859-13.txt deleted file mode 100644 index b4c8124..0000000 --- a/tests/testdata/et/iso-8859-13.txt +++ /dev/null @@ -1,6 +0,0 @@ -Anton Pavlovit Tehhov oli vene nite- ja novellikirjanik ning praktiseeriv arst. - -Tehhov on eelkige tuntud oma novellide poolest. Tema jutustuste tavaliseks -tegevuspaigaks olid vene vikeasulad ja need ksitlesid hingeksildust, raisatud -nne jms. Tuntud on ka tema pshholoogilised nidendid, kus valitseb kurb ja -lootusetu meeleolu. diff --git a/tests/testdata/et/iso-8859-15.txt b/tests/testdata/et/iso-8859-15.txt deleted file mode 100644 index fc0509a..0000000 --- a/tests/testdata/et/iso-8859-15.txt +++ /dev/null @@ -1,6 +0,0 @@ -Anton Pavlovit Tehhov oli vene nite- ja novellikirjanik ning praktiseeriv arst. - -Tehhov on eelkige tuntud oma novellide poolest. Tema jutustuste tavaliseks -tegevuspaigaks olid vene vikeasulad ja need ksitlesid hingeksildust, raisatud -nne jms. Tuntud on ka tema pshholoogilised nidendid, kus valitseb kurb ja -lootusetu meeleolu. diff --git a/tests/testdata/et/iso-8859-4.txt b/tests/testdata/et/iso-8859-4.txt deleted file mode 100644 index d5532b0..0000000 --- a/tests/testdata/et/iso-8859-4.txt +++ /dev/null @@ -1,6 +0,0 @@ -Anton Pavlovit Tehhov oli vene nite- ja novellikirjanik ning praktiseeriv arst. - -Tehhov on eelkige tuntud oma novellide poolest. Tema jutustuste tavaliseks -tegevuspaigaks olid vene vikeasulad ja need ksitlesid hingeksildust, raisatud -nne jms. Tuntud on ka tema pshholoogilised nidendid, kus valitseb kurb ja -lootusetu meeleolu. diff --git a/tests/testdata/et/utf-8.txt b/tests/testdata/et/utf-8.txt deleted file mode 100644 index d68c9d3..0000000 --- a/tests/testdata/et/utf-8.txt +++ /dev/null @@ -1,6 +0,0 @@ -Anton Pavlovitš Tšehhov oli vene näite- ja novellikirjanik ning praktiseeriv arst. - -Tšehhov on eelkõige tuntud oma novellide poolest. Tema jutustuste tavaliseks -tegevuspaigaks olid vene väikeasulad ja need käsitlesid hingeüksildust, raisatud -õnne jms. Tuntud on ka tema psühholoogilised näidendid, kus valitseb kurb ja -lootusetu meeleolu. diff --git a/tests/testdata/et/windows-1252.txt b/tests/testdata/et/windows-1252.txt deleted file mode 100644 index 597e28e..0000000 --- a/tests/testdata/et/windows-1252.txt +++ /dev/null @@ -1,6 +0,0 @@ -Anton Pavlovit Tehhov oli vene nite- ja novellikirjanik ning praktiseeriv arst. - -Tehhov on eelkige tuntud oma novellide poolest. Tema jutustuste tavaliseks -tegevuspaigaks olid vene vikeasulad ja need ksitlesid hingeksildust, raisatud -nne jms. Tuntud on ka tema pshholoogilised nidendid, kus valitseb kurb ja -lootusetu meeleolu. diff --git a/tests/testdata/et/windows-1257.txt b/tests/testdata/et/windows-1257.txt deleted file mode 100644 index 64d3327..0000000 --- a/tests/testdata/et/windows-1257.txt +++ /dev/null @@ -1,6 +0,0 @@ -Anton Pavlovit Tehhov oli vene nite- ja novellikirjanik ning praktiseeriv arst. - -Tehhov on eelkige tuntud oma novellide poolest. Tema jutustuste tavaliseks -tegevuspaigaks olid vene vikeasulad ja need ksitlesid hingeksildust, raisatud -nne jms. Tuntud on ka tema pshholoogilised nidendid, kus valitseb kurb ja -lootusetu meeleolu. diff --git a/tests/testdata/fi/iso-8859-1.txt b/tests/testdata/fi/iso-8859-1.txt deleted file mode 100644 index 3d584ff..0000000 --- a/tests/testdata/fi/iso-8859-1.txt +++ /dev/null @@ -1,8 +0,0 @@ -Termi science fiction on amerikkalaisen tieteislehtien toimittajan Hugo -Gernsbackin keksim. Suomessa termin tieteiskirjallisuus loi tohtori Eino -Kauppinen 1950-luvun alkupuolella. -Tieteiskirjallisuudelle on laadittu erilaisia mritelmi. Tieteiskirjallisuuden -rajat eivt ole yksiselitteisen selket. Tieteiskirjallisuus lhenee monia -kirjallisuudenlajeja, erityisesti kauhu- ja fantasiakirjallisuutta. Nill -kolmella lajilla onkin yhteiset juuret 1800-lukua edeltvss ei-realistisessa -kirjallisuudessa. diff --git a/tests/testdata/fi/utf-8.txt b/tests/testdata/fi/utf-8.txt deleted file mode 100644 index ca98172..0000000 --- a/tests/testdata/fi/utf-8.txt +++ /dev/null @@ -1,8 +0,0 @@ -Termi science fiction on amerikkalaisen tieteislehtien toimittajan Hugo -Gernsbackin keksimä. Suomessa termin tieteiskirjallisuus loi tohtori Eino -Kauppinen 1950-luvun alkupuolella. -Tieteiskirjallisuudelle on laadittu erilaisia määritelmiä. Tieteiskirjallisuuden -rajat eivät ole yksiselitteisen selkeät. Tieteiskirjallisuus lähenee monia -kirjallisuudenlajeja, erityisesti kauhu- ja fantasiakirjallisuutta. Näillä -kolmella lajilla onkin yhteiset juuret 1800-lukua edeltävässä ei-realistisessa -kirjallisuudessa. diff --git a/tests/testdata/fr/iso-8859-1.txt b/tests/testdata/fr/iso-8859-1.txt deleted file mode 100644 index 9dfee03..0000000 --- a/tests/testdata/fr/iso-8859-1.txt +++ /dev/null @@ -1,5 +0,0 @@ -La norme ISO 8859-1, dont le nom complet est ISO/CEI 8859-1, et qui est souvent appele Latin-1 ou Europe occidentale, forme la premire partie de la norme internationale ISO/CEI 8859, qui est une norme de l'Organisation internationale de normalisation pour le codage des caractres en informatique. - -Elle dfinit ce qu'elle appelle l'alphabet latin numro 1, qui consiste en 191 caractres de l'alphabet latin, chacun d'entre eux tant cod par un octet (soit 8 bits). ISO 8859-1 reprend le codage des caractres imprimables d'US-ASCII. - -Dans les pays occidentaux, cette norme tait utilise par de nombreux systmes d'exploitation, dont UNIX, Windows ou AmigaOS. Elle a donn lieu quelques extensions et adaptations, dont Windows-1252 et ISO 8859-15. La distinction entre ASCII, ISO 8859-1, ISO 8859-15, Windows-1252 et MacRoman est une source de confusion parmi les dveloppeurs de programmes informatiques. Le Multinational Character Set cr par Digital Equipment Corporation pour le terminal informatique VT220 est considr comme la fois l'anctre de l'ISO 8859-1 et de l'Unicode2. Aujourd'hui, son utilisation tend dcrotre au profit de l'Unicode. diff --git a/tests/testdata/fr/iso-8859-15.txt b/tests/testdata/fr/iso-8859-15.txt deleted file mode 100644 index dc76246..0000000 --- a/tests/testdata/fr/iso-8859-15.txt +++ /dev/null @@ -1,16 +0,0 @@ -L'uf de volaille est un produit agricole servant d'ingrdient entrant dans la -composition de nombreux plats, dans de nombreuses cultures gastronomiques du -monde. - -Le plus utilis est l'uf de poule, mais les ufs d'autres oiseaux sont aussi -consomms : caille, cane, oie, autruche, etc. Les ufs de poissons, comme le -caviar, ou de certains reptiles, comme ceux de l'iguane vert, sont galement -utiliss dans l'alimentation humaine. Cependant, leur utilisation est trs -diffrente de celle des ufs de volaille. - -Les ufs utiliss en cuisine ne sont gnralement pas fconds du fait de leur -provenance d'levages industriels o les coqs sont absents. Fconds ou non, ils -sont utiliss l'tat frais si moins de vingt-huit jours se sont couls aprs -la ponte, selon les normes administratives franaises. Dans les usages -culinaires asiatiques, les ufs sont parfois consomms couvs, comme le balut, -ou mis fermenter pendant plusieurs semaines, comme l'uf de cent ans. diff --git a/tests/testdata/fr/utf-16.be b/tests/testdata/fr/utf-16.be deleted file mode 100644 index 14330f0cfefe9085c6a429a1d7052dfedc1c8a83..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1080 zcmaKrQEtLO3`G5%Q|wm-QiBSuQ2#6S0>A-62sCOSC<#z5-}cRh5FoTduwG-2J!9|p zkIigq?-tt7#^$W&&(P+UTE%^48?4xJTUf$zX?=a}DRGVn z2f_!Za%18XEXVKWgq&ind{ksxV&<0lYB^PcJ8=>JXT%Tfk?+t(oKN)sf^SO?C7<++ z(ygtoRP&bK0MC?8#kAX<*aWusJkoQQq3>1wS7bht8F|*NX6i`QD({yH*Z2wAIy(>} z1n0F>gtT<{4p`zkW=hRmTJ#;r$}DX_cRi|S*vcrWBb+yGYp-iTr2Z3gX?x4R%UVj+ zlWyktA6C6duU>^_qaHvA?U@<%=`Xn1H*M9PZ$>{`PARd3x2mY3I!!nWB2p|Vvc&rG zv7uLOankKtq3--XLCeVyW<8@B+Dzzfgr|4n8E})%M^L3+{Q}fufp$_?igveoLPQ0} zdVaCbt>4fbJ0^4OeK80%Bh6MgdS`kgfAPBP=iY13F6=;+c48>JmVL8pidp&ny8j+3 UG9A1XU%^#~N9=mPF8Q-(zi+C$00000 diff --git a/tests/testdata/fr/utf-32.le b/tests/testdata/fr/utf-32.le deleted file mode 100644 index 13d1139a675bcbc32b06290889b4b832067acc53..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1356 zcmb7^J5EF~3`JA313E7NMuMrRK~$8(0{DZ(|0kA92eSkk;2bGeiV%Z_;wVmHJJ-(3 z`}%BczqK}lU08=HY(l{aJisFqJi&Fhj~H*@7!KeJ^e=ENvCj3?=f025&H3KfMqgv+ za0(aTd5-ERxP|0$uKxIpnxaLGA5h@_IT(Kj))T*FG*SX$XN>)@hw1w>wtMOu zzX!E?2gYc7hTp64`seKNyD8SKX9mB$C%1t9p4!Z59&^|S`<8yVRxrLPYe{YDjJ7Fu!8WMHSbH(*$~opsocnpZu3494 s-%&mGr$D{->F2*A{$r1F7jO7SQ(>D+?cTMytB+Z?J~cWw?*#12H(bDebN~PV diff --git a/tests/testdata/fr/utf-8.txt b/tests/testdata/fr/utf-8.txt deleted file mode 100644 index d786ca3..0000000 --- a/tests/testdata/fr/utf-8.txt +++ /dev/null @@ -1,14 +0,0 @@ -UTF-8 (abréviation de l’anglais Universal Character Set Transformation Format - -8 bits) est un codage de caractères informatiques conçu pour coder l’ensemble -des caractères du « répertoire universel de caractères codés », initialement -développé par l’ISO dans la norme internationale ISO/CEI 10646, aujourd’hui -totalement compatible avec le standard Unicode, en restant compatible avec la -norme ASCII limitée à l’anglais de base (et quelques autres langues beaucoup -moins fréquentes), mais très largement répandue depuis des décennies. - -L’UTF-8 est utilisé par 82,2 % des sites web en décembre 20141. De par sa -nature, UTF-8 est d’un usage de plus en plus courant sur Internet, et dans les -systèmes devant échanger de l'information. Il s’agit également du codage le plus -utilisé dans les systèmes GNU, Linux et compatibles pour gérer le plus -simplement possible des textes et leurs traductions dans tous les systèmes -d’écritures et tous les alphabets du monde. diff --git a/tests/testdata/fr/windows-1252.txt b/tests/testdata/fr/windows-1252.txt deleted file mode 100644 index 5f27c56..0000000 --- a/tests/testdata/fr/windows-1252.txt +++ /dev/null @@ -1,3 +0,0 @@ -Luf de volaille est un produit agricole servant d'ingrdient entrant dans la -composition de nombreux plats, dans de nombreuses cultures gastronomiques du -monde. diff --git a/tests/testdata/ga/iso-8859-1.txt b/tests/testdata/ga/iso-8859-1.txt deleted file mode 100644 index f062a67..0000000 --- a/tests/testdata/ga/iso-8859-1.txt +++ /dev/null @@ -1,6 +0,0 @@ -Ag seo tarma seoltireachta a bhaineann le longa adhmaid agus le bid. - -N bhodh de cheangal idir ire agus tortha eile ach na longa, agus t ire -fin ln de lochanna agus d'aibhneacha. Fgann seo go bhfuil an teanga breac le -tarmaocht seoltireachta agus loingseoireachta agus cuid di tugtha isteach n -Lochlainnis agus n mBarla tr lonnaitheoir n iasacht. diff --git a/tests/testdata/ga/utf-8.txt b/tests/testdata/ga/utf-8.txt deleted file mode 100644 index 33cc012..0000000 --- a/tests/testdata/ga/utf-8.txt +++ /dev/null @@ -1,6 +0,0 @@ -Ag seo téarmaí seoltóireachta a bhaineann le longa adhmaid agus le báid. - -Ní bhíodh de cheangal idir Éire agus tíortha eile ach na longa, agus tá Éire -féin lán de lochanna agus d’aibhneacha. Fágann seo go bhfuil an teanga breac le -téarmaíocht seoltóireachta agus loingseoireachta agus cuid di tugtha isteach ón -Lochlainnis agus ón mBéarla trí lonnaitheoirí ón iasacht. diff --git a/tests/testdata/ga/windows-1252.txt b/tests/testdata/ga/windows-1252.txt deleted file mode 100644 index 1a97dae..0000000 --- a/tests/testdata/ga/windows-1252.txt +++ /dev/null @@ -1,6 +0,0 @@ -Ag seo tarma seoltireachta a bhaineann le longa adhmaid agus le bid. - -N bhodh de cheangal idir ire agus tortha eile ach na longa, agus t ire -fin ln de lochanna agus daibhneacha. Fgann seo go bhfuil an teanga breac le -tarmaocht seoltireachta agus loingseoireachta agus cuid di tugtha isteach n -Lochlainnis agus n mBarla tr lonnaitheoir n iasacht. diff --git a/tests/testdata/he/iso-8859-8.txt b/tests/testdata/he/iso-8859-8.txt deleted file mode 100644 index bc78eee..0000000 --- a/tests/testdata/he/iso-8859-8.txt +++ /dev/null @@ -1,2 +0,0 @@ - ISO 8859 , , : - 0xA0 0xBF , , . diff --git a/tests/testdata/he/utf-8.txt b/tests/testdata/he/utf-8.txt deleted file mode 100644 index 1fbe418..0000000 --- a/tests/testdata/he/utf-8.txt +++ /dev/null @@ -1,3 +0,0 @@ -(ראשי תיבות של 8‎-bit Unicode Transformation Format או 8‎-bit UCS Transformation Format) הוא קידוד תווים באורך משתנה ליוניקוד, שנוצר על ידי רוב פייק וקן תומפסון. ניתן לקודד בו כל תו המצוי בתקן יוניקוד על ידי שימוש באחד עד ארבעה בתים, תלוי בתו. הקידוד ב-UTF-8 מעניק את כל יתרונות השימוש בקידוד ליוניקוד ומוסיף עליהם, בין היתר, גם חיסכון בזיכרון, עמידות בפני איבוד או השחתת בתים ותאימות לאחור ל-ASCII. ה-IETF מעדיף בבירור את UTF-8 ומחייב כל פרוטוקול אינטרנט לתמוך בו, וכן קונסורציום הדואר האלקטרוני, ה-IMC, ממליץ שכל תוכנת דואר אלקטרוני תוכל להציג וליצור דואר באמצעות UTF-8. - - diff --git a/tests/testdata/he/windows-1255.txt b/tests/testdata/he/windows-1255.txt deleted file mode 100644 index d915bd4..0000000 --- a/tests/testdata/he/windows-1255.txt +++ /dev/null @@ -1 +0,0 @@ - , -, , . diff --git a/tests/testdata/hr/ibm852.txt b/tests/testdata/hr/ibm852.txt deleted file mode 100644 index 98f5138..0000000 --- a/tests/testdata/hr/ibm852.txt +++ /dev/null @@ -1,4 +0,0 @@ -Brekinja (lat. Sorbus torminalis) je bjelogorina vrsta drvea iz porodice -Rosaceae. -Prirodno je rasprostranjena u zapadnoj, srednjoj i junoj Europi, sjevernoj -Africi, Krimu, Maloj Aziji, Kavkazu i Transkavkaziji. diff --git a/tests/testdata/hr/iso-8859-13.txt b/tests/testdata/hr/iso-8859-13.txt deleted file mode 100644 index 124b1c0..0000000 --- a/tests/testdata/hr/iso-8859-13.txt +++ /dev/null @@ -1,4 +0,0 @@ -Brekinja (lat. Sorbus torminalis) je bjelogorina vrsta drvea iz porodice -Rosaceae. -Prirodno je rasprostranjena u zapadnoj, srednjoj i junoj Europi, sjevernoj -Africi, Krimu, Maloj Aziji, Kavkazu i Transkavkaziji. diff --git a/tests/testdata/hr/iso-8859-16.txt b/tests/testdata/hr/iso-8859-16.txt deleted file mode 100644 index a10a108..0000000 --- a/tests/testdata/hr/iso-8859-16.txt +++ /dev/null @@ -1,4 +0,0 @@ -Brekinja (lat. Sorbus torminalis) je bjelogorina vrsta drvea iz porodice -Rosaceae. -Prirodno je rasprostranjena u zapadnoj, srednjoj i junoj Europi, sjevernoj -Africi, Krimu, Maloj Aziji, Kavkazu i Transkavkaziji. diff --git a/tests/testdata/hr/iso-8859-2.txt b/tests/testdata/hr/iso-8859-2.txt deleted file mode 100644 index 7c33320..0000000 --- a/tests/testdata/hr/iso-8859-2.txt +++ /dev/null @@ -1,4 +0,0 @@ -Brekinja (lat. Sorbus torminalis) je bjelogorina vrsta drvea iz porodice -Rosaceae. -Prirodno je rasprostranjena u zapadnoj, srednjoj i junoj Europi, sjevernoj -Africi, Krimu, Maloj Aziji, Kavkazu i Transkavkaziji. diff --git a/tests/testdata/hr/maccentraleurope.txt b/tests/testdata/hr/maccentraleurope.txt deleted file mode 100644 index 27c41e0..0000000 --- a/tests/testdata/hr/maccentraleurope.txt +++ /dev/null @@ -1,4 +0,0 @@ -Brekinja (lat. Sorbus torminalis) je bjelogorina vrsta drvea iz porodice -Rosaceae. -Prirodno je rasprostranjena u zapadnoj, srednjoj i junoj Europi, sjevernoj -Africi, Krimu, Maloj Aziji, Kavkazu i Transkavkaziji. diff --git a/tests/testdata/hr/utf-8.txt b/tests/testdata/hr/utf-8.txt deleted file mode 100644 index 92cc1dd..0000000 --- a/tests/testdata/hr/utf-8.txt +++ /dev/null @@ -1,4 +0,0 @@ -Brekinja (lat. Sorbus torminalis) je bjelogorična vrsta drveća iz porodice -Rosaceae. -Prirodno je rasprostranjena u zapadnoj, srednjoj i južnoj Europi, sjevernoj -Africi, Krimu, Maloj Aziji, Kavkazu i Transkavkaziji. diff --git a/tests/testdata/hr/windows-1250.txt b/tests/testdata/hr/windows-1250.txt deleted file mode 100644 index 60d8c98..0000000 --- a/tests/testdata/hr/windows-1250.txt +++ /dev/null @@ -1,4 +0,0 @@ -Brekinja (lat. Sorbus torminalis) je bjelogorina vrsta drvea iz porodice -Rosaceae. -Prirodno je rasprostranjena u zapadnoj, srednjoj i junoj Europi, sjevernoj -Africi, Krimu, Maloj Aziji, Kavkazu i Transkavkaziji. diff --git a/tests/testdata/hu/iso-8859-2.txt b/tests/testdata/hu/iso-8859-2.txt deleted file mode 100644 index cb0dc11..0000000 --- a/tests/testdata/hu/iso-8859-2.txt +++ /dev/null @@ -1,3 +0,0 @@ -Az ISO 8859-2 (hivatalosan ISO/IEC 8859-2, rviden s nem hivatalosan Latin-2) az ISO/IEC 8859-es karakterkdolsi szabvny msodik rsze. Az ISO ltal kettes szm latin bcnek nevezett 191 karakter mindegyiknek egybjtos (nyolcbites) kdjt adja meg. A 191 karakter kztt minden magyar kezetes bet megtallhat (a sok ms kszletbl hinyz s is). - -Az ISO_8859-2:1987 (mime rvidtsbl ismertebb nevn ISO-8859-2 (az "ISO" utn ktjellel)) az IANA-nak erre a szabvnyra pl karakterkszletnek neve, melyben a C0 (0x00-0x1F) s a C1 (0x80-0x9F) rsz az ISO/IEC 6429-ben meghatrozott vezrlkdokat tartalmazza. Az ISO/IEC 6429-ben s 2022-ben megadott escape szekvencikat nem hasznlja. Tovbbi ismert nevei: ISO_8859-2, latin2, l2 s csISOLatin2. diff --git a/tests/testdata/hu/windows-1250.txt b/tests/testdata/hu/windows-1250.txt deleted file mode 100644 index ada5ffb..0000000 --- a/tests/testdata/hu/windows-1250.txt +++ /dev/null @@ -1 +0,0 @@ -Jellemz r az els sztagra es hangsly (ebben a finnugor nyelvek s a szlovk nyelv hasonltanak hozz), a magnhangz-harmnia (barnulsotokrl zldlsetekrl), valamint a magnhangz-hosszsg s a hangsly egymstl fggetlen volta (amely szinte egyedliknt lehetv teszi az antik Idmrtkes versels alkalmazst). Hangrendszerre ezenkvl a lgy mssalhangzk (ny, ty, gy), az aspirlatlan zrhangok (h nlkl ejtett p, t, k, szemben pldul a germn nyelvekkel) s a palatlis magnhangzk eltti kemny mssalhangzk jelenlte jellemz (azaz lehetsges ne, ti stb. hangkapcsolat, nye, tyi helyett; szemben pldul az orosszal). Nincsenek benne valdi diftongusok (mint pldul a finnben vagy nmetben) s reduklt, vagyis elnyelt magnhangzk (mint pldul az angolban, nmetben). A specilis magyar a hang (mely a svdben s a perzsban is megvan) nehzsget okozhat a nyelvnket tanulknak. diff --git a/tests/testdata/it/iso-8859-1.txt b/tests/testdata/it/iso-8859-1.txt deleted file mode 100644 index 0afbc36..0000000 --- a/tests/testdata/it/iso-8859-1.txt +++ /dev/null @@ -1,18 +0,0 @@ -L'architettura longobarda costituita dall'insieme delle opere architettoniche -realizzate in Italia durante il regno dei Longobardi (568-774), con residuale -permanenza nell'Italia meridionale fino al X-XI secolo (Langobardia Minor), e -commissionate dai re e dai duchi longobardi. -L'attivit architettonica sviluppata in Langobardia Maior andata in gran parte -perduta, per lo pi a causa di successive ricostruzioni degli edifici sacri e -profani eretti tra VII e VIII secolo. A parte il Tempietto longobardo di -Cividale del Friuli, rimasto in gran parte intatto, gli edifici civili e -religiosi di Pavia, Monza o altre localit sono stati ampiamente rimaneggiati -nei secoli seguenti. Ancora integre rimangono cos soltanto poche architetture, -o perch inglobate negli ampliamenti successivi - come la chiesa di San -Salvatore a Brescia) -, o perch periferiche e di modeste dimensioni - come la -chiesa di Santa Maria foris portas a Castelseprio. Testimonianze maggiormente -fedeli alla forma originale si ritrovano, invece, nella Langobardia Minor: a -Benevento si conservano la chiesa di Santa Sofia, un ampio tratto delle Mura e -la Rocca dei Rettori, unici esempi superstiti di architettura militare -longobarda, mentre altre testimonianze si sono conservate in centri minori del -ducato beneventano e a Spoleto. diff --git a/tests/testdata/it/utf-8.txt b/tests/testdata/it/utf-8.txt deleted file mode 100644 index cefd085..0000000 --- a/tests/testdata/it/utf-8.txt +++ /dev/null @@ -1,18 +0,0 @@ -L'architettura longobarda è costituita dall'insieme delle opere architettoniche -realizzate in Italia durante il regno dei Longobardi (568-774), con residuale -permanenza nell'Italia meridionale fino al X-XI secolo (Langobardia Minor), e -commissionate dai re e dai duchi longobardi. -L'attività architettonica sviluppata in Langobardia Maior è andata in gran parte -perduta, per lo più a causa di successive ricostruzioni degli edifici sacri e -profani eretti tra VII e VIII secolo. A parte il Tempietto longobardo di -Cividale del Friuli, rimasto in gran parte intatto, gli edifici civili e -religiosi di Pavia, Monza o altre località sono stati ampiamente rimaneggiati -nei secoli seguenti. Ancora integre rimangono così soltanto poche architetture, -o perché inglobate negli ampliamenti successivi - come la chiesa di San -Salvatore a Brescia) -, o perché periferiche e di modeste dimensioni - come la -chiesa di Santa Maria foris portas a Castelseprio. Testimonianze maggiormente -fedeli alla forma originale si ritrovano, invece, nella Langobardia Minor: a -Benevento si conservano la chiesa di Santa Sofia, un ampio tratto delle Mura e -la Rocca dei Rettori, unici esempi superstiti di architettura militare -longobarda, mentre altre testimonianze si sono conservate in centri minori del -ducato beneventano e a Spoleto. diff --git a/tests/testdata/ja/euc-jp.txt b/tests/testdata/ja/euc-jp.txt deleted file mode 100644 index 4bd0c16..0000000 --- a/tests/testdata/ja/euc-jp.txt +++ /dev/null @@ -1,10 +0,0 @@ -Extended Unix Code(EUC)ϡUNIXǤ褯Ȥʸɤ沽Ǥ롣 - - ܸEUC - JIS X 0208١ (EUC-JP) - JIS X 0213١ (EUC-JIS-2004) - ڹEUC (EUC-KR) - λEUC (EUC-CN) - λEUC (EUC-TW) - -ʤɤ롣 diff --git a/tests/testdata/ja/iso-2022-jp.txt b/tests/testdata/ja/iso-2022-jp.txt deleted file mode 100644 index c0dc891..0000000 --- a/tests/testdata/ja/iso-2022-jp.txt +++ /dev/null @@ -1,8 +0,0 @@ -ISO/IEC 2022$B!J5l>N(B ISO 2022$B!K$O!"(B - - $BJ8;z=89g$r(B7$B%S%C%HId9f$^$?$O(B8$B%S%C%HId9f$GI=8=$9$k$?$a$N5;=Q!"$*$h$S(B - $BJ#?t$NJ8;z=89g$rC10l$NJ8;zId9f2=J}<0$K4^$a$k5;=Q(B - -$B$r5,Dj$9$k(BISO$B5,3J$G$"$k!#(BJIS$B$NBP1~5,3J$O(BJIS X 0202 $B!V>pJs5;=Q(B-$BJ8;zId9f$N9=B$5Z$S3HD%K!!W(B[1]$B!#(BEcma International$B$NBP1~5,3J$O(BECMA-35$B!#(B - -ISO/IEC 2022 $B$NId9f2=J}<0$O!"0lHL$K!"(B1$BJ8;z$K(B1$B%P%$%H$+(B2$B%P%$%H0J>e$r;H$&2DJQD9$NJ8;zId9f2=J}<0$G$"$k!#$$$/$D$+$NId9f2=I=8=$,(BISO/IEC 2022$B$N5!9=$r;H$C$F$$$k!#$?$H$($P!"(BISO-2022-JP$B$OF|K\8l$G9-$/;H$o$l$F$$$kId9f2=I=8=$G$"$j!"$$$o$f$k!V(BJIS$B%3!<%I!W$H$$$&$N$b$3$l$r;X$9$3$H$,0lHLE*$G$"$k!#(B diff --git a/tests/testdata/ja/shift_jis.txt b/tests/testdata/ja/shift_jis.txt deleted file mode 100644 index a580281..0000000 --- a/tests/testdata/ja/shift_jis.txt +++ /dev/null @@ -1 +0,0 @@ -{{{{{{{{{{{{{{{{{{{ diff --git a/tests/testdata/ja/utf-16be.txt b/tests/testdata/ja/utf-16be.txt deleted file mode 100644 index 7703c2f3816a1c3e7c3a1d97b8d6c388925a4c84..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1416 zcma)6J7^S96usGMGZ@mPu$oE~44&1H!ZJTeoNOYqOAIMivzeWVYu@U{EkqC!5R=tH zB4THuDQpBGRayl z1FJI08Z)diDdVi?@B~p~+b>r6^xA5ESn!|%RbJCodtPzU1 zZ;&69T~*7a{zBi)@1~3Y*V(I$RnKbV$gEs7ss_~6rG^zg&r#i)G~^oB5Ty7_If7BE zZQDKY+Q?c8IAk{EujEWl>j#f4(sTHT*LVkI^t?DT4Nr2^@HR^9@3dAkQ9t1ZJ~{Gm z#5xvzPzC-PxiDzqockS~oIl!>=99;ZaaKlTT4v+k3wNMQU7~K)GL=*cMYxe@5N8dp zmw1k!c#CIv%J&s)?$N!=x9n7_F-X)=eyjZ1;-N1y7DN8q{J=gKbdO5T@RVhujR6@} zoHF;P!?EV%1GdT74)4NyTtd{`-u#+4$EW{2<9@PBrWEhm1gbKWoKcKLST{RLSbOyF zK@npu-l&yUE3I*r^~X@rO8YwtZ&1~%&h67TG|sO7kh65ks6=#weBI9dCpNyO*Lg`OAfey#Rq5-2)i&GMHxqCF1`0_Ttjd?Jd7Z$*NC lP*`S?$zdIK7zCD=G;XN;C-R=^gdQ00?oB*KtXdgy{sNy;TD<@O diff --git a/tests/testdata/ja/utf-16le.txt b/tests/testdata/ja/utf-16le.txt deleted file mode 100644 index b04c2f83825950e614af4cd272424370a9a78fa7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1416 zcmb7EJ!n%=82y?lc^I;FaLObK7Efw%a7=5Z^)*#qLM;gNq<^a#iqW69g9*LEIGwL4@bI?_)l3{a*?q$sWaA+RlZ&0jwWMyo)E|K(OE|r4ba30k|)ig zgEh>s#yqRc$SKydm;u(G;8!u@(2bwYsI%)tg#b19DB=!MtgAcQi@+zUBqr-Wx>NAlBSZLg$9??fbj{)4IkOU+=~Dw#a5qAp zWG8vLLbZ`|v*mBsbr=7xGpvnK&uC=HS-Bjk>QUD`HLUaZG}UcNTdr^oequfc9ijWw z+Q#m&cailoEGj#=!{ii1>t{1td+q~X;Vo38`FUq<-e$Chn{avlI9Gnp4SjALa%AHG zjAOIR%$5}jZO`vl{k3q^Ke#B}N8ROz>98ns>@pMHI;WuPYSh|0x|XiQ++7qtf{v+^ z4mGalc#0o*gC}^*=Ot_o>7MkhQDaQT>HDqHr>en|2~#i_zLrK8_hQgJDja8)VIszm zV#V1Zjx{Imu}#KyI1BG^fsWtqe%Umr9lqI-`!%}`&+T9u4NUZM6k`E4q8%>QasNT( z-qh~A7_|m9Zm|9o>XFhr&g-Q{w2G%sPiQ*)zHyRHnUsJ|(sx9a_kzb&p0llbH^>TC zKHbp2C-k%$_O+8WQ(YoH!|6?WD!vI69kI@i;_C5zpb=R6MBornKzytG4x=#4^luL1 du>Bw~ybL;B;bW24B 0!:/ 9.@Z :NH#H- 9f=D@L8g GQ19>n@G 0f?l EUC-KR@L @L :NH#H- 9f=D@; ;g?kGQ4Y. diff --git a/tests/testdata/ko/uhc.smi b/tests/testdata/ko/uhc.smi deleted file mode 100644 index 1b71cb6..0000000 --- a/tests/testdata/ko/uhc.smi +++ /dev/null @@ -1,16 +0,0 @@ - - -EUC-KR.smi - - - -

EUC-KR -

EUC-KR KS X 1001 KS X 1003 ϴ 8Ʈ ڵ, EUC ̸ ǥ ѱ ϼ ڵ̱ ϼ̶ Ҹ. -

EUC-KR ڵ ȴ. -

128 Ʈ KS X 1003 Ѵ. -

128 ũų Ʈ KS X 1001 Ѵ. ڴ 128 ڵ尪 Ͽ 2Ʈ ǥȴ. -

KS X 1001 40-27 "" ڴ EUC-KR C0 A7 Ʈ ǥȴ. -

KS X 1001 ѱ ä ڸ Ͽ ԰ տ Ե ѱ ǥϴ Ȯ , κ EUC-KR ʰ CP949 ٸ Ͽ KS X 1001 ٱ ѱ ǥѴ. - - diff --git a/tests/testdata/ko/utf-16.le b/tests/testdata/ko/utf-16.le deleted file mode 100644 index 8fe2e10b77f3921bc808614e8e84d6c3858bf23c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 376 zcmY+9y-EX75QR^@r68oRn9>C!{(x9mgdlc;#Rj&pi5i2DA4#s@BiINQ7QTY85m~Pg zb&*7p+)WjUEH)7=_90S;I7zg3t22<4fDyH#U75JKq#QbzkSqk zfD&s54HTI*amcKJDl5f~Iy2&wKV~Cm2xLOv8BH`{{po`Bsc}a3J;4q>UR-0rjjnE$ ziTy>M{t9>71g~>F$mmO-uyivG2(e#n={{ljrXGp=<*|(&EU-V%yPHWn;vOfcBuy$z zYACZVG9euED~(IA@!Gd=-gP5Mmp-wUj?)(IWV77A`P?28X1om}ByFqMlhL1^I0?#h kJ1!HMmz;-jFF1R*FdM3Y({inaU-;*MApigX diff --git a/tests/testdata/ko/utf-32.be b/tests/testdata/ko/utf-32.be deleted file mode 100644 index 37162f519caed75711510f23d833daccb30cf97e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 752 zcmZvZIZJ~<5Qe8tDF_x8TN+UD0I{$T5j(*`A%#sm33%ZF`3W|Hg@u1X_#zTNAs$E~ zYLZHch>e1UKf*%P=XJ9tMT&>nedpMjS+nmSGn;{Fn1CMWhau={I!m8}3~Yk~<*k9V zd02-9SOf>Exd~}d+!ic>;-s&Fe5$7!(iP`Gy|ju;nw3ZKIV|KWW)Hj*zk2O*7=6F) z&wAe(If}o+-n`*YaZi{%V^0pb^M3vVt=Gp@zRU`<{#=aA?vJS7LCk^Ij{B(f>Cei` zF~?o(?@v{;3+nyNp#6=(Buqg!IFObG?IQ!)odfM-2Ub8eJ zG1`5Sejc&RSlaV*hTg+|V)_ny%rVO;;bG?R|JUPYwE_-j7PS#S(ugT%8@tv16r&wQ z@BXZs(&Q_auql5J?}vVB=j`&<>}3}}{Z`uPmz*k^^8f#=>1`XtH((a}3w^ijsk7;C F6Tc&ufFS?? diff --git a/tests/testdata/ko/utf-8.txt b/tests/testdata/ko/utf-8.txt deleted file mode 100644 index d3ec5d4..0000000 --- a/tests/testdata/ko/utf-8.txt +++ /dev/null @@ -1,3 +0,0 @@ -UTF-8은 유니코드를 위한 가변 길이 문자 인코딩 방식 중 하나로, 켄 톰프슨과 롭 파이크가 만들었다. 본래는 FSS-UTF(File System Safe UCS/Unicode Transformation Format)라는 이름으로 제안되었다. - -UTF-8 인코딩은 유니코드 한 문자를 나타내기 위해 1바이트에서 4바이트까지를 사용한다. 예를 들어서, U+0000부터 U+007F 범위에 있는 ASCII 문자들은 UTF-8에서 1바이트만으로 표시된다. 4바이트로 표현되는 문자는 모두 기본 다국어 평면(BMP) 바깥의 유니코드 문자이며, 거의 사용되지 않는다. UTF-16과 UTF-8 중 어느 인코딩이 더 적은 바이트를 사용하는지는 문자열에서 사용된 코드 포인트에 따라 달라지며, 실제로 DEFLATE와 같은 일반적인 압축 알고리즘을 사용할 경우 이 차이는 무시할 수 있을 정도이다. 이러한 압축 알고리즘을 사용하기 힘들고 크기가 중요할 경우 유니코드 표준 압축 방식을 대신 사용할 수 있다. diff --git a/tests/testdata/lt/iso-8859-10.txt b/tests/testdata/lt/iso-8859-10.txt deleted file mode 100644 index d005822..0000000 --- a/tests/testdata/lt/iso-8859-10.txt +++ /dev/null @@ -1,3 +0,0 @@ -Vincentas van Gogas (ol. Vincent van Gogh, 1853 m. kovo 30 d. Grot Zunderte, -Nyderlandai - 1890 m. liepos 29 d. Overe prie Uazos, Pranczija) - oland -tapytojas ir grafikas, postimpresionistas. diff --git a/tests/testdata/lt/iso-8859-13.txt b/tests/testdata/lt/iso-8859-13.txt deleted file mode 100644 index 1e3b02e..0000000 --- a/tests/testdata/lt/iso-8859-13.txt +++ /dev/null @@ -1,3 +0,0 @@ -Vincentas van Gogas (ol. Vincent van Gogh, 1853 m. kovo 30 d. Grot Zunderte, -Nyderlandai - 1890 m. liepos 29 d. Overe prie Uazos, Pranczija) - oland -tapytojas ir grafikas, postimpresionistas. diff --git a/tests/testdata/lt/iso-8859-4.txt b/tests/testdata/lt/iso-8859-4.txt deleted file mode 100644 index d5ee32a..0000000 --- a/tests/testdata/lt/iso-8859-4.txt +++ /dev/null @@ -1,3 +0,0 @@ -Vincentas van Gogas (ol. Vincent van Gogh, 1853 m. kovo 30 d. Grot Zunderte, -Nyderlandai - 1890 m. liepos 29 d. Overe prie Uazos, Pranczija) - oland -tapytojas ir grafikas, postimpresionistas. diff --git a/tests/testdata/lt/utf-8.txt b/tests/testdata/lt/utf-8.txt deleted file mode 100644 index de425f9..0000000 --- a/tests/testdata/lt/utf-8.txt +++ /dev/null @@ -1,3 +0,0 @@ -Vincentas van Gogas (ol. Vincent van Gogh, 1853 m. kovo 30 d. Grot Zunderte, -Nyderlandai – 1890 m. liepos 29 d. Overe prie Uazos, Prancūzija) – olandų -tapytojas ir grafikas, postimpresionistas. diff --git a/tests/testdata/lv/iso-8859-10.txt b/tests/testdata/lv/iso-8859-10.txt deleted file mode 100644 index 6afb063..0000000 --- a/tests/testdata/lv/iso-8859-10.txt +++ /dev/null @@ -1,6 +0,0 @@ -Vinsents Villems van Gogs (nderlandieu: Vincent Willem van Gogh, dzimis 1853. -gada 30. mart, miris 1890. gada 29. jlij) bija nderlandieu gleznotjs, -postimpresionisma prstvis. Kopum van Gogs radja vairk nek 2000 darbu, to -skait 900 gleznu un 1100 zmjumu un skiu. Savus slavenkos darbus vi radja -pdjo divu dzves gadu laik. Tiek uzskatts, ka van Gogs btiski ir ietekmjis -20. gadsimta mkslu, tostarp ekspresionismu un fovismu. diff --git a/tests/testdata/lv/iso-8859-13.txt b/tests/testdata/lv/iso-8859-13.txt deleted file mode 100644 index bd4691d..0000000 --- a/tests/testdata/lv/iso-8859-13.txt +++ /dev/null @@ -1,6 +0,0 @@ -Vinsents Villems van Gogs (nderlandieu: Vincent Willem van Gogh, dzimis 1853. -gada 30. mart, miris 1890. gada 29. jlij) bija nderlandieu gleznotjs, -postimpresionisma prstvis. Kopum van Gogs radja vairk nek 2000 darbu, to -skait 900 gleznu un 1100 zmjumu un skiu. Savus slavenkos darbus vi radja -pdjo divu dzves gadu laik. Tiek uzskatts, ka van Gogs btiski ir ietekmjis -20. gadsimta mkslu, tostarp ekspresionismu un fovismu. diff --git a/tests/testdata/lv/iso-8859-4.txt b/tests/testdata/lv/iso-8859-4.txt deleted file mode 100644 index 7fd134d..0000000 --- a/tests/testdata/lv/iso-8859-4.txt +++ /dev/null @@ -1,6 +0,0 @@ -Vinsents Villems van Gogs (nderlandieu: Vincent Willem van Gogh, dzimis 1853. -gada 30. mart, miris 1890. gada 29. jlij) bija nderlandieu gleznotjs, -postimpresionisma prstvis. Kopum van Gogs radja vairk nek 2000 darbu, to -skait 900 gleznu un 1100 zmjumu un skiu. Savus slavenkos darbus vi radja -pdjo divu dzves gadu laik. Tiek uzskatts, ka van Gogs btiski ir ietekmjis -20. gadsimta mkslu, tostarp ekspresionismu un fovismu. diff --git a/tests/testdata/lv/utf-8.txt b/tests/testdata/lv/utf-8.txt deleted file mode 100644 index 4a4d3c9..0000000 --- a/tests/testdata/lv/utf-8.txt +++ /dev/null @@ -1,6 +0,0 @@ -Vinsents Villems van Gogs (nīderlandiešu: Vincent Willem van Gogh, dzimis 1853. -gada 30. martā, miris 1890. gada 29. jūlijā) bija nīderlandiešu gleznotājs, -postimpresionisma pārstāvis. Kopumā van Gogs radīja vairāk nekā 2000 darbu, to -skaitā 900 gleznu un 1100 zīmējumu un skiču. Savus slavenākos darbus viņš radīja -pēdējo divu dzīves gadu laikā. Tiek uzskatīts, ka van Gogs būtiski ir ietekmējis -20. gadsimta mākslu, tostarp ekspresionismu un fovismu. diff --git a/tests/testdata/mt/iso-8859-3.txt b/tests/testdata/mt/iso-8859-3.txt deleted file mode 100644 index 255269b..0000000 --- a/tests/testdata/mt/iso-8859-3.txt +++ /dev/null @@ -1,4 +0,0 @@ -Franza (Frani:France), uffijalment ir-Repubblika Frania (Frani: -Rpublique franaise), hi pajji fl-Ewropa tal-Punent. Il-belt belt kapitali -tagha hi Parii. Hi membru tal-Unjoni Ewropea. Franza hi maqsuma f'22 rgions -li huma suddivii f' dpartements. diff --git a/tests/testdata/mt/utf-8.txt b/tests/testdata/mt/utf-8.txt deleted file mode 100644 index 079f387..0000000 --- a/tests/testdata/mt/utf-8.txt +++ /dev/null @@ -1,4 +0,0 @@ -Franza (Franċiż:France), uffiċjalment ir-Repubblika Franċiża (Franċiż: -République française), hi pajjiż fl-Ewropa tal-Punent. Il-belt belt kapitali -tagħha hi Pariġi. Hi membru tal-Unjoni Ewropea. Franza hi maqsuma f'22 régions -li huma suddiviżi f' départements. diff --git a/tests/testdata/pl/ibm852.txt b/tests/testdata/pl/ibm852.txt deleted file mode 100644 index e420950..0000000 --- a/tests/testdata/pl/ibm852.txt +++ /dev/null @@ -1,3 +0,0 @@ -Zofia (Sonka) Holszaska herbu Hippocentaurus (ur. ok. 1405, zm. 21 wrzenia 1461 w Krakowie) -ksiniczka litewska, krlowa Polski, od 1422 roku czwarta i ostatnia ona Wadysawa II -Jagiey. diff --git a/tests/testdata/pl/iso-8859-13.txt b/tests/testdata/pl/iso-8859-13.txt deleted file mode 100644 index 6bafbc1..0000000 --- a/tests/testdata/pl/iso-8859-13.txt +++ /dev/null @@ -1,3 +0,0 @@ -Zofia (Sonka) Holszaska herbu Hippocentaurus (ur. ok. 1405, zm. 21 wrzenia 1461 w Krakowie) -ksiniczka litewska, krlowa Polski, od 1422 roku czwarta i ostatnia ona Wadysawa II -Jagiey. diff --git a/tests/testdata/pl/iso-8859-16.txt b/tests/testdata/pl/iso-8859-16.txt deleted file mode 100644 index abe9607..0000000 --- a/tests/testdata/pl/iso-8859-16.txt +++ /dev/null @@ -1,3 +0,0 @@ -Zofia (Sonka) Holszaska herbu Hippocentaurus (ur. ok. 1405, zm. 21 wrzenia 1461 w Krakowie) -ksiniczka litewska, krlowa Polski, od 1422 roku czwarta i ostatnia ona Wadysawa II -Jagiey. diff --git a/tests/testdata/pl/iso-8859-2.txt b/tests/testdata/pl/iso-8859-2.txt deleted file mode 100644 index 8ff7d6d..0000000 --- a/tests/testdata/pl/iso-8859-2.txt +++ /dev/null @@ -1,3 +0,0 @@ -Zofia (Sonka) Holszaska herbu Hippocentaurus (ur. ok. 1405, zm. 21 wrzenia 1461 w Krakowie) -ksiniczka litewska, krlowa Polski, od 1422 roku czwarta i ostatnia ona Wadysawa II -Jagiey. diff --git a/tests/testdata/pl/maccentraleurope.txt b/tests/testdata/pl/maccentraleurope.txt deleted file mode 100644 index 48c5901..0000000 --- a/tests/testdata/pl/maccentraleurope.txt +++ /dev/null @@ -1,3 +0,0 @@ -Zofia (Sonka) Holszaska herbu Hippocentaurus (ur. ok. 1405, zm. 21 wrzenia 1461 w Krakowie) -ksiniczka litewska, krlowa Polski, od 1422 roku czwarta i ostatnia ona Wadysawa II -Jagiey. diff --git a/tests/testdata/pl/utf-8.txt b/tests/testdata/pl/utf-8.txt deleted file mode 100644 index 8b7e938..0000000 --- a/tests/testdata/pl/utf-8.txt +++ /dev/null @@ -1,3 +0,0 @@ -Zofia (Sonka) Holszańska herbu Hippocentaurus (ur. ok. 1405, zm. 21 września 1461 w Krakowie) -księżniczka litewska, królowa Polski, od 1422 roku czwarta i ostatnia żona Władysława II -Jagiełły. diff --git a/tests/testdata/pl/windows-1250.txt b/tests/testdata/pl/windows-1250.txt deleted file mode 100644 index c739798..0000000 --- a/tests/testdata/pl/windows-1250.txt +++ /dev/null @@ -1,3 +0,0 @@ -Zofia (Sonka) Holszaska herbu Hippocentaurus (ur. ok. 1405, zm. 21 wrzenia 1461 w Krakowie) -ksiniczka litewska, krlowa Polski, od 1422 roku czwarta i ostatnia ona Wadysawa II -Jagiey. diff --git a/tests/testdata/pt/iso-8859-1.txt b/tests/testdata/pt/iso-8859-1.txt deleted file mode 100644 index ec6bdda..0000000 --- a/tests/testdata/pt/iso-8859-1.txt +++ /dev/null @@ -1,6 +0,0 @@ -Albertossauro (Albertosaurus sp., que significa "lagarto de Alberta" no Canad), -foi um gnero de dinossauro carnvoro e bpede presente no fim do perodo -Cretceo. Media cerca de 8 a 9 metros de comprimento, 3 metros de altura e -pesava menos de 2 toneladas. O Albertossauro viveu na Amrica do Norte e foi -descoberto no ano de 1884 por Joseph Burr Tyrrell em Alberta, no Canad, local -ao qual deve seu nome. diff --git a/tests/testdata/pt/utf-8.txt b/tests/testdata/pt/utf-8.txt deleted file mode 100644 index 1729291..0000000 --- a/tests/testdata/pt/utf-8.txt +++ /dev/null @@ -1,6 +0,0 @@ -Albertossauro (Albertosaurus sp., que significa "lagarto de Alberta" no Canadá), -foi um género de dinossauro carnívoro e bípede presente no fim do período -Cretáceo. Media cerca de 8 a 9 metros de comprimento, 3 metros de altura e -pesava menos de 2 toneladas. O Albertossauro viveu na América do Norte e foi -descoberto no ano de 1884 por Joseph Burr Tyrrell em Alberta, no Canadá, local -ao qual deve seu nome. diff --git a/tests/testdata/ro/ibm852.txt b/tests/testdata/ro/ibm852.txt deleted file mode 100644 index 634dda2..0000000 --- a/tests/testdata/ro/ibm852.txt +++ /dev/null @@ -1,9 +0,0 @@ -Danemarca (n danez Sunet Danmark), oficial Regatul Danemarcei (n -danez Sunet Kongeriget Danmark), este un stat suveran din -Europa de Nord, avnd si dou tri constituente de peste mri, care fac parte -integrant din regat: Insulele Feroe n Atlanticul de Nord si Groenlanda n -America de Nord. Danemarca propriu-zis[a] este cea mai de sud dintre trile -nordice, aflat la sud-vest de Suedia si la sud de Norvegia, nvecinndu-se la -sud cu Germania. Tara const dintr-o peninsul mare, Iutlanda, si mai multe -insule, dintre care cele mai mari sunt Zealand, Funen, Lolland, Falster si -Bornholm, precum si sute de insulite denumite n general ,,Arhipelagul Danez". diff --git a/tests/testdata/ro/iso-8859-16.txt b/tests/testdata/ro/iso-8859-16.txt deleted file mode 100644 index 29ae299..0000000 --- a/tests/testdata/ro/iso-8859-16.txt +++ /dev/null @@ -1,9 +0,0 @@ -Danemarca (n danez Sunet Danmark), oficial Regatul Danemarcei (n -danez Sunet Kongeriget Danmark), este un stat suveran din -Europa de Nord, avnd i dou ri constituente de peste mri, care fac parte -integrant din regat: Insulele Feroe n Atlanticul de Nord i Groenlanda n -America de Nord. Danemarca propriu-zis[a] este cea mai de sud dintre rile -nordice, aflat la sud-vest de Suedia i la sud de Norvegia, nvecinndu-se la -sud cu Germania. ara const dintr-o peninsul mare, Iutlanda, i mai multe -insule, dintre care cele mai mari sunt Zealand, Funen, Lolland, Falster i -Bornholm, precum i sute de insulie denumite n general Arhipelagul Danez. diff --git a/tests/testdata/ro/utf-8.txt b/tests/testdata/ro/utf-8.txt deleted file mode 100644 index dea759e..0000000 --- a/tests/testdata/ro/utf-8.txt +++ /dev/null @@ -1,9 +0,0 @@ -Danemarca (în daneză Sunet Danmark), oficial Regatul Danemarcei (în -daneză Sunet Kongeriget Danmark), este un stat suveran din -Europa de Nord, având și două țări constituente de peste mări, care fac parte -integrantă din regat: Insulele Feroe în Atlanticul de Nord și Groenlanda în -America de Nord. Danemarca propriu-zisă[a] este cea mai de sud dintre țările -nordice, aflată la sud-vest de Suedia și la sud de Norvegia, învecinându-se la -sud cu Germania. Țara constă dintr-o peninsulă mare, Iutlanda, și mai multe -insule, dintre care cele mai mari sunt Zealand, Funen, Lolland, Falster și -Bornholm, precum și sute de insulițe denumite în general „Arhipelagul Danez”. diff --git a/tests/testdata/ro/windows-1250.txt b/tests/testdata/ro/windows-1250.txt deleted file mode 100644 index f43cb89..0000000 --- a/tests/testdata/ro/windows-1250.txt +++ /dev/null @@ -1,9 +0,0 @@ -Danemarca (n danez Sunet Danmark), oficial Regatul Danemarcei (n -danez Sunet Kongeriget Danmark), este un stat suveran din -Europa de Nord, avnd si dou tri constituente de peste mri, care fac parte -integrant din regat: Insulele Feroe n Atlanticul de Nord si Groenlanda n -America de Nord. Danemarca propriu-zis[a] este cea mai de sud dintre trile -nordice, aflat la sud-vest de Suedia si la sud de Norvegia, nvecinndu-se la -sud cu Germania. Tara const dintr-o peninsul mare, Iutlanda, si mai multe -insule, dintre care cele mai mari sunt Zealand, Funen, Lolland, Falster si -Bornholm, precum si sute de insulite denumite n general Arhipelagul Danez. diff --git a/tests/testdata/ru/ibm855.txt b/tests/testdata/ru/ibm855.txt deleted file mode 100644 index 6e19fc6..0000000 --- a/tests/testdata/ru/ibm855.txt +++ /dev/null @@ -1,5 +0,0 @@ -CP855 -- ƷзƠ ֦ Է MS-DOS ֦֢ بᠤ . М Ԡ ISO 8859-5. - -ը֬ Р ᢷ, ӠƨԷ Ь᷷, 㷷 Ԩ Р. - -ݨ Ԡ ֽ֦ 堢з ؠ ASCII. Р ֦ ҷ ֢Ԡ Ԡ ֦ Է֦. diff --git a/tests/testdata/ru/ibm866.txt b/tests/testdata/ru/ibm866.txt deleted file mode 100644 index 0dc4775..0000000 --- a/tests/testdata/ru/ibm866.txt +++ /dev/null @@ -1,11 +0,0 @@ -<<ୠ⨢ ஢>> (<<ୠ⨢ ஢ >>) -- ᭮ -CP437 ࠭, ᯥ᪨ ய᪨ ᨬ ன - ਫ, ⠢ ᥢ᪨ ᨬ ஭묨. -⥫쭮, ணࠬ, ᯮ ࠡ ⥪⮢ -, ⠪ ᯥ稢 ᯮ짮 ᨬ ਫ. - -᪨ ⢮ ਠ⮢ ୠ⨢ ஢, -ࠧ ⮫쪮 0xF0 -- 0xFF (240--255). ᯮ짮 -ᥢ 䨪-ᠬ, ᠢ訥 ᨬ 묨 -ணࠬ⠬, ࠭ १ । -業. diff --git a/tests/testdata/ru/iso-8859-5.txt b/tests/testdata/ru/iso-8859-5.txt deleted file mode 100644 index 6246c8d..0000000 --- a/tests/testdata/ru/iso-8859-5.txt +++ /dev/null @@ -1,3 +0,0 @@ -ISO 8859-5 -8- ISO-8859 . -ISO 8859-5 " " ( , ). diff --git a/tests/testdata/ru/koi8-r.txt b/tests/testdata/ru/koi8-r.txt deleted file mode 100644 index 1972c27..0000000 --- a/tests/testdata/ru/koi8-r.txt +++ /dev/null @@ -1 +0,0 @@ --8 ( , 8 ), KOI8 - , ASCII. . Unix- , 2010 , , . diff --git a/tests/testdata/ru/maccyrillic.txt b/tests/testdata/ru/maccyrillic.txt deleted file mode 100644 index ad849a3..0000000 --- a/tests/testdata/ru/maccyrillic.txt +++ /dev/null @@ -1,9 +0,0 @@ - MacCyrillic nj. - - - ; , - , ( ), -, . - - () -ASCII. . diff --git a/tests/testdata/ru/windows-1251.txt b/tests/testdata/ru/windows-1251.txt deleted file mode 100644 index c76f0be..0000000 --- a/tests/testdata/ru/windows-1251.txt +++ /dev/null @@ -1,4 +0,0 @@ -Windows-1251 - , 8- Microsoft Windows. . , Windows 19901991 . , Microsoft. ( , ). - -Windows-1251 8- ( CP866, KOI8-R ISO 8859-5) , ( ); : , , , . diff --git a/tests/testdata/sk/ibm852.txt b/tests/testdata/sk/ibm852.txt deleted file mode 100644 index 725a54f..0000000 --- a/tests/testdata/sk/ibm852.txt +++ /dev/null @@ -1,3 +0,0 @@ -Jupiter je piata planta v porad od Slnka, najvia a najhmotnejia planta -naej slnenej sstavy. Je pomenovan po rmskom bohovi Jupiterovi. Symbolom -planty je tylizovan znzornenie Jupiterovho boskho blesku. diff --git a/tests/testdata/sk/iso-8859-2.txt b/tests/testdata/sk/iso-8859-2.txt deleted file mode 100644 index ee3ab14..0000000 --- a/tests/testdata/sk/iso-8859-2.txt +++ /dev/null @@ -1,3 +0,0 @@ -Jupiter je piata planta v porad od Slnka, najvia a najhmotnejia planta -naej slnenej sstavy. Je pomenovan po rmskom bohovi Jupiterovi. Symbolom -planty je tylizovan znzornenie Jupiterovho boskho blesku. diff --git a/tests/testdata/sk/maccentraleurope.txt b/tests/testdata/sk/maccentraleurope.txt deleted file mode 100644 index cddbba4..0000000 --- a/tests/testdata/sk/maccentraleurope.txt +++ /dev/null @@ -1,3 +0,0 @@ -Jupiter je piata planta v porad od Slnka, najvia a najhmotnejia planta -naej slnenej sstavy. Je pomenovan po rmskom bohovi Jupiterovi. Symbolom -planty je tylizovan znzornenie Jupiterovho boskho blesku. diff --git a/tests/testdata/sk/utf-8.txt b/tests/testdata/sk/utf-8.txt deleted file mode 100644 index eba4382..0000000 --- a/tests/testdata/sk/utf-8.txt +++ /dev/null @@ -1,3 +0,0 @@ -Jupiter je piata planéta v poradí od Slnka, najväčšia a najhmotnejšia planéta -našej slnečnej sústavy. Je pomenovaný po rímskom bohovi Jupiterovi. Symbolom -planéty je štylizované znázornenie Jupiterovho božského blesku. diff --git a/tests/testdata/sk/windows-1250.txt b/tests/testdata/sk/windows-1250.txt deleted file mode 100644 index a60d048..0000000 --- a/tests/testdata/sk/windows-1250.txt +++ /dev/null @@ -1,3 +0,0 @@ -Jupiter je piata planta v porad od Slnka, najvia a najhmotnejia planta -naej slnenej sstavy. Je pomenovan po rmskom bohovi Jupiterovi. Symbolom -planty je tylizovan znzornenie Jupiterovho boskho blesku. diff --git a/tests/testdata/sl/ibm852.txt b/tests/testdata/sl/ibm852.txt deleted file mode 100644 index 5fa60a4..0000000 --- a/tests/testdata/sl/ibm852.txt +++ /dev/null @@ -1,9 +0,0 @@ -Naseljvi plant je planet ali naravni satelit (redkeje tudi asteroid[1]), ki je -zmoen razviti in ohranjati ivljenje. - -Ker je obstoj nezemeljskega ivljenja trenutno negotov, je raziskovanje -naseljivih planetov v glavnem ekstrapolacija razmer na Zemlji in znailnosti -Sonca in celotnega Osonja, ki govorijo v prid razvitju ivljenja. e posebej so -pomembni faktorji, ki so ohranili zapletene, mnogoceline organizme in ne le -preprosta, enocelina iva bitja, mikroorganizme. Raziskovanje in teorija v tej -smeri je del planetologije in razvijajoe astrobiologije. diff --git a/tests/testdata/sl/iso-8859-16.txt b/tests/testdata/sl/iso-8859-16.txt deleted file mode 100644 index 80d0b26..0000000 --- a/tests/testdata/sl/iso-8859-16.txt +++ /dev/null @@ -1,9 +0,0 @@ -Naseljvi plant je planet ali naravni satelit (redkeje tudi asteroid[1]), ki je -zmoen razviti in ohranjati ivljenje. - -Ker je obstoj nezemeljskega ivljenja trenutno negotov, je raziskovanje -naseljivih planetov v glavnem ekstrapolacija razmer na Zemlji in znailnosti -Sonca in celotnega Osonja, ki govorijo v prid razvitju ivljenja. e posebej so -pomembni faktorji, ki so ohranili zapletene, mnogoceline organizme in ne le -preprosta, enocelina iva bitja, mikroorganizme. Raziskovanje in teorija v tej -smeri je del planetologije in razvijajoe astrobiologije. diff --git a/tests/testdata/sl/iso-8859-2.txt b/tests/testdata/sl/iso-8859-2.txt deleted file mode 100644 index 7af252e..0000000 --- a/tests/testdata/sl/iso-8859-2.txt +++ /dev/null @@ -1,9 +0,0 @@ -Naseljvi plant je planet ali naravni satelit (redkeje tudi asteroid[1]), ki je -zmoen razviti in ohranjati ivljenje. - -Ker je obstoj nezemeljskega ivljenja trenutno negotov, je raziskovanje -naseljivih planetov v glavnem ekstrapolacija razmer na Zemlji in znailnosti -Sonca in celotnega Osonja, ki govorijo v prid razvitju ivljenja. e posebej so -pomembni faktorji, ki so ohranili zapletene, mnogoceline organizme in ne le -preprosta, enocelina iva bitja, mikroorganizme. Raziskovanje in teorija v tej -smeri je del planetologije in razvijajoe astrobiologije. diff --git a/tests/testdata/sl/maccentraleurope.txt b/tests/testdata/sl/maccentraleurope.txt deleted file mode 100644 index 4e84135..0000000 --- a/tests/testdata/sl/maccentraleurope.txt +++ /dev/null @@ -1,9 +0,0 @@ -Naseljvi plant je planet ali naravni satelit (redkeje tudi asteroid[1]), ki je -zmoen razviti in ohranjati ivljenje. - -Ker je obstoj nezemeljskega ivljenja trenutno negotov, je raziskovanje -naseljivih planetov v glavnem ekstrapolacija razmer na Zemlji in znailnosti -Sonca in celotnega Osonja, ki govorijo v prid razvitju ivljenja. e posebej so -pomembni faktorji, ki so ohranili zapletene, mnogoceline organizme in ne le -preprosta, enocelina iva bitja, mikroorganizme. Raziskovanje in teorija v tej -smeri je del planetologije in razvijajoe astrobiologije. diff --git a/tests/testdata/sl/utf-8.txt b/tests/testdata/sl/utf-8.txt deleted file mode 100644 index 11d013b..0000000 --- a/tests/testdata/sl/utf-8.txt +++ /dev/null @@ -1,9 +0,0 @@ -Naseljívi planét je planet ali naravni satelit (redkeje tudi asteroid[1]), ki je -zmožen razviti in ohranjati življenje. - -Ker je obstoj nezemeljskega življenja trenutno negotov, je raziskovanje -naseljivih planetov v glavnem ekstrapolacija razmer na Zemlji in značilnosti -Sonca in celotnega Osončja, ki govorijo v prid razvitju življenja. Še posebej so -pomembni faktorji, ki so ohranili zapletene, mnogocelične organizme in ne le -preprosta, enocelična živa bitja, mikroorganizme. Raziskovanje in teorija v tej -smeri je del planetologije in razvijajoče astrobiologije. diff --git a/tests/testdata/sl/windows-1250.txt b/tests/testdata/sl/windows-1250.txt deleted file mode 100644 index 512309b..0000000 --- a/tests/testdata/sl/windows-1250.txt +++ /dev/null @@ -1,9 +0,0 @@ -Naseljvi plant je planet ali naravni satelit (redkeje tudi asteroid[1]), ki je -zmoen razviti in ohranjati ivljenje. - -Ker je obstoj nezemeljskega ivljenja trenutno negotov, je raziskovanje -naseljivih planetov v glavnem ekstrapolacija razmer na Zemlji in znailnosti -Sonca in celotnega Osonja, ki govorijo v prid razvitju ivljenja. e posebej so -pomembni faktorji, ki so ohranili zapletene, mnogoceline organizme in ne le -preprosta, enocelina iva bitja, mikroorganizme. Raziskovanje in teorija v tej -smeri je del planetologije in razvijajoe astrobiologije. diff --git a/tests/testdata/sv/iso-8859-1.txt b/tests/testdata/sv/iso-8859-1.txt deleted file mode 100644 index fcf070c..0000000 --- a/tests/testdata/sv/iso-8859-1.txt +++ /dev/null @@ -1,10 +0,0 @@ -Mlle r en ttort p Kullahalvn i Brunnby socken i Hgans kommun, Skne ln. - -Samhllet var frn brjan ett fiskelge, men kom att spela en stor roll i den -framvxande turismen i Sverige i slutet av 1800-talet. Till detta bidrog - och -bidrar - Mlles naturskna lge invid resunds norra utlopp, med Kullaberg som -bakgrund. Gemensamhetsbad fr mn och kvinnor introducerades i Ransvik i brjan -av 1900-talet. Storhetstiden som turistort intrffade strax fre frsta -vrldskriget, men ven under mellankrigstiden var turiststrmmarna stora. -Fortfarande r Mlle en populr turistort med en tredubbling av invnarantalet -under sommarmnaderna. diff --git a/tests/testdata/sv/utf-8.txt b/tests/testdata/sv/utf-8.txt deleted file mode 100644 index d66be04..0000000 --- a/tests/testdata/sv/utf-8.txt +++ /dev/null @@ -1,10 +0,0 @@ -Mölle är en tätort på Kullahalvön i Brunnby socken i Höganäs kommun, Skåne län. - -Samhället var från början ett fiskeläge, men kom att spela en stor roll i den -framväxande turismen i Sverige i slutet av 1800-talet. Till detta bidrog – och -bidrar – Mölles natursköna läge invid Öresunds norra utlopp, med Kullaberg som -bakgrund. Gemensamhetsbad för män och kvinnor introducerades i Ransvik i början -av 1900-talet. Storhetstiden som turistort inträffade strax före första -världskriget, men även under mellankrigstiden var turistströmmarna stora. -Fortfarande är Mölle en populär turistort med en tredubbling av invånarantalet -under sommarmånaderna. diff --git a/tests/testdata/sv/windows-1252.txt b/tests/testdata/sv/windows-1252.txt deleted file mode 100644 index 94f15c6..0000000 --- a/tests/testdata/sv/windows-1252.txt +++ /dev/null @@ -1,10 +0,0 @@ -Mlle r en ttort p Kullahalvn i Brunnby socken i Hgans kommun, Skne ln. - -Samhllet var frn brjan ett fiskelge, men kom att spela en stor roll i den -framvxande turismen i Sverige i slutet av 1800-talet. Till detta bidrog och -bidrar Mlles naturskna lge invid resunds norra utlopp, med Kullaberg som -bakgrund. Gemensamhetsbad fr mn och kvinnor introducerades i Ransvik i brjan -av 1900-talet. Storhetstiden som turistort intrffade strax fre frsta -vrldskriget, men ven under mellankrigstiden var turiststrmmarna stora. -Fortfarande r Mlle en populr turistort med en tredubbling av invnarantalet -under sommarmnaderna. diff --git a/tests/testdata/th/iso-8859-11.txt b/tests/testdata/th/iso-8859-11.txt deleted file mode 100644 index 14deb7c..0000000 --- a/tests/testdata/th/iso-8859-11.txt +++ /dev/null @@ -1,5 +0,0 @@ -TIS-620 - -ҵðҹԵѳصˡ 620-2533, ͡.620-2533, ͷѡѹ TIS-620 繪شѡҵðҹصˡͧ ժ Ѻѡ·Ѻ - - TIS-620 ´ ISO-8859-11 ҡ ᵡҧѹ§ ISO-8859-11 ˹ A0 "äẺѴ" (no-break space) ǹ TIS-620 ʧǹ˹ A0 ˹ diff --git a/tests/testdata/th/tis-620.txt b/tests/testdata/th/tis-620.txt deleted file mode 100644 index 0439613..0000000 --- a/tests/testdata/th/tis-620.txt +++ /dev/null @@ -1,5 +0,0 @@ -TIS-620 - -ҵðҹԵѳصˡ 620-2533, ͡.620-2533, ͷѡѹ TIS-620 繪شѡҵðҹصˡͧ ժ Ѻѡ·Ѻ - - TIS-620 ´ ISO-8859-11 ҡ ᵡҧѹ§ ISO-8859-11 ˹ A0 "äẺѴ" (no-break space) ǹ TIS-620 ʧǹ˹ A0 ˹ diff --git a/tests/testdata/th/utf-8.txt b/tests/testdata/th/utf-8.txt deleted file mode 100644 index 73a9c1a..0000000 --- a/tests/testdata/th/utf-8.txt +++ /dev/null @@ -1 +0,0 @@ -ยูนิโคด (อังกฤษ: Unicode) คือมาตรฐานอุตสาหกรรมที่ช่วยให้คอมพิวเตอร์แสดงผลและจัดการข้อความธรรมดาที่ใช้ในระบบการเขียนของภาษาส่วนใหญ่ในโลกได้อย่างสอดคล้องกัน ยูนิโคดประกอบด้วยรายการอักขระที่แสดงผลได้มากกว่า 100,000 ตัว พัฒนาต่อยอดมาจากมาตรฐานชุดอักขระสากล (Universal Character Set: UCS) และมีการตีพิมพ์ลงในหนังสือ The Unicode Standard เป็นแผนผังรหัสเพื่อใช้เป็นรายการอ้างอิง นอกจากนั้นยังมีการอธิบายวิธีการที่ใช้เข้ารหัสและการนำเสนอมาตรฐานของการเข้ารหัสอักขระอีกจำนวนหนึ่ง การเรียงลำดับอักษร กฎเกณฑ์ของการรวมและการแยกอักขระ รวมไปถึงลำดับการแสดงผลของอักขระสองทิศทาง (เช่นอักษรอาหรับหรืออักษรฮีบรูที่เขียนจากขวาไปซ้าย) diff --git a/tests/testdata/tr/iso-8859-3.txt b/tests/testdata/tr/iso-8859-3.txt deleted file mode 100644 index 0cb6dc0..0000000 --- a/tests/testdata/tr/iso-8859-3.txt +++ /dev/null @@ -1,13 +0,0 @@ -Trke, Trk dili ya da Trkiye Trkesi, batda Balkanlar'dan balayp douda -Hazar Denizi sahasna kadar konuulan Altay dillerinden biridir. Ya, en eski -hesaplara gre 8500 olan Trke, bugn yaayan Dnya dilleri arasnda en eski -yazl belgelere sahip olan dildir. Bu belgeler, ivi yazl Smerce -tabletlerdeki alnt kelimelerdir.[12] Trk yaz dilleri iinde Ouz sahas yaz -dillerinden Osmanl Trkesinin devamn oluturur. Bata Trkiye olmak zere -eski Osmanl mparatorluu corafyasnda konuulan Trke, dnyada en fazla -konuulan 5. dildir. Trke sondan eklemeli bir dildir.[13] Bundan tr -kullanlan herhangi bir eylem zerinden istenildii kadar szck -tretilebilir.[14] Trkiye Trkesi bu ynnden dolay dier Trk dilleriyle -ortak ya da ayrk bulunan onlarca eke sahiptir.[15] Trke ok geni -kullanmyla birlikte zengin bir dil olmasnn yan sra, genel itibaryla -"zne-nesne-yklem" biimindeki cmle kuruluuna sahiptir. diff --git a/tests/testdata/tr/iso-8859-9.txt b/tests/testdata/tr/iso-8859-9.txt deleted file mode 100644 index 4a69aa3..0000000 --- a/tests/testdata/tr/iso-8859-9.txt +++ /dev/null @@ -1,13 +0,0 @@ -Trke, Trk dili ya da Trkiye Trkesi, batda Balkanlar'dan balayp douda -Hazar Denizi sahasna kadar konuulan Altay dillerinden biridir. Ya, en eski -hesaplara gre 8500 olan Trke, bugn yaayan Dnya dilleri arasnda en eski -yazl belgelere sahip olan dildir. Bu belgeler, ivi yazl Smerce -tabletlerdeki alnt kelimelerdir.[12] Trk yaz dilleri iinde Ouz sahas yaz -dillerinden Osmanl Trkesinin devamn oluturur. Bata Trkiye olmak zere -eski Osmanl mparatorluu corafyasnda konuulan Trke, dnyada en fazla -konuulan 5. dildir. Trke sondan eklemeli bir dildir.[13] Bundan tr -kullanlan herhangi bir eylem zerinden istenildii kadar szck -tretilebilir.[14] Trkiye Trkesi bu ynnden dolay dier Trk dilleriyle -ortak ya da ayrk bulunan onlarca eke sahiptir.[15] Trke ok geni -kullanmyla birlikte zengin bir dil olmasnn yan sra, genel itibaryla -"zne-nesne-yklem" biimindeki cmle kuruluuna sahiptir. diff --git a/tests/testdata/vi/utf-8.txt b/tests/testdata/vi/utf-8.txt deleted file mode 100644 index c82798d..0000000 --- a/tests/testdata/vi/utf-8.txt +++ /dev/null @@ -1,4 +0,0 @@ -Chữ Quốc ngữ là hệ chữ viết thống nhất chính thức hiện nay của tiếng Việt, sử -dụng ký tự La Tinh, dựa trên các bảng chữ cái của nhóm ngôn ngữ Rôman,[1] đặc -biệt là bảng chữ cái Bồ Đào Nha,[2] với các dấu phụ chủ yếu từ bảng chữ cái Hy -Lạp. diff --git a/tests/testdata/vi/viscii.txt b/tests/testdata/vi/viscii.txt deleted file mode 100644 index ef1e187..0000000 --- a/tests/testdata/vi/viscii.txt +++ /dev/null @@ -1,4 +0,0 @@ -Ch Quc ng l h ch vit thng nht chnh thc hin nay ca ting Vit, s -dng k t La Tinh, da trn cc bng ch ci ca nhm ngn ng Rman,[1] c -bit l bng ch ci B o Nha,[2] vi cc du ph ch yu t bng ch ci Hy -Lp. diff --git a/tests/testdata/vi/windows-1258.txt b/tests/testdata/vi/windows-1258.txt deleted file mode 100644 index 53526db..0000000 --- a/tests/testdata/vi/windows-1258.txt +++ /dev/null @@ -1,4 +0,0 @@ -Ch Quc ng l h ch vit thng nht chnh thc hin nay cua ting Vit, s -dung ky t La Tinh, da trn cc bang ch ci cua nhm ngn ng Rman,[1] c -bit l bang ch ci B o Nha,[2] vi cc du phu chu yu t bang ch ci Hy -Lap. diff --git a/tests/testdata/zh/big5.txt b/tests/testdata/zh/big5.txt deleted file mode 100644 index 59db954..0000000 --- a/tests/testdata/zh/big5.txt +++ /dev/null @@ -1 +0,0 @@ -c餤c餤c餤c餤c餤c餤c餤c餤c餤c餤c餤c餤c餤c餤c餤c餤c餤 \ No newline at end of file diff --git a/tests/testdata/zh/euc-tw.txt b/tests/testdata/zh/euc-tw.txt deleted file mode 100644 index ba30a9a..0000000 --- a/tests/testdata/zh/euc-tw.txt +++ /dev/null @@ -1 +0,0 @@ -EUC-TWҳƺġCNS 11643??ƺEUC-TWŷ diff --git a/tests/testdata/zh/gb18030.txt b/tests/testdata/zh/gb18030.txt deleted file mode 100644 index 962df87..0000000 --- a/tests/testdata/zh/gb18030.txt +++ /dev/null @@ -1 +0,0 @@ -ļļļļļļļļļļ \ No newline at end of file diff --git a/tests/testdata/zh/utf-8.txt b/tests/testdata/zh/utf-8.txt deleted file mode 100644 index cd66b08..0000000 --- a/tests/testdata/zh/utf-8.txt +++ /dev/null @@ -1 +0,0 @@ -汉字漢字統一編碼萬國碼