Skip to content

Commit

Permalink
refactor: uchardetのテストデータを利用したテストへ変更する
Browse files Browse the repository at this point in the history
  • Loading branch information
PyYoshi committed Jun 11, 2024
1 parent 8f60dc4 commit 6e52a12
Show file tree
Hide file tree
Showing 122 changed files with 50 additions and 702 deletions.
84 changes: 50 additions & 34 deletions tests/test_1.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,50 +4,67 @@
import cchardet

SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))

SKIP_LIST = [
"testdata/ja/utf-16le.txt",
"testdata/ja/utf-16be.txt",
"testdata/es/iso-8859-15.txt",
"testdata/da/iso-8859-1.txt",
"testdata/he/iso-8859-8.txt",
TESTDATA_DIR = os.path.join(SCRIPT_DIR, "..", "src", "ext", "uchardet", "test")

SKIP_LIST_DETECT = [
"zh/gb18030.txt",

# These are tests known to fail (not supported or not efficient
# enough). We will have to take a closer look and fix these, but
# there is no need to break the whole `make test` right now,
# which may make actual regressions harder to notice.
"ja/utf-16le.txt",
"ja/utf-16be.txt",
"es/iso-8859-15.txt",
"da/iso-8859-1.txt",
"he/iso-8859-8.txt",
]

# Python can't decode these encodings
SKIP_LIST_02 = [
"testdata/vi/viscii.txt",
"testdata/zh/euc-tw.txt",
SKIP_LIST_DEC = [
"ka/georgian-academy.txt",
"ka/georgian-ps.txt",
"vi/viscii.txt",
"zh/euc-tw.txt",
]
SKIP_LIST_02.extend(SKIP_LIST)

SKIP_LIST_DEC.extend(SKIP_LIST_DETECT)

class TestCChardet:
def test_ascii(self):
detected_encoding = cchardet.detect(b"abcdefghijklmnopqrstuvwxyz")
assert "ascii" == detected_encoding["encoding"].lower(), "Expected %s, but got %s" % (
got_enc = None
if detected_encoding["encoding"] is not None:
got_enc = detected_encoding["encoding"].lower()
assert "ascii" == got_enc, "Expected %s, but got %s" % (
"ascii",
detected_encoding["encoding"].lower(),
got_enc,
)

def test_detect(self):
testfiles = glob.glob(SCRIPT_DIR + "/testdata/*/*.txt")
testfiles = glob.glob(TESTDATA_DIR + "/*/*.txt")
for testfile in testfiles:
if any(testfile.replace("\\", "/").endswith(skip) for skip in SKIP_LIST):
if any(testfile.replace("\\", "/").endswith(skip) for skip in SKIP_LIST_DETECT):
print("Skip: %s" % testfile)
continue

base = os.path.basename(testfile)
expected_charset = os.path.splitext(base)[0]
expected_charset = expected_charset.split(".")[0]
if expected_charset == "mac-centraleurope":
expected_charset = "maccentraleurope"
with open(testfile, "rb") as f:
msg = f.read()
detected_encoding = cchardet.detect(msg)
print("Test %s: %s" % (testfile, detected_encoding))
assert detected_encoding["encoding"] is not None, (
got_enc = None
if detected_encoding["encoding"] is not None:
got_enc = detected_encoding["encoding"].lower()
assert got_enc is not None, (
'Expected %s, but got None for "%s"' % (expected_charset.lower(), testfile)
)
assert expected_charset.lower() == detected_encoding["encoding"].lower(), (
assert expected_charset.lower() == got_enc, (
'Expected %s, but got %s for "%s"'
% (expected_charset.lower(), detected_encoding["encoding"].lower(), testfile)
% (expected_charset.lower(), got_enc, testfile)
)

def test_detector(self):
Expand All @@ -64,9 +81,12 @@ def test_detector(self):
line = f.readline()
detector.close()
detected_encoding = detector.result
assert "shift_jis" == detected_encoding["encoding"].lower(), "Expected %s, but got %s" % (
got_enc = None
if detected_encoding["encoding"] is not None:
got_enc = detected_encoding["encoding"].lower()
assert "shift_jis" == got_enc, "Expected %s, but got %s" % (
"shift_jis",
detected_encoding["encoding"].lower(),
got_enc,
)

def test_github_issue_20(self):
Expand All @@ -82,15 +102,16 @@ def test_github_issue_20(self):
detector.close()

def test_decode(self):
testfiles = glob.glob(SCRIPT_DIR + "/testdata/*/*.txt")
testfiles = glob.glob(TESTDATA_DIR + "/*/*.txt")
for testfile in testfiles:
if any(testfile.replace("\\", "/").endswith(skip) for skip in SKIP_LIST_02):
if any(testfile.replace("\\", "/").endswith(skip) for skip in SKIP_LIST_DEC):
print("Skip: %s" % testfile)
continue

with open(testfile, "rb") as f:
msg = f.read()
detected_encoding = cchardet.detect(msg)
print("Test %s: %s" % (testfile, detected_encoding))
try:
msg.decode(detected_encoding["encoding"])
except LookupError as e:
Expand All @@ -103,15 +124,10 @@ def test_decode(self):
def test_utf8_with_bom(self):
sample = b"\xef\xbb\xbf"
detected_encoding = cchardet.detect(sample)
assert "utf-8-sig" == detected_encoding["encoding"].lower(), "Expected %s, but got %s" % (
"utf-8-sig",
detected_encoding["encoding"].lower(),
)

def test_null_bytes(self):
sample = b"ABC\x00\x80\x81"
detected_encoding = cchardet.detect(sample)

assert detected_encoding["encoding"] is None, (
"Expected None, but got %s" % (detected_encoding["encoding"])
got_enc = None
if detected_encoding["encoding"] is not None:
got_enc = detected_encoding["encoding"].lower()
assert "utf-8" == got_enc, "Expected %s, but got %s" % (
"utf-8",
got_enc,
)
3 changes: 0 additions & 3 deletions tests/testdata/ar/iso-8859-6.txt

This file was deleted.

3 changes: 0 additions & 3 deletions tests/testdata/ar/utf-8.txt

This file was deleted.

3 changes: 0 additions & 3 deletions tests/testdata/ar/windows-1256.txt

This file was deleted.

3 changes: 0 additions & 3 deletions tests/testdata/bg/windows-1251.txt

This file was deleted.

4 changes: 0 additions & 4 deletions tests/testdata/cs/ibm852.txt

This file was deleted.

4 changes: 0 additions & 4 deletions tests/testdata/cs/iso-8859-2.txt

This file was deleted.

4 changes: 0 additions & 4 deletions tests/testdata/cs/maccentraleurope.txt

This file was deleted.

4 changes: 0 additions & 4 deletions tests/testdata/cs/utf-8.txt

This file was deleted.

4 changes: 0 additions & 4 deletions tests/testdata/cs/windows-1250.txt

This file was deleted.

7 changes: 0 additions & 7 deletions tests/testdata/da/iso-8859-1.txt

This file was deleted.

10 changes: 0 additions & 10 deletions tests/testdata/da/iso-8859-15.txt

This file was deleted.

10 changes: 0 additions & 10 deletions tests/testdata/da/utf-8.txt

This file was deleted.

10 changes: 0 additions & 10 deletions tests/testdata/da/windows-1252.txt

This file was deleted.

11 changes: 0 additions & 11 deletions tests/testdata/de/iso-8859-1.txt

This file was deleted.

11 changes: 0 additions & 11 deletions tests/testdata/de/windows-1252.txt

This file was deleted.

3 changes: 0 additions & 3 deletions tests/testdata/el/iso-8859-7.txt

This file was deleted.

3 changes: 0 additions & 3 deletions tests/testdata/el/utf-8.txt

This file was deleted.

5 changes: 0 additions & 5 deletions tests/testdata/el/windows-1253.txt

This file was deleted.

4 changes: 0 additions & 4 deletions tests/testdata/en/ascii.txt

This file was deleted.

7 changes: 0 additions & 7 deletions tests/testdata/eo/iso-8859-3.txt

This file was deleted.

5 changes: 0 additions & 5 deletions tests/testdata/es/iso-8859-1.txt

This file was deleted.

5 changes: 0 additions & 5 deletions tests/testdata/es/iso-8859-15.txt

This file was deleted.

5 changes: 0 additions & 5 deletions tests/testdata/es/utf-8.txt

This file was deleted.

5 changes: 0 additions & 5 deletions tests/testdata/es/windows-1252.txt

This file was deleted.

6 changes: 0 additions & 6 deletions tests/testdata/et/iso-8859-13.txt

This file was deleted.

6 changes: 0 additions & 6 deletions tests/testdata/et/iso-8859-15.txt

This file was deleted.

6 changes: 0 additions & 6 deletions tests/testdata/et/iso-8859-4.txt

This file was deleted.

6 changes: 0 additions & 6 deletions tests/testdata/et/utf-8.txt

This file was deleted.

6 changes: 0 additions & 6 deletions tests/testdata/et/windows-1252.txt

This file was deleted.

6 changes: 0 additions & 6 deletions tests/testdata/et/windows-1257.txt

This file was deleted.

8 changes: 0 additions & 8 deletions tests/testdata/fi/iso-8859-1.txt

This file was deleted.

8 changes: 0 additions & 8 deletions tests/testdata/fi/utf-8.txt

This file was deleted.

Loading

0 comments on commit 6e52a12

Please sign in to comment.