Skip to content

Commit

Permalink
feat: uchardetの最新版に追従する
Browse files (browse the repository at this point in the history)
  • Loading branch information
PyYoshi committed Jun 11, 2024
1 parent 502170a commit af1bf1d
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 23 deletions.
38 changes: 25 additions & 13 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,54 +12,66 @@
sources = cchardet_sources

uchardet_sources = [
os.path.join(uchardet_dir, "CharDistribution.cpp"),
os.path.join(uchardet_dir, "JpCntx.cpp"),
os.path.join(uchardet_dir, "LangModels/LangArabicModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangBelarusianModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangBulgarianModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangCatalanModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangCroatianModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangCzechModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangDanishModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangEnglishModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangEsperantoModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangEstonianModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangFinnishModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangFrenchModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangDanishModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangGeorgianModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangGermanModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangGreekModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangHungarianModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangHebrewModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangHindiModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangHungarianModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangIrishModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangItalianModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangLithuanianModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangLatvianModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangLithuanianModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangMacedonianModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangMalteseModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangNorwegianModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangPolishModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangPortugueseModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangRomanianModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangRussianModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangSerbianModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangSlovakModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangSloveneModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangSwedishModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangSpanishModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangSwedishModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangThaiModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangTurkishModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangUkrainianModel.cpp"),
os.path.join(uchardet_dir, "LangModels/LangVietnameseModel.cpp"),
os.path.join(uchardet_dir, "nsHebrewProber.cpp"),
os.path.join(uchardet_dir, "nsCharSetProber.cpp"),
os.path.join(uchardet_dir, "CharDistribution.cpp"),
os.path.join(uchardet_dir, "JpCntx.cpp"),
os.path.join(uchardet_dir, "nsBig5Prober.cpp"),
os.path.join(uchardet_dir, "nsCharSetProber.cpp"),
os.path.join(uchardet_dir, "nsCJKDetector.cpp"),
os.path.join(uchardet_dir, "nsEscCharsetProber.cpp"),
os.path.join(uchardet_dir, "nsEscSM.cpp"),
os.path.join(uchardet_dir, "nsEUCJPProber.cpp"),
os.path.join(uchardet_dir, "nsEUCKRProber.cpp"),
os.path.join(uchardet_dir, "nsEUCTWProber.cpp"),
os.path.join(uchardet_dir, "nsEscCharsetProber.cpp"),
os.path.join(uchardet_dir, "nsEscSM.cpp"),
os.path.join(uchardet_dir, "nsGB2312Prober.cpp"),
os.path.join(uchardet_dir, "nsHebrewProber.cpp"),
os.path.join(uchardet_dir, "nsJohabProber.cpp"),
os.path.join(uchardet_dir, "nsLanguageDetector.cpp"),
os.path.join(uchardet_dir, "nsLatin1Prober.cpp"),
os.path.join(uchardet_dir, "nsMBCSGroupProber.cpp"),
os.path.join(uchardet_dir, "nsMBCSSM.cpp"),
os.path.join(uchardet_dir, "nsSBCSGroupProber.cpp"),
os.path.join(uchardet_dir, "nsSBCharSetProber.cpp"),
os.path.join(uchardet_dir, "nsSBCSGroupProber.cpp"),
os.path.join(uchardet_dir, "nsSJISProber.cpp"),
os.path.join(uchardet_dir, "nsUTF8Prober.cpp"),
os.path.join(uchardet_dir, "nsLatin1Prober.cpp"),
os.path.join(uchardet_dir, "nsUniversalDetector.cpp"),
os.path.join(uchardet_dir, "nsUTF8Prober.cpp"),
os.path.join(uchardet_dir, "uchardet.cpp"),
]
sources += uchardet_sources
Expand Down
8 changes: 6 additions & 2 deletions src/cchardet/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from . import _cchardet

version = (2, 2, 0, "alpha", 2)
__version__ = "2.2.0a2"
version = (2, 2, 0, "alpha", 3)
__version__ = "2.2.0a3"


def detect(msg):
Expand All @@ -17,6 +17,10 @@ def detect(msg):
encoding, confidence = _cchardet.detect_with_confidence(msg)
if isinstance(encoding, bytes):
encoding = encoding.decode()

if encoding == "MAC-CENTRALEUROPE":
encoding = "maccentraleurope"

return {"encoding": encoding, "confidence": confidence}


Expand Down
40 changes: 32 additions & 8 deletions src/cchardet/_cchardet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,24 @@

cdef extern from *:
ctypedef char* const_char_ptr "const char*"
ctypedef unsigned long size_t

# uchardet v0.0.8
cdef extern from "uchardet.h":
ctypedef void* uchardet_t
cdef uchardet_t uchardet_new()
cdef void uchardet_delete(uchardet_t ud)
cdef int uchardet_handle_data(uchardet_t ud, const_char_ptr data, int length)
cdef int uchardet_handle_data(uchardet_t ud, const_char_ptr data, size_t length)
cdef void uchardet_data_end(uchardet_t ud)
cdef void uchardet_reset(uchardet_t ud)
cdef const_char_ptr uchardet_get_charset(uchardet_t ud)
cdef float uchardet_get_confidence(uchardet_t ud)
cdef float uchardet_get_confidence(uchardet_t ud, size_t i)
# cdef const_char_ptr uchardet_get_encoding(uchardet_t ud, size_t i)
# cdef const_char_ptr uchardet_get_language(uchardet_t ud, size_t i)

def detect_with_confidence(bytes msg):
cdef int length = len(msg)
cdef size_t length = len(msg)

cdef uchardet_t ud = uchardet_new()

cdef int result = uchardet_handle_data(ud, msg, length)
Expand All @@ -26,8 +30,17 @@ def detect_with_confidence(bytes msg):

uchardet_data_end(ud)

cdef bytes detected_charset = uchardet_get_charset(ud)
cdef float detected_confidence = uchardet_get_confidence(ud)
cdef bytes detected_charset
# cdef bytes detected_encoding
# cdef const_char_ptr detected_language
cdef float detected_confidence

detected_charset = uchardet_get_charset(ud)
# detected_encoding = uchardet_get_encoding(ud, 0)
# detected_language = uchardet_get_language(ud, 0)
detected_confidence = uchardet_get_confidence(ud, 0)

uchardet_reset(ud)
uchardet_delete(ud)

if detected_charset:
Expand All @@ -40,20 +53,26 @@ cdef class UniversalDetector:
cdef int _done
cdef int _closed
cdef bytes _detected_charset
# cdef bytes _detected_encoding
# cdef const_char_ptr _detected_language
cdef float _detected_confidence

def __init__(self):
self._ud = uchardet_new()
self._done = 0
self._closed = 0
self._detected_charset = b""
# self._detected_encoding = b""
# self._detected_language = b""
self._detected_confidence = 0.0

def reset(self):
if not self._closed:
self._done = 0
self._closed = 0
self._detected_charset = b""
# self._detected_encoding = b""
# self._detected_language = b""
self._detected_confidence = 0.0
uchardet_reset(self._ud)

Expand All @@ -76,13 +95,18 @@ cdef class UniversalDetector:
self._done = 1

self._detected_charset = uchardet_get_charset(self._ud)
self._detected_confidence = uchardet_get_confidence(self._ud)
# self._detected_encoding = uchardet_get_encoding(self._ud, 0)
# self._detected_language = uchardet_get_language(self._ud, 0)
self._detected_confidence = uchardet_get_confidence(self._ud, 0)

def close(self):
if not self._closed:
uchardet_data_end(self._ud)

self._detected_charset = uchardet_get_charset(self._ud)
self._detected_confidence = uchardet_get_confidence(self._ud)
# self._detected_encoding = uchardet_get_encoding(self._ud, 0)
# self._detected_language = uchardet_get_language(self._ud, 0)
self._detected_confidence = uchardet_get_confidence(self._ud, 0)

uchardet_delete(self._ud)
self._closed = 1
Expand Down

0 comments on commit af1bf1d

Please sign in to comment.