From af1bf1d93da042184dd1bd38a15d83501f8aa4f3 Mon Sep 17 00:00:00 2001 From: PyYoshi Date: Tue, 11 Jun 2024 19:45:18 +0900 Subject: [PATCH] =?UTF-8?q?feat:=20uchardet=E3=81=AE=E6=9C=80=E6=96=B0?= =?UTF-8?q?=E7=89=88=E3=81=AB=E8=BF=BD=E5=BE=93=E3=81=99=E3=82=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- setup.py | 38 +++++++++++++++++++++++------------- src/cchardet/__init__.py | 8 ++++++-- src/cchardet/_cchardet.pyx | 40 ++++++++++++++++++++++++++++++-------- 3 files changed, 63 insertions(+), 23 deletions(-) diff --git a/setup.py b/setup.py index a24ac72..964f0be 100644 --- a/setup.py +++ b/setup.py @@ -12,54 +12,66 @@ sources = cchardet_sources uchardet_sources = [ - os.path.join(uchardet_dir, "CharDistribution.cpp"), - os.path.join(uchardet_dir, "JpCntx.cpp"), os.path.join(uchardet_dir, "LangModels/LangArabicModel.cpp"), + os.path.join(uchardet_dir, "LangModels/LangBelarusianModel.cpp"), os.path.join(uchardet_dir, "LangModels/LangBulgarianModel.cpp"), + os.path.join(uchardet_dir, "LangModels/LangCatalanModel.cpp"), os.path.join(uchardet_dir, "LangModels/LangCroatianModel.cpp"), os.path.join(uchardet_dir, "LangModels/LangCzechModel.cpp"), + os.path.join(uchardet_dir, "LangModels/LangDanishModel.cpp"), + os.path.join(uchardet_dir, "LangModels/LangEnglishModel.cpp"), os.path.join(uchardet_dir, "LangModels/LangEsperantoModel.cpp"), os.path.join(uchardet_dir, "LangModels/LangEstonianModel.cpp"), os.path.join(uchardet_dir, "LangModels/LangFinnishModel.cpp"), os.path.join(uchardet_dir, "LangModels/LangFrenchModel.cpp"), - os.path.join(uchardet_dir, "LangModels/LangDanishModel.cpp"), + os.path.join(uchardet_dir, "LangModels/LangGeorgianModel.cpp"), os.path.join(uchardet_dir, "LangModels/LangGermanModel.cpp"), os.path.join(uchardet_dir, "LangModels/LangGreekModel.cpp"), - os.path.join(uchardet_dir, "LangModels/LangHungarianModel.cpp"), os.path.join(uchardet_dir, "LangModels/LangHebrewModel.cpp"), + os.path.join(uchardet_dir, "LangModels/LangHindiModel.cpp"), + os.path.join(uchardet_dir, "LangModels/LangHungarianModel.cpp"), os.path.join(uchardet_dir, "LangModels/LangIrishModel.cpp"), os.path.join(uchardet_dir, "LangModels/LangItalianModel.cpp"), - os.path.join(uchardet_dir, "LangModels/LangLithuanianModel.cpp"), os.path.join(uchardet_dir, "LangModels/LangLatvianModel.cpp"), + os.path.join(uchardet_dir, "LangModels/LangLithuanianModel.cpp"), + os.path.join(uchardet_dir, "LangModels/LangMacedonianModel.cpp"), os.path.join(uchardet_dir, "LangModels/LangMalteseModel.cpp"), + os.path.join(uchardet_dir, "LangModels/LangNorwegianModel.cpp"), os.path.join(uchardet_dir, "LangModels/LangPolishModel.cpp"), os.path.join(uchardet_dir, "LangModels/LangPortugueseModel.cpp"), os.path.join(uchardet_dir, "LangModels/LangRomanianModel.cpp"), os.path.join(uchardet_dir, "LangModels/LangRussianModel.cpp"), + os.path.join(uchardet_dir, "LangModels/LangSerbianModel.cpp"), os.path.join(uchardet_dir, "LangModels/LangSlovakModel.cpp"), os.path.join(uchardet_dir, "LangModels/LangSloveneModel.cpp"), - os.path.join(uchardet_dir, "LangModels/LangSwedishModel.cpp"), os.path.join(uchardet_dir, "LangModels/LangSpanishModel.cpp"), + os.path.join(uchardet_dir, "LangModels/LangSwedishModel.cpp"), os.path.join(uchardet_dir, "LangModels/LangThaiModel.cpp"), os.path.join(uchardet_dir, "LangModels/LangTurkishModel.cpp"), + os.path.join(uchardet_dir, "LangModels/LangUkrainianModel.cpp"), os.path.join(uchardet_dir, "LangModels/LangVietnameseModel.cpp"), - os.path.join(uchardet_dir, "nsHebrewProber.cpp"), - os.path.join(uchardet_dir, "nsCharSetProber.cpp"), + os.path.join(uchardet_dir, "CharDistribution.cpp"), + os.path.join(uchardet_dir, "JpCntx.cpp"), os.path.join(uchardet_dir, "nsBig5Prober.cpp"), + os.path.join(uchardet_dir, "nsCharSetProber.cpp"), + os.path.join(uchardet_dir, "nsCJKDetector.cpp"), + os.path.join(uchardet_dir, "nsEscCharsetProber.cpp"), + os.path.join(uchardet_dir, "nsEscSM.cpp"), os.path.join(uchardet_dir, "nsEUCJPProber.cpp"), os.path.join(uchardet_dir, "nsEUCKRProber.cpp"), os.path.join(uchardet_dir, "nsEUCTWProber.cpp"), - os.path.join(uchardet_dir, "nsEscCharsetProber.cpp"), - os.path.join(uchardet_dir, "nsEscSM.cpp"), os.path.join(uchardet_dir, "nsGB2312Prober.cpp"), + os.path.join(uchardet_dir, "nsHebrewProber.cpp"), + os.path.join(uchardet_dir, "nsJohabProber.cpp"), + os.path.join(uchardet_dir, "nsLanguageDetector.cpp"), + os.path.join(uchardet_dir, "nsLatin1Prober.cpp"), os.path.join(uchardet_dir, "nsMBCSGroupProber.cpp"), os.path.join(uchardet_dir, "nsMBCSSM.cpp"), - os.path.join(uchardet_dir, "nsSBCSGroupProber.cpp"), os.path.join(uchardet_dir, "nsSBCharSetProber.cpp"), + os.path.join(uchardet_dir, "nsSBCSGroupProber.cpp"), os.path.join(uchardet_dir, "nsSJISProber.cpp"), - os.path.join(uchardet_dir, "nsUTF8Prober.cpp"), - os.path.join(uchardet_dir, "nsLatin1Prober.cpp"), os.path.join(uchardet_dir, "nsUniversalDetector.cpp"), + os.path.join(uchardet_dir, "nsUTF8Prober.cpp"), os.path.join(uchardet_dir, "uchardet.cpp"), ] sources += uchardet_sources diff --git a/src/cchardet/__init__.py b/src/cchardet/__init__.py index aa32b3e..f616d7f 100644 --- a/src/cchardet/__init__.py +++ b/src/cchardet/__init__.py @@ -1,7 +1,7 @@ from . import _cchardet -version = (2, 2, 0, "alpha", 2) -__version__ = "2.2.0a2" +version = (2, 2, 0, "alpha", 3) +__version__ = "2.2.0a3" def detect(msg): @@ -17,6 +17,10 @@ def detect(msg): encoding, confidence = _cchardet.detect_with_confidence(msg) if isinstance(encoding, bytes): encoding = encoding.decode() + + if encoding == "MAC-CENTRALEUROPE": + encoding = "maccentraleurope" + return {"encoding": encoding, "confidence": confidence} diff --git a/src/cchardet/_cchardet.pyx b/src/cchardet/_cchardet.pyx index dcca080..27d9f55 100644 --- a/src/cchardet/_cchardet.pyx +++ b/src/cchardet/_cchardet.pyx @@ -3,20 +3,24 @@ cdef extern from *: ctypedef char* const_char_ptr "const char*" + ctypedef unsigned long size_t +# uchardet v0.0.8 cdef extern from "uchardet.h": ctypedef void* uchardet_t cdef uchardet_t uchardet_new() cdef void uchardet_delete(uchardet_t ud) - cdef int uchardet_handle_data(uchardet_t ud, const_char_ptr data, int length) + cdef int uchardet_handle_data(uchardet_t ud, const_char_ptr data, size_t length) cdef void uchardet_data_end(uchardet_t ud) cdef void uchardet_reset(uchardet_t ud) cdef const_char_ptr uchardet_get_charset(uchardet_t ud) - cdef float uchardet_get_confidence(uchardet_t ud) + cdef float uchardet_get_confidence(uchardet_t ud, size_t i) + # cdef const_char_ptr uchardet_get_encoding(uchardet_t ud, size_t i) + # cdef const_char_ptr uchardet_get_language(uchardet_t ud, size_t i) def detect_with_confidence(bytes msg): - cdef int length = len(msg) - + cdef size_t length = len(msg) + cdef uchardet_t ud = uchardet_new() cdef int result = uchardet_handle_data(ud, msg, length) @@ -26,8 +30,17 @@ def detect_with_confidence(bytes msg): uchardet_data_end(ud) - cdef bytes detected_charset = uchardet_get_charset(ud) - cdef float detected_confidence = uchardet_get_confidence(ud) + cdef bytes detected_charset + # cdef bytes detected_encoding + # cdef const_char_ptr detected_language + cdef float detected_confidence + + detected_charset = uchardet_get_charset(ud) + # detected_encoding = uchardet_get_encoding(ud, 0) + # detected_language = uchardet_get_language(ud, 0) + detected_confidence = uchardet_get_confidence(ud, 0) + + uchardet_reset(ud) uchardet_delete(ud) if detected_charset: @@ -40,6 +53,8 @@ cdef class UniversalDetector: cdef int _done cdef int _closed cdef bytes _detected_charset + # cdef bytes _detected_encoding + # cdef const_char_ptr _detected_language cdef float _detected_confidence def __init__(self): @@ -47,6 +62,8 @@ cdef class UniversalDetector: self._done = 0 self._closed = 0 self._detected_charset = b"" + # self._detected_encoding = b"" + # self._detected_language = b"" self._detected_confidence = 0.0 def reset(self): @@ -54,6 +71,8 @@ cdef class UniversalDetector: self._done = 0 self._closed = 0 self._detected_charset = b"" + # self._detected_encoding = b"" + # self._detected_language = b"" self._detected_confidence = 0.0 uchardet_reset(self._ud) @@ -76,13 +95,18 @@ cdef class UniversalDetector: self._done = 1 self._detected_charset = uchardet_get_charset(self._ud) - self._detected_confidence = uchardet_get_confidence(self._ud) + # self._detected_encoding = uchardet_get_encoding(self._ud, 0) + # self._detected_language = uchardet_get_language(self._ud, 0) + self._detected_confidence = uchardet_get_confidence(self._ud, 0) def close(self): if not self._closed: uchardet_data_end(self._ud) + self._detected_charset = uchardet_get_charset(self._ud) - self._detected_confidence = uchardet_get_confidence(self._ud) + # self._detected_encoding = uchardet_get_encoding(self._ud, 0) + # self._detected_language = uchardet_get_language(self._ud, 0) + self._detected_confidence = uchardet_get_confidence(self._ud, 0) uchardet_delete(self._ud) self._closed = 1