feat: uchardetの最新版に追従する

PyYoshi · Jun 11, 2024 · commit af1bf1d
1 parent 502170a
Show file tree

Hide file tree

Showing 3 changed files with 63 additions and 23 deletions.
diff --git a/setup.py b/setup.py
@@ -12,54 +12,66 @@
 sources = cchardet_sources
 
 uchardet_sources = [
-    os.path.join(uchardet_dir, "CharDistribution.cpp"),
-    os.path.join(uchardet_dir, "JpCntx.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangArabicModel.cpp"),
+    os.path.join(uchardet_dir, "LangModels/LangBelarusianModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangBulgarianModel.cpp"),
+    os.path.join(uchardet_dir, "LangModels/LangCatalanModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangCroatianModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangCzechModel.cpp"),
+    os.path.join(uchardet_dir, "LangModels/LangDanishModel.cpp"),
+    os.path.join(uchardet_dir, "LangModels/LangEnglishModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangEsperantoModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangEstonianModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangFinnishModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangFrenchModel.cpp"),
-    os.path.join(uchardet_dir, "LangModels/LangDanishModel.cpp"),
+    os.path.join(uchardet_dir, "LangModels/LangGeorgianModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangGermanModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangGreekModel.cpp"),
-    os.path.join(uchardet_dir, "LangModels/LangHungarianModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangHebrewModel.cpp"),
+    os.path.join(uchardet_dir, "LangModels/LangHindiModel.cpp"),
+    os.path.join(uchardet_dir, "LangModels/LangHungarianModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangIrishModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangItalianModel.cpp"),
-    os.path.join(uchardet_dir, "LangModels/LangLithuanianModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangLatvianModel.cpp"),
+    os.path.join(uchardet_dir, "LangModels/LangLithuanianModel.cpp"),
+    os.path.join(uchardet_dir, "LangModels/LangMacedonianModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangMalteseModel.cpp"),
+    os.path.join(uchardet_dir, "LangModels/LangNorwegianModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangPolishModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangPortugueseModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangRomanianModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangRussianModel.cpp"),
+    os.path.join(uchardet_dir, "LangModels/LangSerbianModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangSlovakModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangSloveneModel.cpp"),
-    os.path.join(uchardet_dir, "LangModels/LangSwedishModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangSpanishModel.cpp"),
+    os.path.join(uchardet_dir, "LangModels/LangSwedishModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangThaiModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangTurkishModel.cpp"),
+    os.path.join(uchardet_dir, "LangModels/LangUkrainianModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangVietnameseModel.cpp"),
-    os.path.join(uchardet_dir, "nsHebrewProber.cpp"),
-    os.path.join(uchardet_dir, "nsCharSetProber.cpp"),
+    os.path.join(uchardet_dir, "CharDistribution.cpp"),
+    os.path.join(uchardet_dir, "JpCntx.cpp"),
     os.path.join(uchardet_dir, "nsBig5Prober.cpp"),
+    os.path.join(uchardet_dir, "nsCharSetProber.cpp"),
+    os.path.join(uchardet_dir, "nsCJKDetector.cpp"),
+    os.path.join(uchardet_dir, "nsEscCharsetProber.cpp"),
+    os.path.join(uchardet_dir, "nsEscSM.cpp"),
     os.path.join(uchardet_dir, "nsEUCJPProber.cpp"),
     os.path.join(uchardet_dir, "nsEUCKRProber.cpp"),
     os.path.join(uchardet_dir, "nsEUCTWProber.cpp"),
-    os.path.join(uchardet_dir, "nsEscCharsetProber.cpp"),
-    os.path.join(uchardet_dir, "nsEscSM.cpp"),
     os.path.join(uchardet_dir, "nsGB2312Prober.cpp"),
+    os.path.join(uchardet_dir, "nsHebrewProber.cpp"),
+    os.path.join(uchardet_dir, "nsJohabProber.cpp"),
+    os.path.join(uchardet_dir, "nsLanguageDetector.cpp"),
+    os.path.join(uchardet_dir, "nsLatin1Prober.cpp"),
     os.path.join(uchardet_dir, "nsMBCSGroupProber.cpp"),
     os.path.join(uchardet_dir, "nsMBCSSM.cpp"),
-    os.path.join(uchardet_dir, "nsSBCSGroupProber.cpp"),
     os.path.join(uchardet_dir, "nsSBCharSetProber.cpp"),
+    os.path.join(uchardet_dir, "nsSBCSGroupProber.cpp"),
     os.path.join(uchardet_dir, "nsSJISProber.cpp"),
-    os.path.join(uchardet_dir, "nsUTF8Prober.cpp"),
-    os.path.join(uchardet_dir, "nsLatin1Prober.cpp"),
     os.path.join(uchardet_dir, "nsUniversalDetector.cpp"),
+    os.path.join(uchardet_dir, "nsUTF8Prober.cpp"),
     os.path.join(uchardet_dir, "uchardet.cpp"),
 ]
 sources += uchardet_sources

diff --git a/src/cchardet/__init__.py b/src/cchardet/__init__.py
@@ -1,7 +1,7 @@
 from . import _cchardet
 
-version = (2, 2, 0, "alpha", 2)
-__version__ = "2.2.0a2"
+version = (2, 2, 0, "alpha", 3)
+__version__ = "2.2.0a3"
 
 
 def detect(msg):
@@ -17,6 +17,10 @@ def detect(msg):
     encoding, confidence = _cchardet.detect_with_confidence(msg)
     if isinstance(encoding, bytes):
         encoding = encoding.decode()
+
+    if encoding == "MAC-CENTRALEUROPE":
+        encoding = "maccentraleurope"
+
     return {"encoding": encoding, "confidence": confidence}
 
 

diff --git a/src/cchardet/_cchardet.pyx b/src/cchardet/_cchardet.pyx
@@ -3,20 +3,24 @@
 
 cdef extern from *:
     ctypedef char* const_char_ptr "const char*"
+    ctypedef unsigned long size_t
 
+# uchardet v0.0.8
 cdef extern from "uchardet.h":
     ctypedef void* uchardet_t
     cdef uchardet_t uchardet_new()
     cdef void uchardet_delete(uchardet_t ud)
-    cdef int uchardet_handle_data(uchardet_t ud, const_char_ptr data, int length)
+    cdef int uchardet_handle_data(uchardet_t ud, const_char_ptr data, size_t length)
     cdef void uchardet_data_end(uchardet_t ud)
     cdef void uchardet_reset(uchardet_t ud)
     cdef const_char_ptr uchardet_get_charset(uchardet_t ud)
-    cdef float uchardet_get_confidence(uchardet_t ud)
+    cdef float uchardet_get_confidence(uchardet_t ud, size_t i)
+    # cdef const_char_ptr uchardet_get_encoding(uchardet_t ud, size_t i)
+    # cdef const_char_ptr uchardet_get_language(uchardet_t ud, size_t i)
 
 def detect_with_confidence(bytes msg):
-    cdef int length = len(msg)
-    
+    cdef size_t length = len(msg)
+
     cdef uchardet_t ud = uchardet_new()
 
     cdef int result = uchardet_handle_data(ud, msg, length)
@@ -26,8 +30,17 @@ def detect_with_confidence(bytes msg):
 
     uchardet_data_end(ud)
 
-    cdef bytes detected_charset = uchardet_get_charset(ud)
-    cdef float detected_confidence = uchardet_get_confidence(ud)
+    cdef bytes detected_charset
+    # cdef bytes detected_encoding
+    # cdef const_char_ptr detected_language
+    cdef float detected_confidence
+
+    detected_charset = uchardet_get_charset(ud)
+    # detected_encoding = uchardet_get_encoding(ud, 0)
+    # detected_language = uchardet_get_language(ud, 0)
+    detected_confidence = uchardet_get_confidence(ud, 0)
+
+    uchardet_reset(ud)
     uchardet_delete(ud)
 
     if detected_charset:
@@ -40,20 +53,26 @@ cdef class UniversalDetector:
     cdef int _done
     cdef int _closed
     cdef bytes _detected_charset
+    # cdef bytes _detected_encoding
+    # cdef const_char_ptr _detected_language
     cdef float _detected_confidence
 
     def __init__(self):
         self._ud = uchardet_new()
         self._done = 0
         self._closed = 0
         self._detected_charset = b""
+        # self._detected_encoding = b""
+        # self._detected_language = b""
         self._detected_confidence = 0.0
 
     def reset(self):
         if not self._closed:
             self._done = 0
             self._closed = 0
             self._detected_charset = b""
+            # self._detected_encoding = b""
+            # self._detected_language = b""
             self._detected_confidence = 0.0
             uchardet_reset(self._ud)
 
@@ -76,13 +95,18 @@ cdef class UniversalDetector:
                 self._done = 1
 
             self._detected_charset = uchardet_get_charset(self._ud)
-            self._detected_confidence = uchardet_get_confidence(self._ud)
+            # self._detected_encoding = uchardet_get_encoding(self._ud, 0)
+            # self._detected_language = uchardet_get_language(self._ud, 0)
+            self._detected_confidence = uchardet_get_confidence(self._ud, 0)
 
     def close(self):
         if not self._closed:
             uchardet_data_end(self._ud)
+
             self._detected_charset = uchardet_get_charset(self._ud)
-            self._detected_confidence = uchardet_get_confidence(self._ud)
+            # self._detected_encoding = uchardet_get_encoding(self._ud, 0)
+            # self._detected_language = uchardet_get_language(self._ud, 0)
+            self._detected_confidence = uchardet_get_confidence(self._ud, 0)
 
             uchardet_delete(self._ud)
             self._closed = 1