From af1bf1d93da042184dd1bd38a15d83501f8aa4f3 Mon Sep 17 00:00:00 2001
From: PyYoshi <myoshi321go@gmail.com>
Date: Tue, 11 Jun 2024 19:45:18 +0900
Subject: [PATCH] =?UTF-8?q?feat:=20uchardet=E3=81=AE=E6=9C=80=E6=96=B0?=
 =?UTF-8?q?=E7=89=88=E3=81=AB=E8=BF=BD=E5=BE=93=E3=81=99=E3=82=8B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 setup.py                   | 38 +++++++++++++++++++++++-------------
 src/cchardet/__init__.py   |  8 ++++++--
 src/cchardet/_cchardet.pyx | 40 ++++++++++++++++++++++++++++++--------
 3 files changed, 63 insertions(+), 23 deletions(-)

diff --git a/setup.py b/setup.py
index a24ac72..964f0be 100644
--- a/setup.py
+++ b/setup.py
@@ -12,54 +12,66 @@
 sources = cchardet_sources
 
 uchardet_sources = [
-    os.path.join(uchardet_dir, "CharDistribution.cpp"),
-    os.path.join(uchardet_dir, "JpCntx.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangArabicModel.cpp"),
+    os.path.join(uchardet_dir, "LangModels/LangBelarusianModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangBulgarianModel.cpp"),
+    os.path.join(uchardet_dir, "LangModels/LangCatalanModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangCroatianModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangCzechModel.cpp"),
+    os.path.join(uchardet_dir, "LangModels/LangDanishModel.cpp"),
+    os.path.join(uchardet_dir, "LangModels/LangEnglishModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangEsperantoModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangEstonianModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangFinnishModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangFrenchModel.cpp"),
-    os.path.join(uchardet_dir, "LangModels/LangDanishModel.cpp"),
+    os.path.join(uchardet_dir, "LangModels/LangGeorgianModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangGermanModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangGreekModel.cpp"),
-    os.path.join(uchardet_dir, "LangModels/LangHungarianModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangHebrewModel.cpp"),
+    os.path.join(uchardet_dir, "LangModels/LangHindiModel.cpp"),
+    os.path.join(uchardet_dir, "LangModels/LangHungarianModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangIrishModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangItalianModel.cpp"),
-    os.path.join(uchardet_dir, "LangModels/LangLithuanianModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangLatvianModel.cpp"),
+    os.path.join(uchardet_dir, "LangModels/LangLithuanianModel.cpp"),
+    os.path.join(uchardet_dir, "LangModels/LangMacedonianModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangMalteseModel.cpp"),
+    os.path.join(uchardet_dir, "LangModels/LangNorwegianModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangPolishModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangPortugueseModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangRomanianModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangRussianModel.cpp"),
+    os.path.join(uchardet_dir, "LangModels/LangSerbianModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangSlovakModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangSloveneModel.cpp"),
-    os.path.join(uchardet_dir, "LangModels/LangSwedishModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangSpanishModel.cpp"),
+    os.path.join(uchardet_dir, "LangModels/LangSwedishModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangThaiModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangTurkishModel.cpp"),
+    os.path.join(uchardet_dir, "LangModels/LangUkrainianModel.cpp"),
     os.path.join(uchardet_dir, "LangModels/LangVietnameseModel.cpp"),
-    os.path.join(uchardet_dir, "nsHebrewProber.cpp"),
-    os.path.join(uchardet_dir, "nsCharSetProber.cpp"),
+    os.path.join(uchardet_dir, "CharDistribution.cpp"),
+    os.path.join(uchardet_dir, "JpCntx.cpp"),
     os.path.join(uchardet_dir, "nsBig5Prober.cpp"),
+    os.path.join(uchardet_dir, "nsCharSetProber.cpp"),
+    os.path.join(uchardet_dir, "nsCJKDetector.cpp"),
+    os.path.join(uchardet_dir, "nsEscCharsetProber.cpp"),
+    os.path.join(uchardet_dir, "nsEscSM.cpp"),
     os.path.join(uchardet_dir, "nsEUCJPProber.cpp"),
     os.path.join(uchardet_dir, "nsEUCKRProber.cpp"),
     os.path.join(uchardet_dir, "nsEUCTWProber.cpp"),
-    os.path.join(uchardet_dir, "nsEscCharsetProber.cpp"),
-    os.path.join(uchardet_dir, "nsEscSM.cpp"),
     os.path.join(uchardet_dir, "nsGB2312Prober.cpp"),
+    os.path.join(uchardet_dir, "nsHebrewProber.cpp"),
+    os.path.join(uchardet_dir, "nsJohabProber.cpp"),
+    os.path.join(uchardet_dir, "nsLanguageDetector.cpp"),
+    os.path.join(uchardet_dir, "nsLatin1Prober.cpp"),
     os.path.join(uchardet_dir, "nsMBCSGroupProber.cpp"),
     os.path.join(uchardet_dir, "nsMBCSSM.cpp"),
-    os.path.join(uchardet_dir, "nsSBCSGroupProber.cpp"),
     os.path.join(uchardet_dir, "nsSBCharSetProber.cpp"),
+    os.path.join(uchardet_dir, "nsSBCSGroupProber.cpp"),
     os.path.join(uchardet_dir, "nsSJISProber.cpp"),
-    os.path.join(uchardet_dir, "nsUTF8Prober.cpp"),
-    os.path.join(uchardet_dir, "nsLatin1Prober.cpp"),
     os.path.join(uchardet_dir, "nsUniversalDetector.cpp"),
+    os.path.join(uchardet_dir, "nsUTF8Prober.cpp"),
     os.path.join(uchardet_dir, "uchardet.cpp"),
 ]
 sources += uchardet_sources
diff --git a/src/cchardet/__init__.py b/src/cchardet/__init__.py
index aa32b3e..f616d7f 100644
--- a/src/cchardet/__init__.py
+++ b/src/cchardet/__init__.py
@@ -1,7 +1,7 @@
 from . import _cchardet
 
-version = (2, 2, 0, "alpha", 2)
-__version__ = "2.2.0a2"
+version = (2, 2, 0, "alpha", 3)
+__version__ = "2.2.0a3"
 
 
 def detect(msg):
@@ -17,6 +17,10 @@ def detect(msg):
     encoding, confidence = _cchardet.detect_with_confidence(msg)
     if isinstance(encoding, bytes):
         encoding = encoding.decode()
+
+    if encoding == "MAC-CENTRALEUROPE":
+        encoding = "maccentraleurope"
+
     return {"encoding": encoding, "confidence": confidence}
 
 
diff --git a/src/cchardet/_cchardet.pyx b/src/cchardet/_cchardet.pyx
index dcca080..27d9f55 100644
--- a/src/cchardet/_cchardet.pyx
+++ b/src/cchardet/_cchardet.pyx
@@ -3,20 +3,24 @@
 
 cdef extern from *:
     ctypedef char* const_char_ptr "const char*"
+    ctypedef unsigned long size_t
 
+# C API declarations for uchardet.h, written against uchardet v0.0.8.
 cdef extern from "uchardet.h":
     ctypedef void* uchardet_t
     cdef uchardet_t uchardet_new()
     cdef void uchardet_delete(uchardet_t ud)
-    cdef int uchardet_handle_data(uchardet_t ud, const_char_ptr data, int length)
+    cdef int uchardet_handle_data(uchardet_t ud, const_char_ptr data, size_t length)  # detect_with_confidence passes len(msg) as length
     cdef void uchardet_data_end(uchardet_t ud)
     cdef void uchardet_reset(uchardet_t ud)
     cdef const_char_ptr uchardet_get_charset(uchardet_t ud)
-    cdef float uchardet_get_confidence(uchardet_t ud)
+    cdef float uchardet_get_confidence(uchardet_t ud, size_t i)  # every call in this file passes i=0; presumably a candidate index - TODO confirm against uchardet.h
+    # Not bound yet, kept for reference: cdef const_char_ptr uchardet_get_encoding(uchardet_t ud, size_t i)
+    # Not bound yet, kept for reference: cdef const_char_ptr uchardet_get_language(uchardet_t ud, size_t i)
 
 def detect_with_confidence(bytes msg):
-    cdef int length = len(msg)
-    
+    cdef size_t length = len(msg)
+
     cdef uchardet_t ud = uchardet_new()
 
     cdef int result = uchardet_handle_data(ud, msg, length)
@@ -26,8 +30,17 @@ def detect_with_confidence(bytes msg):
 
     uchardet_data_end(ud)
 
-    cdef bytes detected_charset = uchardet_get_charset(ud)
-    cdef float detected_confidence = uchardet_get_confidence(ud)
+    cdef bytes detected_charset
+    # cdef bytes detected_encoding
+    # cdef const_char_ptr detected_language
+    cdef float detected_confidence
+
+    detected_charset = uchardet_get_charset(ud)
+    # detected_encoding = uchardet_get_encoding(ud, 0)
+    # detected_language = uchardet_get_language(ud, 0)
+    detected_confidence = uchardet_get_confidence(ud, 0)
+
+    uchardet_reset(ud)
     uchardet_delete(ud)
 
     if detected_charset:
@@ -40,6 +53,8 @@ cdef class UniversalDetector:
     cdef int _done
     cdef int _closed
     cdef bytes _detected_charset
+    # cdef bytes _detected_encoding
+    # cdef const_char_ptr _detected_language
     cdef float _detected_confidence
 
     def __init__(self):
@@ -47,6 +62,8 @@ cdef class UniversalDetector:
         self._done = 0
         self._closed = 0
         self._detected_charset = b""
+        # self._detected_encoding = b""
+        # self._detected_language = b""
         self._detected_confidence = 0.0
 
     def reset(self):
@@ -54,6 +71,8 @@ cdef class UniversalDetector:
             self._done = 0
             self._closed = 0
             self._detected_charset = b""
+            # self._detected_encoding = b""
+            # self._detected_language = b""
             self._detected_confidence = 0.0
             uchardet_reset(self._ud)
 
@@ -76,13 +95,18 @@ cdef class UniversalDetector:
                 self._done = 1
 
             self._detected_charset = uchardet_get_charset(self._ud)
-            self._detected_confidence = uchardet_get_confidence(self._ud)
+            # self._detected_encoding = uchardet_get_encoding(self._ud, 0)
+            # self._detected_language = uchardet_get_language(self._ud, 0)
+            self._detected_confidence = uchardet_get_confidence(self._ud, 0)
 
     def close(self):
         if not self._closed:
             uchardet_data_end(self._ud)
+
             self._detected_charset = uchardet_get_charset(self._ud)
-            self._detected_confidence = uchardet_get_confidence(self._ud)
+            # self._detected_encoding = uchardet_get_encoding(self._ud, 0)
+            # self._detected_language = uchardet_get_language(self._ud, 0)
+            self._detected_confidence = uchardet_get_confidence(self._ud, 0)
 
             uchardet_delete(self._ud)
             self._closed = 1