TLDR-693 other languages support (#458)

ispras · Jun 18, 2024 · 5102112 · 5102112
1 parent 6f270e1
commit 5102112
Show file tree

Hide file tree

Showing 8 changed files with 128 additions and 12 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -1,5 +1,7 @@
 ARG REPOSITORY="docker.io"
 FROM dedocproject/dedoc_p3.9_base:version_2023_08_28
+ARG LANGUAGES=""
+RUN for lang in $LANGUAGES; do apt install -y tesseract-ocr-$lang; done
 
 ENV PYTHONPATH "${PYTHONPATH}:/dedoc_root"
 ENV RESOURCES_PATH "/dedoc_root/resources"

diff --git a/dedoc/api/api_args.py b/dedoc/api/api_args.py
@@ -28,7 +28,7 @@ class QueryParameters:
     # pdf handling
     pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"],
                                     description="Extract text from a text layer of PDF or using OCR methods for image-like documents")
-    language: str = Form("rus+eng", enum=["rus+eng", "rus", "eng", "fra", "spa"], description="Recognition language")
+    language: str = Form("rus+eng", description="Recognition language ('rus+eng', 'rus', 'eng', 'fra', 'spa')")
     pages: str = Form(":", description='Page numbers range for reading PDF or images, "left:right" means read pages from left to right')
     is_one_column_document: str = Form("auto", enum=["auto", "true", "false"],
                                        description='One or multiple column document, "auto" - predict number of page columns automatically')

diff --git a/dedoc/api/web/index.html b/dedoc/api/web/index.html
@@ -129,14 +129,15 @@ <h4>PDF handling</h4>
                 </p>
 
                 <p>
-                    <label>
-                        <select name="language">
+                    <label> language
+                        <input name="language" list="language" size="8" placeholder="rus+eng">
+                        <datalist id="language">
+                            <option value="rus+eng" selected>rus+eng</option>
                             <option value="rus">rus</option>
                             <option value="eng">eng</option>
-                            <option value="rus+eng" selected>rus+eng</option>
                             <option value="fra">fra</option>
                             <option value="spa">spa</option>
-                        </select> language
+                        </datalist>
                     </label>
                 </p>
 

diff --git a/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py b/dedoc/structure_extractors/concrete_structure_extractors/default_structure_extractor.py
@@ -15,12 +15,12 @@ class DefaultStructureExtractor(AbstractStructureExtractor):
     from dedoc.structure_extractors.feature_extractors.list_features.prefix.bracket_prefix import BracketPrefix
     from dedoc.structure_extractors.feature_extractors.list_features.prefix.bullet_prefix import BulletPrefix
     from dedoc.structure_extractors.feature_extractors.list_features.prefix.dotted_prefix import DottedPrefix
-    from dedoc.structure_extractors.feature_extractors.list_features.prefix.letter_prefix import LetterPrefix
+    from dedoc.structure_extractors.feature_extractors.list_features.prefix.any_letter_prefix import AnyLetterPrefix
     from dedoc.structure_extractors.feature_extractors.list_features.prefix.prefix import LinePrefix
 
     document_type = "other"
 
-    prefix_list: List[LinePrefix] = [DottedPrefix, BracketPrefix, LetterPrefix, BulletPrefix]
+    prefix_list: List[LinePrefix] = [DottedPrefix, BracketPrefix, AnyLetterPrefix, BulletPrefix]
 
     def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> UnstructuredDocument:
         """
@@ -66,7 +66,7 @@ def get_hl_list_using_regexp(line: LineWithMeta, previous_line: Optional[LineWit
         from dedoc.structure_extractors.feature_extractors.list_features.prefix.bracket_prefix import BracketPrefix
         from dedoc.structure_extractors.feature_extractors.list_features.prefix.bullet_prefix import BulletPrefix
         from dedoc.structure_extractors.feature_extractors.list_features.prefix.dotted_prefix import DottedPrefix
-        from dedoc.structure_extractors.feature_extractors.list_features.prefix.letter_prefix import LetterPrefix
+        from dedoc.structure_extractors.feature_extractors.list_features.prefix.any_letter_prefix import AnyLetterPrefix
 
         prefix = get_prefix(DefaultStructureExtractor.prefix_list, line)
 
@@ -83,7 +83,7 @@ def get_hl_list_using_regexp(line: LineWithMeta, previous_line: Optional[LineWit
                 return HierarchyLevel(4, 1, False, line_type=HierarchyLevel.list_item)  # here is russian and english letters
             return HierarchyLevel(3, 1, False, line_type=HierarchyLevel.list_item)
 
-        if prefix.name == LetterPrefix.name:  # list like a)
+        if prefix.name == AnyLetterPrefix.name:  # list like a)
             return HierarchyLevel(4, 1, False, line_type=HierarchyLevel.list_item)
 
         if prefix.name == BulletPrefix.name:  # bullet list

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -5,7 +5,6 @@ services:
     mem_limit: 16G
     build:
       context: .
-      dockerfile: Dockerfile
     restart: always
     tty: true
     ports:
@@ -22,7 +21,6 @@ services:
       - dedoc
     build:
       context: .
-      dockerfile: Dockerfile
     tty: true
     environment:
       DOC_READER_HOST: "dedoc"

diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst
@@ -160,7 +160,7 @@ Below are the instructions for installing the package ``virtualenvwrapper``:
 
 
 Install trusted torch (verified version)
-----------------------------------------------
+----------------------------------------
 
 You can install a trusted library ``torch`` (as a verified version of the library, verified by tools developed by the Ivannikov Institute for System Programming of the Russian Academy of Sciences).
 

diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -230,6 +230,7 @@ For a document of unknown or unsupported domain there is an option to use defaul
    tutorials/add_new_doc_format
    tutorials/add_new_structure_type
    tutorials/creating_document_classes
+   tutorials/add_new_language
 
 
 .. toctree::

diff --git a/docs/source/tutorials/add_new_language.rst b/docs/source/tutorials/add_new_language.rst
@@ -0,0 +1,114 @@
+.. _add_language:
+
+Adding support for a new language to Dedoc
+==========================================
+
+By default, dedoc supports handling Russian and English languages.
+The most important part of language support is OCR (for images, PDF).
+If you don't need to parse images and PDF files, you don't need to do anything.
+
+To parse images with a new language, additional Tesseract language packages should be installed.
+The languages supported by Tesseract are listed `here <https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html>`_ (see the **Languages** section).
+
+.. seealso::
+    The Tesseract installation instructions can be found :ref:`here <install_tesseract>`.
+
+.. warning::
+    Not all languages are fully supported by dedoc, even with the Tesseract packages installed. More detailed information will appear soon.
+
+
+Add new language in docker
+--------------------------
+
+Similar to the :ref:`installation tutorial <dedoc_installation>`, one should first clone the dedoc repository and go to the ``dedoc`` directory:
+
+.. code-block:: bash
+
+    git clone https://github.com/ispras/dedoc
+    cd dedoc
+
+Then one should decide which languages should be supported and look for them in the
+`list of supported languages <https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html>`_ (**Languages** section).
+For each language, ``LangCode`` is used to configure it.
+For example, if we need to add French and Spanish, we should use ``fra`` and ``spa`` language codes.
+
+
+Using docker build
+******************
+
+To pass the list of languages when building the docker image, the ``LANGUAGES`` build argument is used.
+Languages should be listed in a single string and separated by spaces.
+For example, for adding French and Spanish we should use the following command:
+
+.. code-block:: bash
+
+    docker build --build-arg LANGUAGES="fra spa" .
+
+One may also choose a tag for an image, e.g. ``dedocproject/dedoc_multilang:latest``, and run the container:
+
+.. code-block:: bash
+
+    docker build -t dedocproject/dedoc_multilang:latest --build-arg LANGUAGES="fra spa" .
+    docker run -p 1231:1231 --rm dedocproject/dedoc_multilang python3 /dedoc_root/dedoc/main.py
+
+
+Using docker-compose
+********************
+
+To pass the list of languages when building the docker image, the ``LANGUAGES`` build argument is set in the ``docker-compose.yml`` file.
+Languages should be listed in a single string and separated by spaces.
+For example, for adding French and Spanish we should add the following lines to the ``docker-compose.yml`` file:
+
+.. code-block:: yaml
+    :emphasize-lines: 8-9
+
+    version: '2.4'
+
+    services:
+      dedoc:
+        mem_limit: 16G
+        build:
+          context: .
+          args:
+            LANGUAGES: "fra spa"
+        restart: always
+        tty: true
+        ports:
+          - 1231:1231
+        environment:
+          DOCREADER_PORT: 1231
+          GROBID_HOST: "grobid"
+          GROBID_PORT: 8070
+
+Then, the service can be run with the following command:
+
+.. code-block:: bash
+
+    docker-compose up --build
+
+
+Add new language locally
+------------------------
+
+Suppose Tesseract OCR 5 is already installed on the computer (otherwise, see the :ref:`installation instructions <install_tesseract>`).
+For each language, the following command should be executed (``lang`` is one language code):
+
+.. code-block:: bash
+
+    apt install -y tesseract-ocr-$lang
+
+For example, for adding French and Spanish we should use the following commands:
+
+.. code-block:: bash
+
+    apt install -y tesseract-ocr-fra
+    apt install -y tesseract-ocr-spa
+
+Or we can install all packages with one command using the ``LANGUAGES`` variable:
+
+.. code-block:: bash
+
+    export LANGUAGES="fra spa"
+    for lang in $LANGUAGES; do apt install -y tesseract-ocr-$lang; done
+
+Then the dedoc library can be used with the new languages, or the dedoc API can be run locally (see the :ref:`instructions <install_library_via_pip>` for more details).
-Original file line number
+Diff line change
@@ Expand Up @@
     Install trusted torch (verified version)
-    ----------------------------------------------
+    ----------------------------------------
     You can install a trusted library ``torch`` (as a verified version of the library, verified by tools developed by the Ivannikov Institute for System Programming of the Russian Academy of Sciences).
@@ Expand Down @@