
Commit

Fixed Dockerfile, tested tox for different environments and OpenMMLab AB#1006
borg committed Oct 14, 2024
1 parent 1d2d391 commit cb933c5
Showing 13 changed files with 373 additions and 127 deletions.
38 changes: 0 additions & 38 deletions .azure-pipelines/tox.yaml

This file was deleted.

11 changes: 11 additions & 0 deletions Makefile
@@ -0,0 +1,11 @@
docker-build: ## make docker-build SERVICE=htrflow TAG=v0.1.0
@docker build -t airiksarkivet/$(SERVICE):$(if $(TAG),$(TAG),latest) -f docker/$(SERVICE).dockerfile .

docker-tag: ## make docker-tag SERVICE=htrflow TAG=v0.1.0 REGISTRY=registry.ra.se:5002
@docker tag airiksarkivet/$(SERVICE):$(if $(TAG),$(TAG),latest) $(REGISTRY)/airiksarkivet/$(SERVICE):$(if $(TAG),$(TAG),latest)

docker-push: ## make docker-push SERVICE=htrflow TAG=v0.1.0 REGISTRY=registry.ra.se:5002
@docker push $(REGISTRY)/airiksarkivet/$(SERVICE):$(if $(TAG),$(TAG),latest)

docker-release: docker-build docker-tag docker-push ## make docker-release SERVICE=htrflow TAG=v0.1.0 REGISTRY=registry.ra.se:5002
@echo "Docker image built, tagged, and pushed successfully!"
23 changes: 23 additions & 0 deletions docker/htrflow_openmmlab.dockerfile
@@ -0,0 +1,23 @@
FROM huggingface/transformers-pytorch-gpu:4.41.2
COPY --from=ghcr.io/astral-sh/uv:latest /uv /bin/


WORKDIR /app

RUN uv venv --python 3.10.14


ADD uv.lock /app/uv.lock
ADD pyproject.toml /app/pyproject.toml
RUN --mount=type=cache,target=/root/.cache/uv \
uv sync --frozen --no-install-project

COPY src LICENSE README.md examples /app/

RUN --mount=type=cache,target=/root/.cache/uv \
uv sync --frozen

RUN uv pip install -U https://github.com/Swedish-National-Archives-AI-lab/openmim_install/raw/main/mmcv-2.0.0-cp310-cp310-manylinux1_x86_64.whl && \
uv pip install -U mmdet==3.1.0 mmengine==0.7.2 mmocr==1.0.1 yapf==0.40.1

ENV PATH="/app/.venv/bin:$PATH"
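With the Makefile above, this image is addressed by the dockerfile's basename under docker/. A minimal sketch of building and smoke-testing it, assuming the tag is illustrative and that the venv placed on PATH exposes the htrflow CLI:

```sh
# SERVICE must match docker/htrflow_openmmlab.dockerfile; --gpus all requires the NVIDIA container runtime
make docker-build SERVICE=htrflow_openmmlab TAG=v0.1.0
docker run --rm --gpus all airiksarkivet/htrflow_openmmlab:v0.1.0 htrflow --help
```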
10 changes: 7 additions & 3 deletions docs/getting_started/installation.md
@@ -13,6 +13,11 @@ Install HTRflow with [pip](https://pypi.org/project/htrflow):
pip install htrflow
```

Requirements:

- Python >=3.10 and <3.13 (Python 3.10 is required for OpenMMLab)
- With GPU: CUDA >=11.8 (required by PyTorch 2.0; HTRflow can still run on CPU)

Verify the installation of HTRflow with `htrflow --help`. If the installation was successful, the following message is shown:

<!-- termynal -->
@@ -53,12 +58,11 @@ Requirements:

- [uv](https://docs.astral.sh/uv/) or pip
- Python 3.10
- With GPU: CUDA >=11.8 (can still run on CPU)
- With GPU: CUDA >=11.8 (required by PyTorch 2.0; HTRflow can still run on CPU)

Clone this repository and run:
```sh
uv pip install -e .

uv pip install -e .  # or: uv sync
```
This will install the HTRflow package in a virtual environment.

15 changes: 4 additions & 11 deletions pyproject.toml
@@ -18,18 +18,19 @@ requires-python = ">=3.10"

dependencies = [
"jinja2 >= 3.1.3",
"numpy>= 2.1.2",
"numpy",
"opencv-python >=4.6.0",
"tqdm >=4.66.2,<5",
"xmlschema >=3.0.2,<4",
"typer>=0.12.0",
"rich >=13.7.1",
"jiwer >=3.0.4",
"pandas >=2.2.2",
"pandas",
"pagexml-tools >=0.5.0",
"transformers[torch] >=4.44.1",
"huggingface-hub[cli] >=0.24.6",
"ultralytics >=8.0.225",
"pydantic>=2.9.2",
]


@@ -83,7 +84,7 @@ markers = [
"gpu: marks tests as gpu (deselect with '-m \"not gpu\"')",
]
pythonpath = "src"
testpaths = ["tests/unit"]
testpaths = ["tests/unit", "tests/integration"]

[tool.coverage.run]
source = ["src/htrflow"]
@@ -133,11 +134,3 @@ warn_redundant_casts = true
warn_return_any = true
warn_unreachable = true
warn_unused_configs = true

[tool.tox]
env_list = ["3.12", "3.11","3.10"]

[tool.tox.env_run_base]
runner = "uv-venv-lock-runner"
description = "Run tests using uv with locked dependencies"
commands = [["pytest"]]
Empty file removed tests/integration/.gitkeep
Empty file.
14 changes: 14 additions & 0 deletions tests/integration/Dockerfile
@@ -0,0 +1,14 @@
# Dockerfile
FROM python:3.11

# Install Hera and Pydantic
RUN pip install hera-workflows pydantic

# Set the working directory
WORKDIR /app

# Copy only the test_hera.py file into the /app directory inside the container
COPY test_hera.py /app/test_hera.py

# Make sure /app is on the Python path
ENV PYTHONPATH=/app
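A hedged sketch of building and using this helper image locally, assuming a test_hera.py script sits in tests/integration/ next to the Dockerfile (the hera-test tag is illustrative):

```sh
# Build from the tests/integration context and run the copied script inside the container
docker build -t hera-test -f tests/integration/Dockerfile tests/integration
docker run --rm hera-test python /app/test_hera.py
```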
21 changes: 21 additions & 0 deletions tests/integration/data/test_gpu_htr_model_pipeline.yaml
@@ -0,0 +1,21 @@
# HTR pipeline

steps:

- step: TextRecognition
settings:
model: Satrn
model_settings:
model: Riksarkivet/satrn_htr
generation_settings:
batch_size: 1
num_beams: 1

- step: TextRecognition
settings:
model: TrOCR
model_settings:
model: Riksarkivet/trocr-base-handwritten-hist-swe-2
generation_settings:
batch_size: 1
num_beams: 1
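This YAML is the pipeline the GPU integration test below feeds to the CLI; invoked by hand with the same arguments as that test, the call looks like:

```sh
# Flags mirror test_gpu_htr_model_pipeline.py: batched output and a log file for the tox run
htrflow pipeline tests/integration/data/test_gpu_htr_model_pipeline.yaml \
    tests/integration/data/trocr_example.png \
    --batch-output 1 --logfile tox-test.log
```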
Binary file added tests/integration/data/trocr_example.png
33 changes: 33 additions & 0 deletions tests/integration/test_gpu_availability.py
@@ -0,0 +1,33 @@
import pytest
import torch
from hera.workflows import DAG, Task, Workflow, script, WorkflowsService


SELECTOR_ARGO_SERVER_URL = "http://localhost:2746"
SELECTOR_SERVICE_ACCOUNT = "htrflow-service-account"


@pytest.mark.gpu
def test_gpu_availability():
assert torch.cuda.is_available(), "CUDA GPU is not available"
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Number of GPUs: {torch.cuda.device_count()}")


@script(image="python:3.12")
def echo(message):
print(message)


with Workflow(
generate_name="dag-diamond-",
service_account_name=SELECTOR_SERVICE_ACCOUNT,
workflows_service=WorkflowsService(host=SELECTOR_ARGO_SERVER_URL),
entrypoint="diamond",
) as w:
with DAG(name="diamond"):
A = Task(name="A", template=echo, arguments={"message": "A"})
B = Task(name="B", template=echo, arguments={"message": "B"})
A >> B

w.submit()
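Note that the Hera workflow is built and submitted at import time, so collecting this module already requires the Argo server at SELECTOR_ARGO_SERVER_URL to be reachable. The CUDA check itself is gated behind the gpu marker declared in pyproject.toml and selected like this:

```sh
# Run only gpu-marked tests; needs a CUDA device and a reachable Argo server on localhost:2746
pytest -m gpu tests/integration/test_gpu_availability.py
```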
37 changes: 37 additions & 0 deletions tests/integration/test_gpu_htr_model_pipeline.py
@@ -0,0 +1,37 @@
import os
import pytest
from typer.testing import CliRunner
from htrflow.cli import app

runner = CliRunner()

image_path = "tests/integration/data/trocr_example.png"
pipeline_path = "tests/integration/data/test_gpu_htr_model_pipeline.yaml"


@pytest.fixture(scope="module")
def check_test_files():
assert os.path.exists(image_path), f"Test image not found: {image_path}"
assert os.path.exists(pipeline_path), f"Test pipeline YAML not found: {pipeline_path}"


@pytest.mark.gpu
def test_run_htr_pipeline(check_test_files):
result = runner.invoke(
app,
[
"pipeline",
pipeline_path,
image_path,
"--batch-output",
"1",
"--logfile",
"tox-test.log",
],
)

assert result.exit_code == 0, f"Pipeline did not exit successfully (exit code {result.exit_code})"
32 changes: 32 additions & 0 deletions tox.ini
@@ -0,0 +1,32 @@
[tox]
envlist = py312, py311, py310, openmmlab

[testenv]
description = "Run tests in base environments"
deps =
pytest
lorem
pytest-cov
commands = pytest -m "not gpu"

[testenv:openmmlab]
description = "Run tests with OpenMMLab packages"
deps =
mmcv @ https://github.com/Swedish-National-Archives-AI-lab/openmim_install/raw/main/mmcv-2.0.0-cp310-cp310-manylinux1_x86_64.whl
mmdet==3.1.0
mmengine==0.7.2
mmocr==1.0.1
yapf==0.40.1

commands =
uv run pytest -m gpu {posargs}

[testenv:py312]
basepython = python3.12

[testenv:py311]
basepython = python3.11

[testenv:py310]
basepython = python3.10
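This tox.ini supersedes the [tool.tox] table removed from pyproject.toml above. A typical local invocation, with extra pytest arguments forwarded to the openmmlab environment through {posargs}:

```sh
# CPU matrix first, then the GPU/OpenMMLab environment with an explicit test path passed to pytest
tox -e py312,py311,py310
tox -e openmmlab -- tests/integration/test_gpu_htr_model_pipeline.py
```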

