diff --git a/.dockerignore b/.dockerignore
index f59ec20..a54b3b3 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1 +1,2 @@
-*
\ No newline at end of file
+*
+!/src
\ No newline at end of file
diff --git a/Dockerfile.api b/Dockerfile.api
new file mode 100644
index 0000000..ea10d20
--- /dev/null
+++ b/Dockerfile.api
@@ -0,0 +1,27 @@
+FROM python:3.13-slim-bullseye
+
+USER root
+
+ARG INSTALL_GIT=false
+RUN if [ "$INSTALL_GIT" = "true" ]; then \
+    apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*; \
+    fi
+
+# Runtime dependency
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ffmpeg \
+    && rm -rf /var/lib/apt/lists/*
+
+# FIXME: should use markitdown from sources
+# The extras spec is quoted so the shell cannot glob-expand the brackets; the pip cache is not kept in the image
+RUN pip install --no-cache-dir markitdown "fastapi[standard]" uvicorn
+
+# Default USERID and GROUPID
+ARG USERID=10000
+ARG GROUPID=10000
+
+USER $USERID:$GROUPID
+
+COPY src/markitdown/api.py /src/markitdown/api.py
+
+ENTRYPOINT ["uvicorn", "src.markitdown.api:app", "--host", "0.0.0.0", "--port", "8000"]
diff --git a/README.md b/README.md
index 6bc91e6..ebd1d45 100644
--- a/README.md
+++ b/README.md
@@ -69,6 +69,46 @@ print(result.text_content)
 docker build -t markitdown:latest .
 docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
 ```
+
+### Web API
+
+You can also use MarkItDown via a REST endpoint. The Web API is built using FastAPI and can be run using Docker.
+
+#### Running the Web API
+
+1. Build the Docker image:
+
+```sh
+docker build -f Dockerfile.api -t markitdown-api:latest .
+```
+
+2. Run the Docker container:
+
+```sh
+docker run --rm -p 8000:8000 markitdown-api:latest
+```
+
+The Web API will be available at `http://localhost:8000`.
+
+#### Using the Web API
+
+The Web API provides a single endpoint `/convert` that accepts a file and returns the converted markdown.
+
+- **Endpoint:** `/convert`
+- **Method:** `POST`
+- **Request Body:** Multipart form data with a file field named `file`
+- **Response:** depends on the `Accept` header:
+  - `application/json`: the JSON serialization of the `DocumentConverterResult`,
+    i.e. an object with a `text_content` field containing the converted markdown
+    and (optionally) a `title` field containing the title of the document
+  - otherwise: a `text/markdown` response containing the converted markdown
+
+Example using `curl`:
+
+```sh
+curl -X POST "http://localhost:8000/convert" -F "file=@path-to-file.pdf"
+```
+
Batch Processing Multiple Files
diff --git a/src/markitdown/api.py b/src/markitdown/api.py
new file mode 100644
index 0000000..b4729fe
--- /dev/null
+++ b/src/markitdown/api.py
@@ -0,0 +1,80 @@
+"""FastAPI application exposing MarkItDown conversion as a REST endpoint."""
+
+from mimetypes import guess_extension
+from multiprocessing import Pool
+from os.path import splitext
+from shutil import copyfileobj
+from tempfile import NamedTemporaryFile
+from fastapi import FastAPI, Request, UploadFile, HTTPException
+from fastapi.responses import JSONResponse, Response
+from markitdown import MarkItDown
+
+
+def convert_simple(local_path: str, **kwargs):
+    """Convert the file at ``local_path`` with a fresh ``MarkItDown`` instance.
+
+    Called through the module-level process pool by ``convert_upload``; any
+    keyword arguments are forwarded unchanged to ``MarkItDown.convert``.
+    """
+    return MarkItDown().convert(local_path, **kwargs)
+
+
+# Conversions are delegated to this process pool, created at import time.
+pool = Pool()
+
+
+def convert_upload(upload_file: UploadFile):
+    """Convert an uploaded file and return the conversion result.
+
+    The upload is copied to a named temporary file (keeping the extension of
+    the client-supplied filename) and converted through the process pool.
+    """
+    # Extension of the client-supplied filename, used as the temp-file suffix
+    ext = splitext(upload_file.filename)[1] if upload_file.filename else None
+
+    # Extension hint guessed from the declared mimetype. A missing content
+    # type would make guess_extension() fail, and parameters such as
+    # "; charset=utf-8" would prevent a match, so both are handled here.
+    kwargs = {}
+    if upload_file.content_type:
+        mimetype = upload_file.content_type.split(";", 1)[0].strip()
+        file_extension = guess_extension(mimetype)
+        if file_extension:
+            # The key must be the literal parameter name, not the guessed value
+            kwargs["file_extension"] = file_extension
+
+    # Save the file locally to a temporary file. It will be deleted before this function exits
+    with NamedTemporaryFile(suffix=ext) as temp:
+        copyfileobj(upload_file.file, temp)
+        temp.flush()
+        upload_file.file.close()
+
+        return pool.apply(convert_simple, [temp.name], kwargs)
+
+
+app = FastAPI()
+
+
+@app.post("/convert")
+def convert(request: Request, file: UploadFile) -> Response:
+    """Convert the uploaded ``file`` to markdown.
+
+    Responds with JSON (``title`` and ``text_content``) when the ``Accept``
+    header asks for ``application/json``, and with ``text/markdown`` otherwise.
+    Raises a 400 error when no file is uploaded and a 500 error when the
+    conversion fails.
+    """
+    if not file.filename:
+        raise HTTPException(status_code=400, detail="No file uploaded")
+
+    try:
+        result = convert_upload(file)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e)) from e
+
+    if "application/json" in request.headers.get("Accept", "").lower():
+        # The result object itself is not JSON serializable, so expose its fields
+        return JSONResponse(
+            content={"title": result.title, "text_content": result.text_content}
+        )
+    return Response(content=result.text_content, media_type="text/markdown")