Skip to content

Commit

Permalink
Rewrite history with initial merge from F&T
Browse files Browse the repository at this point in the history
  • Loading branch information
lindenmckenzie committed Jun 13, 2023
1 parent 5db80b7 commit 3be65b2
Show file tree
Hide file tree
Showing 42 changed files with 2,978,986 additions and 0 deletions.
1 change: 1 addition & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
target/
6 changes: 6 additions & 0 deletions .env.local
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
BERLIN_API_PORT = 28700
BERLIN_API_HOST = "0.0.0.0"
BERLIN_API_DATA_LOCATION = "data/"
BERLIN_API_LOGGING_NAMESPACE = "dp_nlp_berlin_api"
BERLIN_API_GIT_COMMIT = "000000"
BERLIN_API_VERSION = "0.1.0"
23 changes: 23 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Environmental files
.env
*.example

# Caches
__pycache__
.pytest_cache

# IDE and macOS files
.idea
.DS_Store

# Configs
config.ini

# Generated files
/dist

# Others
flamegraph.svg
target/
.coverage
run_lint.sh
24 changes: 24 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Image for the Berlin API, served through the Flask CLI on port 28900.
FROM python:3.10-slim

# NOTE(review): git is presumably needed by Poetry for VCS-sourced
# dependencies -- confirm before removing it.
RUN apt-get update && apt-get install --no-install-recommends -y git \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Install dependencies into the image's interpreter instead of a virtualenv.
RUN pip install poetry
RUN poetry config virtualenvs.create false

RUN mkdir -p /usr/src/
WORKDIR /usr/src/

COPY app /usr/src/app
COPY data /usr/src/data
COPY api.py poetry.lock pyproject.toml /usr/src/

# `--without dev` is the supported replacement for the deprecated `--no-dev`
# flag; Poetry is installed unpinned above, so the old flag cannot be relied on.
RUN poetry install --without dev

EXPOSE 28900

ENV FLASK_APP=api.py

# Exec form: flask runs as PID 1 and therefore receives SIGTERM from
# `docker stop` directly; the shell form wrapped it in `/bin/sh -c`, which does
# not forward signals and ignores any extra arguments given to `docker run`.
ENTRYPOINT ["flask", "run", "--host", "0.0.0.0", "--port", "28900"]

11 changes: 11 additions & 0 deletions Dockerfile.concourse
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Image that installs the API from a pre-built wheel instead of from source.
FROM python:3.10

# Ship the data directory alongside the installed package.
RUN mkdir data
COPY data data/

# The wheel must already exist under dist/ in the build context.
# NOTE(review): presumably produced by `poetry build` (make build-bin) -- confirm.
COPY dist/dp_nlp_berlin_api-0.1.0-py3-none-any.whl .

RUN pip install dp_nlp_berlin_api-0.1.0-py3-none-any.whl

# Start the service by running the app.main module.
CMD ["python", "-m", "app.main"]

69 changes: 69 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Terminal colour escape sequences used by the `help` target. Captured once at
# parse time so `tput` is not re-run on every expansion.
GREEN := $(shell tput -Txterm setaf 2)
YELLOW := $(shell tput -Txterm setaf 3)
WHITE := $(shell tput -Txterm setaf 7)
CYAN := $(shell tput -Txterm setaf 6)
RESET := $(shell tput -Txterm sgr0)

# Non-empty when the named executable is already on PATH; `deps` uses these to
# decide whether anything has to be installed.
EXISTS_POETRY := $(shell command -v poetry 2> /dev/null)
# The service is a Flask application (see FLASK_APP below), so probe for the
# `flask` executable rather than `uvicorn`.
EXISTS_FLASK := $(shell command -v flask 2> /dev/null)

# Defaults; each can be overridden from the environment or the command line.
export BERLIN_API_PORT ?= 28900
export BERLIN_API_HOST ?= 0.0.0.0
export FLASK_APP ?= app/main.py

# `:=` samples the clock and git exactly once. With recursive `=` the shell
# command would be re-run on every expansion (including once per recipe for
# the exported environment), so START_TIME could differ between recipes.
export START_TIME := $(shell date +%s)
export BERLIN_API_GIT_COMMIT := $(shell git rev-parse HEAD)
export BERLIN_API_VERSION ?= 0.1.0

# Every target in this Makefile is a command rather than a file, so all of them
# are declared phony (including `format` and `run-container`); otherwise a file
# with the same name would silently disable the target.
.PHONY: all audit build build-bin deps format help lint run run-container test test-component

all: audit lint format

audit: deps ## Makes sure dep are installed and audits code for vulnerable dependencies
	poetry run safety check -i 51457

# Builds the `berlin_api` image. The commit and version build args come from
# BERLIN_API_GIT_COMMIT / BERLIN_API_VERSION, which this Makefile defines;
# GIT_COMMIT and VERSION are not set here and expanded to empty strings.
build: deps
	docker build --build-arg start_time="${START_TIME}" --build-arg commit="${BERLIN_API_GIT_COMMIT}" --build-arg version="${BERLIN_API_VERSION}" -t berlin_api .

# Builds the distributable package with Poetry.
build-bin: deps
	poetry build

# Runs only when the EXISTS_FLASK probe came back empty: installs Poetry with
# pip if the EXISTS_POETRY probe is empty too (configuring an in-project
# virtualenv), then installs the project dependencies quietly, repeating the
# install with normal output if the quiet run fails.
# The closing `fi` deliberately has no trailing backslash, so a line added
# after this recipe can never be swallowed into the shell command.
deps: ## Installs dependencies
	@if [ -z "$(EXISTS_FLASK)" ]; then \
		if [ -z "$(EXISTS_POETRY)" ]; then \
			pip -qq install poetry; \
			poetry config virtualenvs.in-project true; \
		fi; \
		poetry install --quiet || poetry install; \
	fi

lint: deps ## Lints code
	poetry run ruff .

run: deps ## Start the api locally on port 28900.
	FLASK_APP=${FLASK_APP} poetry run flask run --port ${BERLIN_API_PORT}

# Runs the image produced by `build`.
# The commit and version values come from BERLIN_API_GIT_COMMIT /
# BERLIN_API_VERSION, which this Makefile defines (GIT_COMMIT and VERSION are
# not set here and expanded to empty strings). The values are also exported
# under the BERLIN_API_ prefix because app/config.py loads its settings with
# envvar_prefix="BERLIN_API"; the unprefixed names are kept for compatibility.
run-container: deps
	docker run \
		--env START_TIME='${START_TIME}' \
		-e GIT_COMMIT="${BERLIN_API_GIT_COMMIT}" \
		-e VERSION="${BERLIN_API_VERSION}" \
		-e BERLIN_API_START_TIME="${START_TIME}" \
		-e BERLIN_API_GIT_COMMIT="${BERLIN_API_GIT_COMMIT}" \
		-e BERLIN_API_VERSION="${BERLIN_API_VERSION}" \
		-ti berlin_api

test: deps ## Runs all available tests and generates a coverage report located in htmlcov
	poetry run ./scripts/run_tests_unit.sh

test-component: deps ## Makes sure dep are installed and runs component tests
	poetry run pytest tests/api

format: deps ## Formats your code automatically.
	poetry run isort .
	poetry run black .

# Self-documenting help: prints a usage line, then scans the makefiles for
# `target: ... ## description` lines (listed as target + description) and for
# standalone `## heading` lines (listed as section headings).
help: ## Show this help.
	@echo ''
	@echo 'Usage:'
	@echo ' $(YELLOW)make$(RESET) $(GREEN)<target>$(RESET)'
	@echo ''
	@echo 'Targets:'
	@awk 'BEGIN {FS = ":.*?## "} \
		/^[a-zA-Z_-]+:.*?##.*$$/ {printf " $(YELLOW)%-20s$(GREEN)%s$(RESET)\n", $$1, $$2; next} \
		/^## .*$$/ {printf " $(CYAN)%s$(RESET)\n", substr($$1,4)}' $(MAKEFILE_LIST)

139 changes: 139 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
# dp-nlp-berlin-api

A Python microservice to wrap the Berlin package for identifying locations and tagging them with UN-LOCODEs and
ISO-3166-2 subdivisions.

## Setup

It is recommended that you use [Pyenv](https://github.com/pyenv/pyenv) to manage your Python installations.

### Configuration

| Environment variable | Default | Description
| ---------------------------- | --------- | -----------
| FLASK_APP | `app/main.py` | The Flask application module to run
| BERLIN_API_PORT | 28900 | The port to bind to
| BERLIN_API_HOST | `0.0.0.0` | The host to bind to
| BERLIN_API_DATA_LOCATION | "data/" | Data location
| BERLIN_API_LOGGING_NAMESPACE | "dp_nlp_berlin_api" | Logging namespace
| BERLIN_API_GIT_COMMIT | "000000" | Git commit
| BERLIN_API_VERSION | "0.1.0" | version

### Install Poetry
```
curl -sSL https://install.python-poetry.org | python3 -
poetry install
```

## Running

To run the app:

```
make run
```

## Testing

By default, all schemas in the `tests/schemas/valid` directory will be evaluated as part of the unit tests.
Any errors in these schemas will cause a failure.

To run the app's unit tests:

```
make test
```

To test the app's functionality:
```
make run
```

Then, in another terminal window/tab, navigate to a checked out copy of ONS/eq-survey-runner:

```
make test
```

## Usage

This will make an API available on port 28900. It serves simple requests of the
form:

```shell
curl 'http://localhost:28900/v1/berlin/search?q=house+prices+in+londo&state=gb' | jq
```

replacing `localhost` with the local endpoint (`jq` used for formatting).

This will return results of the form:

```json
{
"matches": [
{
"encoding": "UN-LOCODE",
"id": "ca:lod",
"key": "UN-LOCODE-ca:lod",
"words": [
"london"
]
},
{
"encoding": "UN-LOCODE",
"id": "us:ldn",
"key": "UN-LOCODE-us:ldn",
"words": [
"london"
]
}
...
]
}
```


## Description

Berlin is a location search engine which works on an in-memory collection of
all UN Locodes, subdivisions and states (countries). Here are the main
architectural highlights: On startup Berlin does a basic linguistic analysis of
the locations: split names into words, remove diacritics, transliterate
non-ASCII symbols to ASCII. For example, this allows us to find “Las Vegas”
when searching for “vegas”. It employs string interning in order to both
optimise memory usage and allow direct lookups for exact matches. If we can
resolve (parts of) the search term to an existing interned string, it means
that we have a location with this name in the database.

When the user submits the search term, Berlin first does a preliminary analysis
of the search term: 1) split into words and pairs of words 2) try to identify
the former as existing locations (can be resolved to existing interned strings)
and tag them as “exact matches”. This creates many search terms from the
original phrase. Next comes the pre-filtering step, where we do three things: 1) resolve exact
matches by direct lookup in the names and codes tables 2) do a prefix search
via a finite-state transducer 3) do a fuzzy search via a Levenshtein distance
enabled finite-state transducer. The pre-filtered results are passed through a
string-similarity evaluation algorithm and sorted by score. The results below a
threshold are truncated. A graph is built from the locations found during the
previous step in order to link them together hierarchically if possible. This
further boosts some locations. For example, if the user searches for “new york
UK” it will boost the location in Lincolnshire and it will show up higher than
New York city in the USA. It is also possible to restrict the search to a
specific country (which is enabled by default for the UK).

Berlin is able to find locations with a high degree of semantic accuracy. Speed
is roughly equal to 10-15 ms per every non-matching word (or typo) + 1 ms for
every exact match. A complex query of 8 words usually takes less than 100 ms
and all of the realistic queries in our test suite take less than 50 ms, while
the median is under 30 ms. Short queries containing an exact match (case
insensitive) are faster than 10 ms.

The architecture would make it easy to implement as-you-type search suggestions
that respond in under 10 milliseconds, if deemed desirable.


### License

Prepared by Flax & Teal Limited for ONS Digital (see LICENSE).
This API is based on [eq-questionnaire-validator](https://github.com/ONSdigital/eq-questionnaire-validator), a tool
from ONS Digital.
Empty file added app/__init__.py
Empty file.
25 changes: 25 additions & 0 deletions app/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import time
from dynaconf import Dynaconf

# Fallback start time (Unix seconds), used when no START_TIME setting is provided.
current_time = int(time.time())
# NOTE(review): this looks like a leftover debug print; it writes to stdout on
# every import. Kept so import-time output is unchanged -- confirm and remove.
print(current_time)

# Settings come from environment variables carrying the BERLIN_API_ prefix
# (and from a .env file, via load_dotenv). Dynaconf strips the prefix, so the
# BERLIN_API_HOST variable is looked up as "HOST".
settings = Dynaconf(
    envvar_prefix="BERLIN_API",
    load_dotenv=True,
)

settings.reload()

# Look these up WITHOUT the prefix, like every other setting below. The
# previous keys ("BERLIN_API_HOST" / "BERLIN_API_PORT") never matched the
# prefix-stripped names, so the environment was ignored and the defaults
# were always used.
HOST = settings.get("HOST", "0.0.0.0")
# NOTE(review): the README and Makefile give 28900 as the default port, while
# this fallback (and .env.local) use 28700 -- confirm which one is intended.
PORT = settings.get("PORT", 28700)

NAMESPACE = settings.get("LOGGING_NAMESPACE", "dp_nlp_berlin_api")
DATA_LOCATION = settings.get("DATA_LOCATION", "data/")

# VERSION = settings.get("VERSION", "0.1.0")
VERSION = "0.1.0"

# BERLIN_API_START_TIME name of the start_time variable
START_TIME = settings.get("START_TIME", current_time)
GIT_COMMIT = settings.get("GIT_COMMIT", "last_commit")
48 changes: 48 additions & 0 deletions app/healthcheck.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from app.config import START_TIME, GIT_COMMIT, VERSION
import sys
import time
from datetime import datetime

# Define the check statuses
OK = "OK"
WARNING = "WARNING"
ERROR = "ERROR"


class Healthcheck:
    """Snapshot of the service's health, serialisable through ``to_json``."""

    def __init__(self, status, checks):
        """Store ``status`` and ``checks`` and capture start time and version details.

        The start time is taken from the configured ``START_TIME`` (Unix
        seconds) and kept as a formatted string; ``build_time`` is the moment
        this object is created.
        """
        started_at = datetime.fromtimestamp(int(START_TIME))
        created_at = datetime.now()

        self.start_time = started_at.strftime('%Y-%m-%dT%H:%M:%S%z')
        self.status = status
        self.version = dict(
            version=VERSION,
            build_time=created_at,
            git_commit=GIT_COMMIT,
            language="python",
            language_version=sys.version,
        )
        self.checks = checks

    def to_json(self):
        """Return the health report as a plain dictionary."""
        return {
            "status": self.status,
            "version": self.version,
            "uptime": self.get_uptime(),
            "start_time": self.start_time,
            "checks": self.checks,
        }

    def get_uptime(self):
        """Return the milliseconds elapsed since ``start_time``, rounded to an integer."""
        now = time.time()
        # Parse the stored string back into whole Unix seconds.
        started_unix = int(datetime.fromisoformat(self.start_time).timestamp())
        return round((now - started_unix) * 1000)
21 changes: 21 additions & 0 deletions app/logger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from datetime import datetime
from app.config import NAMESPACE
import structlog


def configure_logging():
    """Apply the process-wide structlog configuration.

    Events pass through a UTC ISO timestamper and are then rendered as JSON;
    loggers are created by structlog's stdlib logger factory.
    """
    pipeline = [
        structlog.processors.TimeStamper(fmt="iso", utc=True),
        structlog.processors.JSONRenderer(),
    ]
    structlog.configure(
        processors=pipeline,
        context_class=structlog.threadlocal.wrap_dict(dict),
        logger_factory=structlog.stdlib.LoggerFactory(),
    )


def setup_logger():
    """Return a structlog logger with the namespace and a creation timestamp as initial values."""
    initial_values = {
        "namespace": NAMESPACE,
        # Evaluated once, when the logger is requested.
        "created_at": datetime.utcnow().isoformat(),
    }
    return structlog.get_logger(**initial_values)
17 changes: 17 additions & 0 deletions app/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from flask import Flask

from app.config import PORT, HOST
from app.views.berlin import berlin_blueprint
from app.views.health import health_blueprint


def create_app():
    """Create the Flask application and register the berlin and health blueprints."""
    flask_app = Flask(__name__)
    for blueprint in (berlin_blueprint, health_blueprint):
        flask_app.register_blueprint(blueprint)
    return flask_app


# When executed as a script or module, build the app and serve it on the
# configured host and port.
if __name__ == "__main__":
    application = create_app()
    application.run(port=PORT, host=HOST)
Loading

0 comments on commit 3be65b2

Please sign in to comment.