-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Rewrite history with initial merge from F&T
- Loading branch information
1 parent
5db80b7
commit 3be65b2
Showing
42 changed files
with
2,978,986 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
target/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
BERLIN_API_PORT = 28700 | ||
BERLIN_API_HOST = "0.0.0.0" | ||
BERLIN_API_DATA_LOCATION = "data/" | ||
BERLIN_API_LOGGING_NAMESPACE = "dp_nlp_berlin_api" | ||
BERLIN_API_GIT_COMMIT = "000000" | ||
BERLIN_API_VERSION = "0.1.0" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
# Environmental files | ||
.env | ||
*.example | ||
|
||
# Caches | ||
__pycache__ | ||
.pytest_cache | ||
|
||
# MAC | ||
.idea | ||
.DS_Store | ||
|
||
# Configs | ||
config.ini | ||
|
||
# Generated files | ||
/dist | ||
|
||
# Others | ||
flamegraph.svg | ||
target/ | ||
.coverage | ||
run_lint.sh |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# Image that installs the service with Poetry and serves it with `flask run`
# on port 28900.
FROM python:3.10-slim

# NOTE(review): git is presumably required by a VCS-based dependency - confirm
# against pyproject.toml. The apt caches are removed to keep the layer small.
RUN apt-get update && apt-get install --no-install-recommends -y git \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Dependencies go straight into the image's interpreter (no virtualenv).
RUN pip install poetry
RUN poetry config virtualenvs.create false

RUN mkdir -p /usr/src/
WORKDIR /usr/src/

COPY app /usr/src/app
COPY data /usr/src/data
COPY api.py poetry.lock pyproject.toml /usr/src/

# `--no-dev` has been deprecated since Poetry 1.2 in favour of `--without dev`.
# Poetry is installed unpinned above, so newer releases may reject the old flag.
RUN poetry install --without dev

EXPOSE 28900

ENV FLASK_APP=api.py

# Exec form: flask runs as PID 1 and receives SIGTERM from `docker stop`.
# The shell form wrapped it in `/bin/sh -c`, which does not forward signals.
ENTRYPOINT ["flask", "run", "--host", "0.0.0.0", "--port", "28900"]
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# Image that runs the service from a pre-built wheel instead of a source checkout.
FROM python:3.10

# Ship the data directory alongside the installed package.
RUN mkdir data
COPY data data/

# The wheel must already exist in dist/ when this image is built
# (presumably produced by `poetry build` - confirm against the build pipeline).
COPY dist/dp_nlp_berlin_api-0.1.0-py3-none-any.whl .

RUN pip install dp_nlp_berlin_api-0.1.0-py3-none-any.whl

# Start the service through the app.main module.
CMD ["python", "-m", "app.main"]
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
# Terminal colours used by the `help` target.
GREEN := $(shell tput -Txterm setaf 2)
YELLOW := $(shell tput -Txterm setaf 3)
WHITE := $(shell tput -Txterm setaf 7)
CYAN := $(shell tput -Txterm setaf 6)
RESET := $(shell tput -Txterm sgr0)

# Non-empty when the tool is already on PATH; `deps` uses these to skip installs.
EXISTS_POETRY := $(shell command -v poetry 2> /dev/null)
# The service is started with `flask run`, so probe for flask
# (this previously probed for uvicorn, which the project never invokes).
EXISTS_FLASK := $(shell command -v flask 2> /dev/null)

export BERLIN_API_PORT ?= 28900
export BERLIN_API_HOST ?= 0.0.0.0
export FLASK_APP ?= app/main.py

# Simply-expanded (:=) so the shell commands run once per make invocation and
# every recipe sees the same start time / commit.
export START_TIME := $(shell date +%s)
export BERLIN_API_GIT_COMMIT := $(shell git rev-parse HEAD)
export BERLIN_API_VERSION ?= 0.1.0

.PHONY: all audit build build-bin deps format help lint run run-container test test-component

all: audit lint format

audit: deps ## Makes sure dep are installed and audits code for vulnerable dependencies
	poetry run safety check -i 51457

# Uses the BERLIN_API_* variables defined above; the bare GIT_COMMIT / VERSION
# names used previously were never defined and always expanded to empty strings.
build: deps
	docker build --build-arg start_time="${START_TIME}" --build-arg commit="${BERLIN_API_GIT_COMMIT}" --build-arg version="${BERLIN_API_VERSION}" -t berlin_api .

build-bin: deps
	poetry build

deps: ## Installs dependencies
	@if [ -z "$(EXISTS_FLASK)" ]; then \
		if [ -z "$(EXISTS_POETRY)" ]; then \
			pip -qq install poetry; \
			poetry config virtualenvs.in-project true; \
		fi; \
		poetry install --quiet || poetry install; \
	fi

lint: deps ## Lints code
	poetry run ruff .

run: deps ## Start the api locally on port 28900.
	FLASK_APP=${FLASK_APP} poetry run flask run --port ${BERLIN_API_PORT}

# The app reads its settings with the BERLIN_API_ prefix, so the variables are
# passed under their prefixed names; unprefixed ones would be ignored.
run-container: deps
	docker run -e BERLIN_API_START_TIME='${START_TIME}' -e BERLIN_API_GIT_COMMIT="${BERLIN_API_GIT_COMMIT}" -e BERLIN_API_VERSION="${BERLIN_API_VERSION}" -ti berlin_api

test: deps ## Runs all available tests and generates a coverage report located in htmlcov
	poetry run ./scripts/run_tests_unit.sh

test-component: deps ## Makes sure dep are installed and runs component tests
	poetry run pytest tests/api

format: deps ## Formats your code automatically.
	poetry run isort .
	poetry run black .

# Prints every target that carries a `## description` suffix.
help: ## Show this help.
	@echo ''
	@echo 'Usage:'
	@echo ' ${YELLOW}make${RESET} ${GREEN}<target>${RESET}'
	@echo ''
	@echo 'Targets:'
	@awk 'BEGIN {FS = ":.*?## "} { \
		if (/^[a-zA-Z_-]+:.*?##.*$$/) {printf " ${YELLOW}%-20s${GREEN}%s${RESET}\n", $$1, $$2} \
		else if (/^## .*$$/) {printf " ${CYAN}%s${RESET}\n", substr($$1,4)} \
		}' $(MAKEFILE_LIST)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,139 @@ | ||
# dp-nlp-berlin-api | ||
|
||
A Python microservice to wrap the Berlin package for identifying locations and tagging them with UN-LOCODEs and | ||
ISO-3166-2 subdivisions. | ||
|
||
## Setup | ||
|
||
It is recommended that you use [Pyenv](https://github.com/pyenv/pyenv) to manage your Python installations. | ||
|
||
### Configuration | ||
|
||
| Environment variable | Default | Description | ||
| ---------------------------- | --------- | ----------- | ||
| FLASK_APP | `app/main.py` | The Flask application entry point | ||
| BERLIN_API_PORT | 28900 | The port to bind to | ||
| BERLIN_API_HOST | `0.0.0.0` | The host to bind to | ||
| BERLIN_API_DATA_LOCATION | "data/" | Data location | ||
| BERLIN_API_LOGGING_NAMESPACE | "dp_nlp_berlin_api" | Logging namespace | ||
| BERLIN_API_GIT_COMMIT | "000000" | Git commit | ||
| BERLIN_API_VERSION | "0.1.0" | version | ||
|
||
### Install Poetry | ||
``` | ||
curl -sSL https://install.python-poetry.org | python3 - | ||
poetry install | ||
``` | ||
|
||
## Running | ||
|
||
To run the app: | ||
|
||
``` | ||
make run | ||
``` | ||
|
||
## Testing | ||
|
||
By default, all schemas in the `tests/schemas/valid` directory will be evaluated as part of the unit tests. | ||
Any errors in these schemas will cause a failure. | ||
|
||
To run the app's unit tests: | ||
|
||
``` | ||
make test | ||
``` | ||
|
||
To test the app's functionality: | ||
``` | ||
make run | ||
``` | ||
|
||
Then, in another terminal window/tab, navigate to a checked out copy of ONS/eq-survey-runner: | ||
|
||
``` | ||
make test | ||
``` | ||
|
||
## Usage | ||
|
||
This will make an API available on port 28900. It serves simple requests of the | ||
form: | ||
|
||
```shell | ||
curl 'http://localhost:28900/v1/berlin/search?q=house+prices+in+londo&state=gb' | jq | ||
``` | ||
|
||
replacing `localhost` with the local endpoint (`jq` used for formatting). | ||
|
||
This will return results of the form: | ||
|
||
```json | ||
{ | ||
"matches": [ | ||
{ | ||
"encoding": "UN-LOCODE", | ||
"id": "ca:lod", | ||
"key": "UN-LOCODE-ca:lod", | ||
"words": [ | ||
"london" | ||
] | ||
}, | ||
{ | ||
"encoding": "UN-LOCODE", | ||
"id": "us:ldn", | ||
"key": "UN-LOCODE-us:ldn", | ||
"words": [ | ||
"london" | ||
] | ||
} | ||
... | ||
] | ||
} | ||
``` | ||
|
||
|
||
## Description | ||
|
||
Berlin is a location search engine which works on an in-memory collection of | ||
all UN Locodes, subdivisions and states (countries). Here are the main | ||
architectural highlights: On startup Berlin does a basic linguistic analysis of | ||
the locations: split names into words, remove diacritics, transliterate | ||
non-ASCII symbols to ASCII. For example, this allows us to find “Las Vegas” | ||
when searching for “vegas”. It employs string interning in order to both | ||
optimise memory usage and allow direct lookups for exact matches. If we can | ||
resolve (parts of) the search term to an existing interned string, it means | ||
that we have a location with this name in the database. | ||
|
||
When the user submits the search term, Berlin first does a preliminary analysis | ||
of the search term: 1) split into words and pairs of words 2) try to identify | ||
the former as existing locations (can be resolved to existing interned strings) | ||
and tag them as “exact matches”. This creates many search terms from the | ||
original phrase. Next comes the pre-filtering step, in which we do three things: 1) resolve exact | ||
matches by direct lookup in the names and codes tables 2) do a prefix search | ||
via a finite-state transducer 3) do a fuzzy search via a Levenshtein distance | ||
enabled finite-state transducer. The pre-filtered results are passed through a | ||
string-similarity evaluation algorithm and sorted by score. The results below a | ||
threshold are truncated. A graph is built from the locations found during the | ||
previous step in order to link them together hierarchically if possible. This | ||
further boosts some locations. For example, if the user searches for “new york | ||
UK” it will boost the location in Lincolnshire and it will show up higher than | ||
New York city in the USA. It is also possible to request search only in a | ||
specific country (which is enabled by default for the UK) | ||
|
||
Berlin is able to find locations with a high degree of semantic accuracy. Speed | ||
is roughly equal to 10-15 ms per every non-matching word (or typo) + 1 ms for | ||
every exact match. A complex query of 8 words usually takes less than 100 ms | ||
and all of the realistic queries in our test suite take less than 50 ms, while | ||
the median is under 30 ms. Short queries containing an exact match (case | ||
insensitive) are faster than 10 ms. | ||
|
||
The architecture would make it easy to implement as-you-type search suggestions | ||
in under 10 milliseconds if deemed desirable. | ||
|
||
|
||
### License | ||
|
||
Prepared by Flax & Teal Limited for ONS Digital (see LICENSE). | ||
This API is based on [eq-questionnaire-validator](https://github.com/ONSdigital/eq-questionnaire-validator), a tool | ||
from ONS Digital. |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
import time | ||
from dynaconf import Dynaconf | ||
|
||
# Fallback start time (Unix seconds), captured when this module is imported and
# used only when no START_TIME setting is supplied.
current_time = int(time.time())
# NOTE(review): looks like leftover debug output; kept so import-time behaviour
# is unchanged - consider removing.
print(current_time)

# With envvar_prefix="BERLIN_API", Dynaconf exposes the environment variable
# BERLIN_API_<NAME> under the key <NAME>; .env files are loaded as well.
settings = Dynaconf(
    envvar_prefix="BERLIN_API",
    load_dotenv=True,
)

settings.reload()

# These were previously looked up as "BERLIN_API_HOST"/"BERLIN_API_PORT", i.e.
# with the prefix applied twice, so the documented BERLIN_API_HOST and
# BERLIN_API_PORT variables never took effect. The un-prefixed key is now read
# first; the old key is kept as a fallback for backward compatibility.
HOST = settings.get("HOST", settings.get("BERLIN_API_HOST", "0.0.0.0"))
PORT = settings.get("PORT", settings.get("BERLIN_API_PORT", 28700))

NAMESPACE = settings.get("LOGGING_NAMESPACE", "dp_nlp_berlin_api")
DATA_LOCATION = settings.get("DATA_LOCATION", "data/")

# NOTE(review): the version is hard-coded, so BERLIN_API_VERSION has no effect
# even though the README lists it. The settings lookup was left disabled:
# VERSION = settings.get("VERSION", "0.1.0")
VERSION = "0.1.0"

# Set through the BERLIN_API_START_TIME / BERLIN_API_GIT_COMMIT environment
# variables; START_TIME falls back to the import time captured above.
START_TIME = settings.get("START_TIME", current_time)
GIT_COMMIT = settings.get("GIT_COMMIT", "last_commit")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
import sys
import time
from datetime import datetime, timezone

from app.config import GIT_COMMIT, START_TIME, VERSION
|
||
# Status values a health check can report.
OK = "OK"
WARNING = "WARNING"
ERROR = "ERROR"
|
||
|
||
class Healthcheck:
    """Health report for the service: status, version details, uptime and checks.

    Attributes:
        start_time: ISO-8601 string (UTC, with offset) derived from START_TIME.
        status: Overall status supplied by the caller.
        version: Dict with version, build time, git commit and Python details.
        checks: Individual check results supplied by the caller.
    """

    def __init__(self, status, checks):
        """Store the status and checks and capture version/start-time details.

        Args:
            status: Overall status value to report.
            checks: Check results to include in the report as-is.
        """
        # START_TIME is Unix seconds. Building a timezone-aware UTC datetime
        # makes the rendered string unambiguous; the previous naive datetime
        # was in host-local time and its "%z" directive rendered as nothing.
        started_at = datetime.fromtimestamp(int(START_TIME), tz=timezone.utc)
        self.start_time = started_at.isoformat()

        self.status = status
        self.version = {
            "version": VERSION,
            # NOTE(review): this is the moment the object is created, not a
            # real build timestamp - confirm what consumers expect here.
            "build_time": datetime.now(timezone.utc),
            "git_commit": GIT_COMMIT,
            "language": "python",
            "language_version": sys.version,
        }
        self.checks = checks

    def to_json(self):
        """Return the report as a dict containing only JSON-serialisable values.

        Returns:
            Dict with the keys status, version, uptime, start_time and checks.
        """
        # Work on a copy so self.version keeps its datetime object, while the
        # returned payload carries an ISO-8601 string json.dumps can handle.
        version = dict(self.version)
        build_time = version.get("build_time")
        if isinstance(build_time, datetime):
            version["build_time"] = build_time.isoformat()

        return {
            "status": self.status,
            "version": version,
            "uptime": self.get_uptime(),
            "start_time": self.start_time,
            "checks": self.checks,
        }

    def get_uptime(self):
        """Return the milliseconds elapsed since start_time, rounded to an int."""
        # An offset-bearing string parses to an aware datetime; a legacy naive
        # string is still accepted and interpreted as local time by timestamp().
        started_at = datetime.fromisoformat(self.start_time)
        elapsed_seconds = time.time() - int(started_at.timestamp())
        return round(elapsed_seconds * 1000)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
from datetime import datetime | ||
from app.config import NAMESPACE | ||
import structlog | ||
|
||
|
||
def configure_logging():
    """Configure structlog: ISO UTC timestamps, JSON rendering, stdlib loggers."""
    # Processors run in order: stamp the event first, then render it as JSON.
    processor_chain = [
        structlog.processors.TimeStamper(fmt="iso", utc=True),
        structlog.processors.JSONRenderer(),
    ]
    structlog.configure(
        processors=processor_chain,
        context_class=structlog.threadlocal.wrap_dict(dict),
        logger_factory=structlog.stdlib.LoggerFactory(),
    )
|
||
|
||
def setup_logger():
    """Return a structlog logger pre-bound with the namespace and a creation time."""
    # created_at is evaluated once, here, as a naive UTC ISO-8601 string.
    initial_values = {
        "namespace": NAMESPACE,
        "created_at": datetime.utcnow().isoformat(),
    }
    return structlog.get_logger(**initial_values)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
from flask import Flask | ||
|
||
from app.config import PORT, HOST | ||
from app.views.berlin import berlin_blueprint | ||
from app.views.health import health_blueprint | ||
|
||
|
||
def create_app():
    """Build the Flask application with the berlin and health blueprints attached."""
    flask_app = Flask(__name__)
    # Registration order matches the original: berlin first, then health.
    for blueprint in (berlin_blueprint, health_blueprint):
        flask_app.register_blueprint(blueprint)
    return flask_app
|
||
|
||
if __name__ == "__main__":
    # Direct execution (e.g. `python -m app.main`): serve on the configured
    # host and port.
    application = create_app()
    application.run(host=HOST, port=PORT)
Oops, something went wrong.