diff --git a/docs/api/create_mapping.md b/docs/api/create_mapping.md
new file mode 100644
index 0000000..452274a
--- /dev/null
+++ b/docs/api/create_mapping.md
@@ -0,0 +1,16 @@
+# Mapping Functions
+
+The following functions can be used to create the intermediate mapping CSV required to generate a parser
+
+```{eval-rst}
+.. autofunction:: autoparser.create_mapping
+```
+
+## Class definitions
+
+You can also interact with the base class `Mapper`
+
+```{eval-rst}
+.. autoclass:: autoparser.Mapper
+ :members:
+```
\ No newline at end of file
diff --git a/docs/api/dict_writer.md b/docs/api/dict_writer.md
new file mode 100644
index 0000000..15a5c5c
--- /dev/null
+++ b/docs/api/dict_writer.md
@@ -0,0 +1,20 @@
+# Data Dictionary Functions
+
+The following functions can be used to create and add descriptions to a data dictionary
+
+```{eval-rst}
+.. autofunction:: autoparser.create_dict
+ :noindex:
+
+.. autofunction:: autoparser.generate_descriptions
+ :noindex:
+```
+
+## Class definitions
+
+You can also interact with the base class `DictWriter`
+
+```{eval-rst}
+.. autoclass:: autoparser.DictWriter
+ :members:
+```
\ No newline at end of file
diff --git a/docs/api/index.md b/docs/api/index.md
new file mode 100644
index 0000000..4a0622e
--- /dev/null
+++ b/docs/api/index.md
@@ -0,0 +1,9 @@
+# API
+
+This section describes the public API for AutoParser
+
+```{toctree}
+dict_writer
+create_mapping
+make_toml
+```
\ No newline at end of file
diff --git a/docs/api/make_toml.md b/docs/api/make_toml.md
new file mode 100644
index 0000000..81b1fa6
--- /dev/null
+++ b/docs/api/make_toml.md
@@ -0,0 +1,17 @@
+# Parser Functions
+
+The following functions can be used to create the final TOML parser file
+
+```{eval-rst}
+.. autofunction:: autoparser.create_parser
+ :noindex:
+```
+
+## Class definitions
+
+You can also interact with the base class `ParserGenerator`
+
+```{eval-rst}
+.. autoclass:: autoparser.ParserGenerator
+ :members:
+```
\ No newline at end of file
diff --git a/docs/conf.py b/docs/conf.py
index e3ae7ed..d1f3827 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -32,4 +32,10 @@
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
html_theme = "sphinx_book_theme"
html_logo = "images/logo.png"
+html_title = "AutoParser"
html_static_path = ["_static"]
+
+html_theme_options = {
+ "repository_url": "https://github.com/globaldothealth/autoparser",
+ "use_repository_button": True,
+}
diff --git a/docs/examples/cli_example.md b/docs/examples/cli_example.md
new file mode 100644
index 0000000..eef22b4
--- /dev/null
+++ b/docs/examples/cli_example.md
@@ -0,0 +1,47 @@
+# CLI Parser construction
+
+This file describes how to run the same parser generation pipeline as described in the
+[parser construction](example) notebook, but using the command line interface. It
+constructs a parser file for an `animal_data.csv` file of test data, and assumes all commands
+are run from the root of the `autoparser` package.
+
+Note: As a reminder, you will need an API key for OpenAI or Google. This example uses the OpenAI LLM.
+
+## Generate a data dictionary
+In this example, we will generate a data dictionary with descriptions already added in one step. The CLI command follows this syntax:
+
+
+```bash
+autoparser create-dict data language [-d] [-k api_key] [-l llm_choice] [-c config_file] [-o output_name]
+```
+so for the `animal_data.csv` data we will run this command to generate a data dictionary
+with descriptions
+
+```bash
+autoparser create-dict tests/sources/animal_data.csv "fr" -d -k $OPENAI_API_KEY -c tests/test_config.toml -o "animal_dd"
+```
+This creates an `animal_dd.csv` data dictionary to use in the next step.
+
+## Create intermediate mapping file
+The next step is to create an intermediate CSV for you to inspect, mapping the fields and values in the raw data to the target schema. This is the CLI syntax:
+
+```bash
+autoparser create-mapping dictionary schema language api_key [-l llm_choice] [-c config_file] [-o output_name]
+```
+so we can run
+```bash
+autoparser create-mapping animal_dd.csv tests/schemas/animals.schema.json "fr" $OPENAI_API_KEY -c tests/test_config.toml -o animal_mapping
+```
+to create the intermediate mapping file `animal_mapping.csv` for you to inspect for any errors.
+
+## Write the parser file
+Finally, the parser file for ADTL should be written out based on the contents of `animal_mapping.csv`. Once you've made any changes to the mapping you want, we can use the `create_parser` command
+
+```bash
+autoparser create-parser mapping schema_path [-n parser_name] [--description parser_description] [-c config_file]
+```
+as
+```bash
+autoparser create-parser animal_mapping.csv tests/schemas -n animal_parser -c tests/test_config.toml
+```
+which writes out the TOML parser as `animal_parser.toml` ready for use in ADTL.
\ No newline at end of file
diff --git a/docs/examples/example.ipynb b/docs/examples/example.ipynb
new file mode 100644
index 0000000..e71f3fd
--- /dev/null
+++ b/docs/examples/example.ipynb
@@ -0,0 +1,645 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Parser construction example"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "This file demonstrates the process of constructing a parser file using `animal_data.csv` as a source dataset.\n",
+ "\n",
+ "Before you start: `autoparser` requires an OpenAI API key to function. You should add yours to your environment, as described [here](https://help.openai.com/en/articles/5112595-best-practices-for-api-key-safety). \n",
+ "Edit the `API_KEY` line below to match the name you gave yours."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import autoparser\n",
+ "import pandas as pd\n",
+ "import os\n",
+ "API_KEY = os.environ.get(\"OPENAI_API_KEY\")\n",
+ "\n",
+ "# The path to the configuration file to use\n",
+ "config_path = \"../../tests/test_config.toml\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Identité | \n",
+ " Province | \n",
+ " DateNotification | \n",
+ " Classicfication | \n",
+ " Nom complet | \n",
+ " Date de naissance | \n",
+ " AgeAns | \n",
+ " AgeMois | \n",
+ " Sexe | \n",
+ " StatusCas | \n",
+ " DateDec | \n",
+ " ContSoins | \n",
+ " ContHumain Autre | \n",
+ " AutreContHumain | \n",
+ " ContactAnimal | \n",
+ " Micropucé | \n",
+ " AnimalDeCompagnie | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " A001 | \n",
+ " Equateur | \n",
+ " 2024-01-01 | \n",
+ " Mammifère | \n",
+ " Luna | \n",
+ " 15/03/2022 | \n",
+ " 2 | \n",
+ " 10 | \n",
+ " f | \n",
+ " Vivant | \n",
+ " NaN | \n",
+ " Oui | \n",
+ " Non | \n",
+ " Non | \n",
+ " Oui | \n",
+ " Oui | \n",
+ " Oui | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " B002 | \n",
+ " Equateur | \n",
+ " 2024-15-02 | \n",
+ " FISH | \n",
+ " Max | \n",
+ " 21/07/2021 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " m | \n",
+ " Décédé | \n",
+ " 2024-06-01 | \n",
+ " Non | \n",
+ " Oui | \n",
+ " Voyage | \n",
+ " Non | \n",
+ " NON | \n",
+ " Oui | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " C003 | \n",
+ " Equateur | \n",
+ " 2024-03-10 | \n",
+ " oiseau | \n",
+ " Coco | \n",
+ " 10/02/2023 | \n",
+ " 1 | \n",
+ " 11 | \n",
+ " F | \n",
+ " Vivant | \n",
+ " NaN | \n",
+ " Oui | \n",
+ " Non | \n",
+ " Non | \n",
+ " Oui | \n",
+ " Oui | \n",
+ " Non | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " D004 | \n",
+ " NaN | \n",
+ " 2024-04-22 | \n",
+ " amphibie | \n",
+ " Bella | \n",
+ " 05/11/2020 | \n",
+ " 4 | \n",
+ " 5 | \n",
+ " m | \n",
+ " Vivant | \n",
+ " NaN | \n",
+ " Oui | \n",
+ " NaN | \n",
+ " Autres | \n",
+ " Non | \n",
+ " NON | \n",
+ " Non | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " E005 | \n",
+ " NaN | \n",
+ " 2024-05-30 | \n",
+ " poisson | \n",
+ " Charlie | \n",
+ " 18/05/2019 | \n",
+ " 5 | \n",
+ " 3 | \n",
+ " F | \n",
+ " Décédé | \n",
+ " 2024-07-01 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Voyage | \n",
+ " Oui | \n",
+ " Oui | \n",
+ " Oui | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Identité Province DateNotification Classicfication Nom complet \\\n",
+ "0 A001 Equateur 2024-01-01 Mammifère Luna \n",
+ "1 B002 Equateur 2024-15-02 FISH Max \n",
+ "2 C003 Equateur 2024-03-10 oiseau Coco \n",
+ "3 D004 NaN 2024-04-22 amphibie Bella \n",
+ "4 E005 NaN 2024-05-30 poisson Charlie \n",
+ "\n",
+ " Date de naissance AgeAns AgeMois Sexe StatusCas DateDec \\\n",
+ "0 15/03/2022 2 10 f Vivant NaN \n",
+ "1 21/07/2021 3 4 m Décédé 2024-06-01 \n",
+ "2 10/02/2023 1 11 F Vivant NaN \n",
+ "3 05/11/2020 4 5 m Vivant NaN \n",
+ "4 18/05/2019 5 3 F Décédé 2024-07-01 \n",
+ "\n",
+ " ContSoins ContHumain Autre AutreContHumain ContactAnimal Micropucé \\\n",
+ "0 Oui Non Non Oui Oui \n",
+ "1 Non Oui Voyage Non NON \n",
+ "2 Oui Non Non Oui Oui \n",
+ "3 Oui NaN Autres Non NON \n",
+ "4 NaN NaN Voyage Oui Oui \n",
+ "\n",
+ " AnimalDeCompagnie \n",
+ "0 Oui \n",
+ "1 Oui \n",
+ "2 Non \n",
+ "3 Non \n",
+ "4 Oui "
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data = pd.read_csv(\"../../tests/sources/animal_data.csv\")\n",
+ "data.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's generate a basic data dictionary from this data set. We want to use the configuration file set up for this dataset, located in the `tests` directory."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Field Name | \n",
+ " Description | \n",
+ " Field Type | \n",
+ " Common Values | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Identité | \n",
+ " NaN | \n",
+ " string | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Province | \n",
+ " NaN | \n",
+ " choice | \n",
+ " Equateur, Orientale, Katanga, Kinshasa | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " DateNotification | \n",
+ " NaN | \n",
+ " string | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Classicfication | \n",
+ " NaN | \n",
+ " choice | \n",
+ " FISH, amphibie, oiseau, Mammifère, poisson, RE... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Nom complet | \n",
+ " NaN | \n",
+ " string | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Field Name Description Field Type \\\n",
+ "0 Identité NaN string \n",
+ "1 Province NaN choice \n",
+ "2 DateNotification NaN string \n",
+ "3 Classicfication NaN choice \n",
+ "4 Nom complet NaN string \n",
+ "\n",
+ " Common Values \n",
+ "0 NaN \n",
+ "1 Equateur, Orientale, Katanga, Kinshasa \n",
+ "2 NaN \n",
+ "3 FISH, amphibie, oiseau, Mammifère, poisson, RE... \n",
+ "4 NaN "
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "writer = autoparser.DictWriter(config_path)\n",
+ "data_dict = writer.create_dict(data)\n",
+ "data_dict.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The 'Common Values' column indicates fields where there are a limited number of unique values, suggesting mapping to a controlled terminology may have been done, or might be required in the parser. The list of common values is every unique value in the field.\n",
+ "\n",
+ "Notice that the Description column is empty. To proceed to the next step of the parser generation process, creating the mapping file linking source -> schema fields, this column must be filled. You can either do this by hand (the descriptions MUST be in english), or use autoparser's LLM functionality to do it for you, demonstrated below."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " source_field | \n",
+ " source_description | \n",
+ " source_type | \n",
+ " common_values | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " Identité | \n",
+ " Identity | \n",
+ " string | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " Province | \n",
+ " Province | \n",
+ " choice | \n",
+ " Equateur, Orientale, Katanga, Kinshasa | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " DateNotification | \n",
+ " Notification Date | \n",
+ " string | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Classicfication | \n",
+ " Classification | \n",
+ " choice | \n",
+ " FISH, amphibie, oiseau, Mammifère, poisson, RE... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Nom complet | \n",
+ " Full Name | \n",
+ " string | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " source_field source_description source_type \\\n",
+ "0 Identité Identity string \n",
+ "1 Province Province choice \n",
+ "2 DateNotification Notification Date string \n",
+ "3 Classicfication Classification choice \n",
+ "4 Nom complet Full Name string \n",
+ "\n",
+ " common_values \n",
+ "0 NaN \n",
+ "1 Equateur, Orientale, Katanga, Kinshasa \n",
+ "2 NaN \n",
+ "3 FISH, amphibie, oiseau, Mammifère, poisson, RE... \n",
+ "4 NaN "
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dd_described = writer.generate_descriptions(\"fr\", data_dict, key=API_KEY)\n",
+ "dd_described.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now that we have a data dictionary with descriptions added, we can proceed to creating an intermediate mapping file:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/pipliggins/Documents/repos/autoparser/src/autoparser/create_mapping.py:258: UserWarning: The following schema fields have not been mapped: ['country_iso3', 'owner']\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " source_description | \n",
+ " source_field | \n",
+ " common_values | \n",
+ " target_values | \n",
+ " value_mapping | \n",
+ "
\n",
+ " \n",
+ " target_field | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " identity | \n",
+ " Identity | \n",
+ " Identité | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " name | \n",
+ " Full Name | \n",
+ " Nom complet | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " loc_admin_1 | \n",
+ " Province | \n",
+ " Province | \n",
+ " Equateur, Orientale, Katanga, Kinshasa | \n",
+ " NaN | \n",
+ " equateur=None, kinshasa=None, katanga=None, or... | \n",
+ "
\n",
+ " \n",
+ " country_iso3 | \n",
+ " None | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " notification_date | \n",
+ " Notification Date | \n",
+ " DateNotification | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " source_description source_field \\\n",
+ "target_field \n",
+ "identity Identity Identité \n",
+ "name Full Name Nom complet \n",
+ "loc_admin_1 Province Province \n",
+ "country_iso3 None NaN \n",
+ "notification_date Notification Date DateNotification \n",
+ "\n",
+ " common_values target_values \\\n",
+ "target_field \n",
+ "identity NaN NaN \n",
+ "name NaN NaN \n",
+ "loc_admin_1 Equateur, Orientale, Katanga, Kinshasa NaN \n",
+ "country_iso3 NaN NaN \n",
+ "notification_date NaN NaN \n",
+ "\n",
+ " value_mapping \n",
+ "target_field \n",
+ "identity NaN \n",
+ "name NaN \n",
+ "loc_admin_1 equateur=None, kinshasa=None, katanga=None, or... \n",
+ "country_iso3 NaN \n",
+ "notification_date NaN "
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "mapper = autoparser.Mapper(\"../../tests/schemas/animals.schema.json\", dd_described, \"fr\", api_key=API_KEY, config=config_path)\n",
+ "mapping_dict = mapper.create_mapping(file_name='example_mapping.csv')\n",
+ "\n",
+ "mapping_dict.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "At this point, you should inspect the mapping file and look for fields/values that have been incorrectly mapped, and edit them where necessary.\n",
+ "The mapping file has been written out to [example_mapping.csv](example_mapping.csv). A good example is the 'loc_admin_1' field; the LLM often maps the common values provided to 'None' as the schema denotes this as a free-text field. Instead, delete these mapped values and the parsed data will contain the original free text.\n",
+ "Also note the warning above; the LLM should not have found fields to map to the 'country_iso3' or 'owner' fields. If the original data did contain an appropriate field for these, you should edit the mapping file accordingly."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Once you have edited the mapping file to your satisfaction, we can go ahead and create the TOML parser file, `example_parser.toml`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING:root:Missing required field country_iso3 in animals schema. Adding empty field...\n"
+ ]
+ }
+ ],
+ "source": [
+ "writer = autoparser.ParserGenerator(\"example_mapping.csv\", \"../../tests/schemas\", \"example\", config=config_path)\n",
+ "writer.create_parser(\"example_parser.toml\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "You can view/edit the created parser at [example_parser.toml](example_parser.toml), and try it out using [ADTL](https://github.com/globaldothealth/adtl)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/getting_started/index.md b/docs/getting_started/index.md
index 54ad793..ac9ebff 100644
--- a/docs/getting_started/index.md
+++ b/docs/getting_started/index.md
@@ -16,4 +16,25 @@ uv sync
. .venv/bin/activate
```
-To view and use the CLI, you can type `autoparser` into the command line to view the options available.
+To view and use the CLI, you can type `autoparser` into the command line to view the
+options available.
+
+## Other requirements
+
+AutoParser relies on LLMs to automatically map raw data fields to a target schema.
+In order to use this tool, you will need an API key for either [OpenAI](https://platform.openai.com/docs/quickstart/create-and-export-an-api-key)
+or Google's [Gemini](https://aistudio.google.com/apikey) [Dev note: work in progress!].
+AutoParser will use either OpenAI's `gpt-4o-mini`, or Google's `gemini-1.5-flash`.
+
+The LLM should *never* see your raw data; only the data dictionary which contains
+column headers, and text descriptions of what each field should contain.
+
+### Supported file formats
+AutoParser supports CSV and XLSX formats for raw data and data dictionary files, and either
+JSON or TOML for the target schema.
+
+## Quickstart
+
+See the example notebook [here](../examples/example.ipynb) for a basic walk through the
+functionality of AutoParser.
+
diff --git a/docs/index.md b/docs/index.md
index 3a3f748..b1093e9 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -2,10 +2,7 @@
AutoParser is a tool for semi-automated data parser creation. The package allows you
to generate a new data parser for converting your source data into a new format specified
using a schema file, ready to use with the data transformation tool [adtl](https://adtl.readthedocs.io/en/latest/index.html).
-
-## Key Features
-- Data Dictionary Creation: Automatically create a basic data dictionary framework
-- Parser Generation: Generate data parsers to match a given schema
+Both programmatic and CLI usage are provided for.
## Framework
@@ -15,13 +12,39 @@ Flowchart showing the inputs (bright blue), outputs (green blocks) and functions
```
## Documentation
+
+This section describes how to get started with AutoParser, and documents the API
+
```{toctree}
---
maxdepth: 2
-caption: Contents:
+caption: Documentation
---
-self
getting_started/index
-usage/data_dict
-usage/parser_generation
+usage/index
+```
+
+## Example pages
+
+Examples demonstrating usage of this tool
+
+```{toctree}
+---
+maxdepth: 2
+caption: Examples
+---
+examples/example
+examples/cli_example
+```
+
+## API reference
+
+Documents the full API
+
+```{toctree}
+---
+maxdepth: 2
+caption: API
+---
+api/index
```
\ No newline at end of file
diff --git a/docs/usage/data_dict.md b/docs/usage/data_dict.md
index 56b94fb..92b70d6 100644
--- a/docs/usage/data_dict.md
+++ b/docs/usage/data_dict.md
@@ -7,13 +7,13 @@ It should contain, at minimum, a list of field/column names, and some kind of de
of what data each field holds. This often takes the form of a textual description, plus
a note of the data type (text, decimals, date, boolean...) and/or a set of expected values.
-A data dictionary is required by AutoParser for (parser generation)[parser_generation].
+A data dictionary is required by AutoParser for [parser generation](parser_generation).
This is to avoid having to send potentially sensitive or confidential data to an external
body (in this case an externally hosted LLM hosted); instead a *decription* of what the
data looks like from the dictionary can be sent to the LLM, which allows for mapping to
occur without risking the unintentional release of data.
-Many data capture services such as (REDCaP)[https://projectredcap.org/] will generate
+Many data capture services such as [REDCaP](https://projectredcap.org/) will generate
a data dictionary automatically when surveys are set up. However, where data is being
captured either rapidly, or by individuals/small teams, a formal data dictionary may not
have been created for a corresponding dataset. For this scenario, AutoParser provides
@@ -54,10 +54,8 @@ to generate a data parser.
```{eval-rst}
.. autofunction:: autoparser.create_dict
- :noindex:
.. autofunction:: autoparser.generate_descriptions
- :noindex:
```
diff --git a/docs/usage/index.md b/docs/usage/index.md
new file mode 100644
index 0000000..7b7293b
--- /dev/null
+++ b/docs/usage/index.md
@@ -0,0 +1,8 @@
+# Usage
+
+These sections describe the key usage patterns for AutoParser.
+
+```{toctree}
+data_dict
+parser_generation
+```
\ No newline at end of file
diff --git a/docs/usage/parser_generation.md b/docs/usage/parser_generation.md
index e5b17a8..e4a97ba 100644
--- a/docs/usage/parser_generation.md
+++ b/docs/usage/parser_generation.md
@@ -1,13 +1,13 @@
# Write a Data Parser
-AutoParser assumes the use of Global.Health's (adtl)[https://github.com/globaldothealth/adtl]
+AutoParser assumes the use of Global.Health's [adtl](https://github.com/globaldothealth/adtl)
package to transform your source data into a standardised format. To do this, adtl requires a
-(TOML)[https://toml.io/en/] specification file which describes how raw data should be
+[TOML](https://toml.io/en/) specification file which describes how raw data should be
converted into the new format, on a field-by-field basis. Every unique data file format
(i.e. unique sets of fields and data types) should have a corresponding parser file.
AutoParser exists to semi-automate the process of writing new parser files. This requires
-a data dictionary (which can be created if it does not already exist, see [data_dict]),
+a data dictionary (which can be created if it does not already exist, see '[Create Data dictionary](data_dict)'),
and the JSON schema of the target format.
Parser generation is a 2-step process.
diff --git a/src/autoparser/create_mapping.py b/src/autoparser/create_mapping.py
index 3c4ea93..ab2e7be 100644
--- a/src/autoparser/create_mapping.py
+++ b/src/autoparser/create_mapping.py
@@ -67,6 +67,7 @@ def __init__(
@property
def target_fields(self) -> list[str]:
+ """Returns a list of fields in the target schema"""
try:
return self._target_fields
except AttributeError:
@@ -75,6 +76,7 @@ def target_fields(self) -> list[str]:
@property
def target_types(self) -> dict[str, list[str]]:
+ """Returns the field types of the target schema"""
try:
return self._target_types
except AttributeError:
@@ -86,6 +88,7 @@ def target_types(self) -> dict[str, list[str]]:
@property
def target_values(self) -> pd.Series:
+ """Returns the enum values or boolean options for the target schema"""
try:
return self._target_values
except AttributeError:
@@ -106,6 +109,10 @@ def _value_options(f):
@property
def common_values(self) -> pd.Series:
+ """
+ Returns the commonly repeated values in the source data
+ Usually this indicates that the source field is an enum or boolean
+ """
try:
return self._common_values
except AttributeError:
diff --git a/src/autoparser/make_toml.py b/src/autoparser/make_toml.py
index 1df1cee..a776edc 100644
--- a/src/autoparser/make_toml.py
+++ b/src/autoparser/make_toml.py
@@ -111,7 +111,8 @@ def __init__(
)
@property
- def parsed_choices(self):
+ def parsed_choices(self) -> pd.Series:
+        """Returns the mapped values for each target field"""
try:
return self._parsed_choices
except AttributeError:
@@ -124,7 +125,8 @@ def _parse_choices(s: str):
return self._parsed_choices
@property
- def references_definitions(self):
+ def references_definitions(self) -> tuple[dict[str, str], dict[str, dict]]:
+ """Finds and returns the references and definitions for the mappings"""
try:
return self._references_definitions
except AttributeError:
@@ -158,11 +160,11 @@ def references_definitions(self):
return self._references_definitions
def schema_fields(self, table: str):
- "Returns all the fields for `table` and their properties"
+ """Returns all the fields for `table` and their properties"""
return self.schemas[table]["properties"]
- def single_field_mapping(self, match: pd.core.frame.pandas) -> dict[str, Any]:
- "Make a single field mapping from a single row of the mappings dataframe"
+ def single_field_mapping(self, match: pd.DataFrame) -> dict[str, Any]:
+ """Make a single field mapping from a single row of the mappings dataframe"""
choices = self.parsed_choices[match.target_field]
@@ -178,7 +180,7 @@ def single_field_mapping(self, match: pd.core.frame.pandas) -> dict[str, Any]:
return out
def make_toml_table(self, table: str) -> dict[str, Any]:
- "Make single TOML table from mappings"
+ """Make single TOML table from mappings"""
outmap = {}
@@ -289,10 +291,11 @@ def create_parser(
def main():
parser = argparse.ArgumentParser(
- description="Make TOML from intermediate CSV file created by create_mapping.py"
+ description="Make TOML from intermediate CSV file created by create_mapping.py",
+ prog="autoparser create-parser",
)
parser.add_argument("mappings", help="Mapping file to create TOML from", type=str)
- parser.add_argument("schema", help="Path where schemas are located")
+ parser.add_argument("schema_path", help="Path where schemas are located")
parser.add_argument(
"-n",
"--name",