From 48841121ec9386df8242d1be9d7b1d38ddd57fa1 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Thu, 21 Nov 2024 16:32:52 +0000 Subject: [PATCH] Improvements to documentation (#2) Adds examples and API reference --- docs/api/create_mapping.md | 16 + docs/api/dict_writer.md | 20 + docs/api/index.md | 9 + docs/api/make_toml.md | 17 + docs/conf.py | 6 + docs/examples/cli_example.md | 47 +++ docs/examples/example.ipynb | 645 +++++++++++++++++++++++++++++++ docs/getting_started/index.md | 23 +- docs/index.md | 39 +- docs/usage/data_dict.md | 6 +- docs/usage/index.md | 8 + docs/usage/parser_generation.md | 6 +- src/autoparser/create_mapping.py | 7 + src/autoparser/make_toml.py | 19 +- 14 files changed, 844 insertions(+), 24 deletions(-) create mode 100644 docs/api/create_mapping.md create mode 100644 docs/api/dict_writer.md create mode 100644 docs/api/index.md create mode 100644 docs/api/make_toml.md create mode 100644 docs/examples/cli_example.md create mode 100644 docs/examples/example.ipynb create mode 100644 docs/usage/index.md diff --git a/docs/api/create_mapping.md b/docs/api/create_mapping.md new file mode 100644 index 0000000..452274a --- /dev/null +++ b/docs/api/create_mapping.md @@ -0,0 +1,16 @@ +# Mapping Functions + +The following functions can be used to create the intermediate mapping CSV required to generate a parser. + +```{eval-rst} +.. autofunction:: autoparser.create_mapping +``` + +## Class definitions + +You can also interact with the base class `Mapper`. + +```{eval-rst} +.. autoclass:: autoparser.Mapper + :members: +``` \ No newline at end of file diff --git a/docs/api/dict_writer.md b/docs/api/dict_writer.md new file mode 100644 index 0000000..15a5c5c --- /dev/null +++ b/docs/api/dict_writer.md @@ -0,0 +1,20 @@ +# Data Dictionary Functions + +The following functions can be used to create and add descriptions to a data dictionary. + +```{eval-rst} +.. autofunction:: autoparser.create_dict + :noindex: + +.. autofunction:: autoparser.generate_descriptions + :noindex: +``` + +## Class definitions + +You can also interact with the base class `DictWriter`. + +```{eval-rst} +.. autoclass:: autoparser.DictWriter + :members: +``` \ No newline at end of file diff --git a/docs/api/index.md b/docs/api/index.md new file mode 100644 index 0000000..4a0622e --- /dev/null +++ b/docs/api/index.md @@ -0,0 +1,9 @@ +# API + +This section describes the public API for AutoParser. + +```{toctree} +dict_writer +create_mapping +make_toml +``` \ No newline at end of file diff --git a/docs/api/make_toml.md b/docs/api/make_toml.md new file mode 100644 index 0000000..81b1fa6 --- /dev/null +++ b/docs/api/make_toml.md @@ -0,0 +1,17 @@ +# Parser Functions + +The following functions can be used to create the final TOML parser file. + +```{eval-rst} +.. autofunction:: autoparser.create_parser + :noindex: +``` + +## Class definitions + +You can also interact with the base class `ParserGenerator`. + +```{eval-rst}
+.. autoclass:: autoparser.ParserGenerator + :members: +``` \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index e3ae7ed..d1f3827 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -32,4 +32,10 @@ # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output html_theme = "sphinx_book_theme" html_logo = "images/logo.png" +html_title = "AutoParser" html_static_path = ["_static"] + +html_theme_options = { + "repository_url": "https://github.com/globaldothealth/autoparser", + "use_repository_button": True, +} diff --git a/docs/examples/cli_example.md b/docs/examples/cli_example.md new file mode 100644 index 0000000..eef22b4 --- /dev/null +++ b/docs/examples/cli_example.md @@ -0,0 +1,47 @@ +# CLI Parser construction + +This file describes how to run the same parser generation pipeline as described in the +[parser construction](example) notebook, but using the command line interface. It +constructs a parser file for an `animal_data.csv` file of test data, and assumes all commands +are run from the root of the `autoparser` package. + +Note: As a reminder, you will need an API key for OpenAI or Google. This example uses the OpenAI LLM. + +## Generate a data dictionary +In this example, we will generate a data dictionary with descriptions already added in one step. The CLI command follows this syntax: + + +```bash +autoparser create-dict data language [-d] [-k api_key] [-l llm_choice] [-c config_file] [-o output_name] +``` +so for the `animal_data.csv` data we will run this command to generate a data dictionary +with descriptions: + +```bash +autoparser create-dict tests/sources/animal_data.csv "fr" -d -k $OPENAI_API_KEY -c tests/test_config.toml -o "animal_dd" +``` +This creates an `animal_dd.csv` data dictionary to use in the next step. + +## Create intermediate mapping file +The next step is to create an intermediate CSV for you to inspect, mapping the fields and values in the raw data to the target schema. This is the CLI syntax: + +```bash +autoparser create-mapping dictionary schema language api_key [-l llm_choice] [-c config_file] [-o output_name] +``` +so we can run +```bash +autoparser create-mapping animal_dd.csv tests/schemas/animals.schema.json "fr" $OPENAI_API_KEY -c tests/test_config.toml -o animal_mapping +``` +to create the intermediate mapping file `animal_mapping.csv` for you to inspect for any errors. + +## Write the parser file +Finally, the parser file for ADTL should be written out based on the contents of `animal_mapping.csv`. Once you've made any changes to the mapping you want, use the `create-parser` command + +```bash +autoparser create-parser mapping schema_path [-n parser_name] [--description parser_description] [-c config_file] +``` +as +```bash +autoparser create-parser animal_mapping.csv tests/schemas -n animal_parser -c tests/test_config.toml +``` +which writes out the TOML parser as `animal_parser.toml` ready for use in ADTL. \ No newline at end of file diff --git a/docs/examples/example.ipynb b/docs/examples/example.ipynb new file mode 100644 index 0000000..e71f3fd --- /dev/null +++ b/docs/examples/example.ipynb @@ -0,0 +1,645 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Parser construction example" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This file demonstrates the process of constructing a parser file using `animal_data.csv` as a source dataset.\n", + "\n", + "Before you start: `autoparser` requires an OpenAI API key to function. 
You should add yours to your environment, as described [here](https://help.openai.com/en/articles/5112595-best-practices-for-api-key-safety). \n", + "Edit the `API_KEY` line below to match the name you gave yours." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import autoparser\n", + "import pandas as pd\n", + "import os\n", + "API_KEY = os.environ.get(\"OPENAI_API_KEY\")\n", + "\n", + "# The path to the configuration file to use\n", + "config_path = \"../../tests/test_config.toml\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdentitéProvinceDateNotificationClassicficationNom completDate de naissanceAgeAnsAgeMoisSexeStatusCasDateDecContSoinsContHumain AutreAutreContHumainContactAnimalMicropucéAnimalDeCompagnie
0A001Equateur2024-01-01MammifèreLuna15/03/2022210fVivantNaNOuiNonNonOuiOuiOui
1B002Equateur2024-15-02FISHMax21/07/202134mDécédé2024-06-01NonOuiVoyageNonNONOui
2C003Equateur2024-03-10oiseauCoco10/02/2023111FVivantNaNOuiNonNonOuiOuiNon
3D004NaN2024-04-22amphibieBella05/11/202045mVivantNaNOuiNaNAutresNonNONNon
4E005NaN2024-05-30poissonCharlie18/05/201953FDécédé2024-07-01NaNNaNVoyageOuiOuiOui
\n", + "
" + ], + "text/plain": [ + " Identité Province DateNotification Classicfication Nom complet \\\n", + "0 A001 Equateur 2024-01-01 Mammifère Luna \n", + "1 B002 Equateur 2024-15-02 FISH Max \n", + "2 C003 Equateur 2024-03-10 oiseau Coco \n", + "3 D004 NaN 2024-04-22 amphibie Bella \n", + "4 E005 NaN 2024-05-30 poisson Charlie \n", + "\n", + " Date de naissance AgeAns AgeMois Sexe StatusCas DateDec \\\n", + "0 15/03/2022 2 10 f Vivant NaN \n", + "1 21/07/2021 3 4 m Décédé 2024-06-01 \n", + "2 10/02/2023 1 11 F Vivant NaN \n", + "3 05/11/2020 4 5 m Vivant NaN \n", + "4 18/05/2019 5 3 F Décédé 2024-07-01 \n", + "\n", + " ContSoins ContHumain Autre AutreContHumain ContactAnimal Micropucé \\\n", + "0 Oui Non Non Oui Oui \n", + "1 Non Oui Voyage Non NON \n", + "2 Oui Non Non Oui Oui \n", + "3 Oui NaN Autres Non NON \n", + "4 NaN NaN Voyage Oui Oui \n", + "\n", + " AnimalDeCompagnie \n", + "0 Oui \n", + "1 Oui \n", + "2 Non \n", + "3 Non \n", + "4 Oui " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = pd.read_csv(\"../../tests/sources/animal_data.csv\")\n", + "data.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's generate a basic data dictionary from this data set. We want to use the configuration file set up for this dataset, located in the `tests` directory." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Field NameDescriptionField TypeCommon Values
0IdentitéNaNstringNaN
1ProvinceNaNchoiceEquateur, Orientale, Katanga, Kinshasa
2DateNotificationNaNstringNaN
3ClassicficationNaNchoiceFISH, amphibie, oiseau, Mammifère, poisson, RE...
4Nom completNaNstringNaN
\n", + "
" + ], + "text/plain": [ + " Field Name Description Field Type \\\n", + "0 Identité NaN string \n", + "1 Province NaN choice \n", + "2 DateNotification NaN string \n", + "3 Classicfication NaN choice \n", + "4 Nom complet NaN string \n", + "\n", + " Common Values \n", + "0 NaN \n", + "1 Equateur, Orientale, Katanga, Kinshasa \n", + "2 NaN \n", + "3 FISH, amphibie, oiseau, Mammifère, poisson, RE... \n", + "4 NaN " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "writer = autoparser.DictWriter(config_path)\n", + "data_dict = writer.create_dict(data)\n", + "data_dict.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The 'Common Values' column indicates fields where there are a limited number of unique values, suggesting mapping to a controlled terminology may have been done, or might be required in the parser. The list of common values is every unique value in the field.\n", + "\n", + "Notice that the Description column is empty. To proceed to the next step of the parser generation process, creating the mapping file linking source -> schema fields, this column must be filled. You can either do this by hand (the descriptions MUST be in english), or use autoparser's LLM functionality to do it for you, demonstrated below." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
source_fieldsource_descriptionsource_typecommon_values
0IdentitéIdentitystringNaN
1ProvinceProvincechoiceEquateur, Orientale, Katanga, Kinshasa
2DateNotificationNotification DatestringNaN
3ClassicficationClassificationchoiceFISH, amphibie, oiseau, Mammifère, poisson, RE...
4Nom completFull NamestringNaN
\n", + "
" + ], + "text/plain": [ + " source_field source_description source_type \\\n", + "0 Identité Identity string \n", + "1 Province Province choice \n", + "2 DateNotification Notification Date string \n", + "3 Classicfication Classification choice \n", + "4 Nom complet Full Name string \n", + "\n", + " common_values \n", + "0 NaN \n", + "1 Equateur, Orientale, Katanga, Kinshasa \n", + "2 NaN \n", + "3 FISH, amphibie, oiseau, Mammifère, poisson, RE... \n", + "4 NaN " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dd_described = writer.generate_descriptions(\"fr\", data_dict, key=API_KEY)\n", + "dd_described.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have a data dictionary with descriptions added, we can proceed to creating an intermediate mapping file:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/pipliggins/Documents/repos/autoparser/src/autoparser/create_mapping.py:258: UserWarning: The following schema fields have not been mapped: ['country_iso3', 'owner']\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
source_descriptionsource_fieldcommon_valuestarget_valuesvalue_mapping
target_field
identityIdentityIdentitéNaNNaNNaN
nameFull NameNom completNaNNaNNaN
loc_admin_1ProvinceProvinceEquateur, Orientale, Katanga, KinshasaNaNequateur=None, kinshasa=None, katanga=None, or...
country_iso3NoneNaNNaNNaNNaN
notification_dateNotification DateDateNotificationNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " source_description source_field \\\n", + "target_field \n", + "identity Identity Identité \n", + "name Full Name Nom complet \n", + "loc_admin_1 Province Province \n", + "country_iso3 None NaN \n", + "notification_date Notification Date DateNotification \n", + "\n", + " common_values target_values \\\n", + "target_field \n", + "identity NaN NaN \n", + "name NaN NaN \n", + "loc_admin_1 Equateur, Orientale, Katanga, Kinshasa NaN \n", + "country_iso3 NaN NaN \n", + "notification_date NaN NaN \n", + "\n", + " value_mapping \n", + "target_field \n", + "identity NaN \n", + "name NaN \n", + "loc_admin_1 equateur=None, kinshasa=None, katanga=None, or... \n", + "country_iso3 NaN \n", + "notification_date NaN " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mapper = autoparser.Mapper(\"../../tests/schemas/animals.schema.json\", dd_described, \"fr\", api_key=API_KEY, config=config_path)\n", + "mapping_dict = mapper.create_mapping(file_name='example_mapping.csv')\n", + "\n", + "mapping_dict.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "At this point, you should inspect the mapping file and look for fields/values that have been incorrectly mapped, and edit them where necessary.\n", + "The mapping file has been written out to [example_mapping.csv](example_mapping.csv). A good example is the 'loc_admin_1' field; the LLM often maps the common values provided to 'None' as the schema denotes this as a free-text field. Instead, delete these mapped values and the parsed data will contain the original free text.\n", + "Also note the warning above; the LLM should not have found fields to map to the 'country_iso3' or 'owner' fields. If the original data did contain an appropriate field for these, you should edit the mapping file accordingly." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once you have edited the mapping file to your satisfaction, we can go ahead and create the TOML parser file, `example_parser.toml`:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:root:Missing required field country_iso3 in animals schema. Adding empty field...\n" + ] + } + ], + "source": [ + "writer = autoparser.ParserGenerator(\"example_mapping.csv\", \"../../tests/schemas\", \"example\", config=config_path)\n", + "writer.create_parser(\"example_parser.toml\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can veiw/edit the created parser at [example_parser.toml](example_parser.toml), and try it out using [ADTL](https://github.com/globaldothealth/adtl)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/getting_started/index.md b/docs/getting_started/index.md index 54ad793..ac9ebff 100644 --- a/docs/getting_started/index.md +++ b/docs/getting_started/index.md @@ -16,4 +16,25 @@ uv sync . 
.venv/bin/activate ``` -To view and use the CLI, you can type `autoparser` into the command line to view the options available. +To view and use the CLI, you can type `autoparser` into the command line to view the +options available. + +## Other requirements + +AutoParser relies on LLMs to automatically map raw data fields to a target schema. +In order to use this tool, you will need an API key for either [OpenAI](https://platform.openai.com/docs/quickstart/create-and-export-an-api-key) +or Google's [Gemini](https://aistudio.google.com/apikey) [Dev note: work in progress!]. +AutoParser will use either OpenAI's `gpt-4o-mini`, or Google's `gemini-1.5-flash`. + +The LLM should *never* see your raw data; only the data dictionary, which contains +column headers and text descriptions of what each field should contain. + +### Supported file formats +AutoParser supports CSV and XLSX formats for raw data and data dictionary files, and either +JSON or TOML for the target schema. + +## Quickstart + +See the example notebook [here](../examples/example.ipynb) for a basic walk-through of the +functionality of AutoParser. + diff --git a/docs/index.md b/docs/index.md index 3a3f748..b1093e9 100644 --- a/docs/index.md +++ b/docs/index.md @@ -2,10 +2,7 @@ AutoParser is a tool for semi-automated data parser creation. The package allows you to generate a new data parser for converting your source data into a new format specified using a schema file, ready to use with the data transformation tool [adtl](https://adtl.readthedocs.io/en/latest/index.html). - -## Key Features -- Data Dictionary Creation: Automatically create a basic data dictionary framework -- Parser Generation: Generate data parsers to match a given schema +Both programmatic and CLI usage are supported. ## Framework ```{image} images/flowchart.png Flowchart showing the inputs (bright blue), outputs (green blocks) and functions ``` ## Documentation + +How to get started with AutoParser, and guides to its key usage patterns + ```{toctree} --- maxdepth: 2 -caption: Contents: +caption: Documentation --- -self getting_started/index -usage/data_dict -usage/parser_generation +usage/index ``` + +## Example pages + +Examples demonstrating usage of this tool + +```{toctree} +--- +maxdepth: 2 +caption: Examples +--- +examples/example +examples/cli_example +``` + +## API reference + +Documents the full API + +```{toctree} +--- +maxdepth: 2 +caption: API +--- +api/index +``` \ No newline at end of file diff --git a/docs/usage/data_dict.md b/docs/usage/data_dict.md index 56b94fb..92b70d6 100644 --- a/docs/usage/data_dict.md +++ b/docs/usage/data_dict.md @@ -7,13 +7,13 @@ It should contain, at minimum, a list of field/column names, and some kind of description of what data each field holds. This often takes the form of a textual description, plus a note of the data type (text, decimals, date, boolean...) and/or a set of expected values. -A data dictionary is required by AutoParser for (parser generation)[parser_generation]. +A data dictionary is required by AutoParser for [parser generation](parser_generation). This is to avoid having to send potentially sensitive or confidential data to an external body (in this case an externally hosted LLM); instead a *description* of what the data looks like from the dictionary can be sent to the LLM, which allows for mapping to occur without risking the unintentional release of data. 
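To make this concrete, here is a minimal sketch of the dictionary workflow in Python, mirroring the example notebook elsewhere in this PR (file and config paths are illustrative):

```python
import os

import pandas as pd

import autoparser

# The raw data never leaves your machine; only the dictionary built from it
# (field names, types, and descriptions) is later shared with the LLM.
data = pd.read_csv("animal_data.csv")  # illustrative path to your raw data

writer = autoparser.DictWriter("config.toml")  # your autoparser config file
data_dict = writer.create_dict(data)

# Fill the (initially empty) Description column using the LLM;
# "fr" is the language the raw data is written in.
dd_described = writer.generate_descriptions(
    "fr", data_dict, key=os.environ["OPENAI_API_KEY"]
)
dd_described.to_csv("animal_dd.csv", index=False)
```

The resulting CSV can be reviewed and hand-edited before it is used for parser generation.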
-Many data capture services such as (REDCaP)[https://projectredcap.org/] will generate +Many data capture services such as [REDCap](https://projectredcap.org/) will generate a data dictionary automatically when surveys are set up. However, where data is being captured either rapidly, or by individuals/small teams, a formal data dictionary may not have been created for a corresponding dataset. For this scenario, AutoParser provides @@ -54,10 +54,8 @@ to generate a data parser. ```{eval-rst} .. autofunction:: autoparser.create_dict - :noindex: .. autofunction:: autoparser.generate_descriptions - :noindex: ``` diff --git a/docs/usage/index.md b/docs/usage/index.md new file mode 100644 index 0000000..7b7293b --- /dev/null +++ b/docs/usage/index.md @@ -0,0 +1,8 @@ +# Usage + +These sections describe the key usage patterns for AutoParser. + +```{toctree} +data_dict +parser_generation +``` \ No newline at end of file diff --git a/docs/usage/parser_generation.md b/docs/usage/parser_generation.md index e5b17a8..e4a97ba 100644 --- a/docs/usage/parser_generation.md +++ b/docs/usage/parser_generation.md @@ -1,13 +1,13 @@ # Write a Data Parser -AutoParser assumes the use of Global.Health's (adtl)[https://github.com/globaldothealth/adtl] +AutoParser assumes the use of Global.Health's [adtl](https://github.com/globaldothealth/adtl) package to transform your source data into a standardised format. To do this, adtl requires a -(TOML)[https://toml.io/en/] specification file which describes how raw data should be +[TOML](https://toml.io/en/) specification file which describes how raw data should be converted into the new format, on a field-by-field basis. Every unique data file format (i.e. unique sets of fields and data types) should have a corresponding parser file. AutoParser exists to semi-automate the process of writing new parser files. This requires -a data dictionary (which can be created if it does not already exist, see [data_dict]), +a data dictionary (which can be created if it does not already exist, see '[Create Data dictionary](data_dict)'), and the JSON schema of the target format. Parser generation is a 2-step process. 
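In code, the two steps look like this; a minimal sketch following the example notebook, with illustrative file paths and names:

```python
import os

import pandas as pd

import autoparser

dd = pd.read_csv("animal_dd.csv")  # data dictionary with English descriptions

# Step 1: ask the LLM to draft the intermediate mapping CSV, then inspect
# and hand-correct that file before moving on.
mapper = autoparser.Mapper(
    "animals.schema.json",  # JSON schema of the target format
    dd,                     # data dictionary as a DataFrame
    "fr",                   # language of the source data
    api_key=os.environ["OPENAI_API_KEY"],
    config="config.toml",
)
mapper.create_mapping(file_name="animal_mapping.csv")

# Step 2: write the TOML parser from the checked mapping file.
generator = autoparser.ParserGenerator(
    "animal_mapping.csv",
    "schemas",       # folder containing the target schema(s)
    "animals",       # name of the parser
    config="config.toml",
)
generator.create_parser("animal_parser.toml")
```

The generated `animal_parser.toml` can then be run through adtl against the raw data.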
diff --git a/src/autoparser/create_mapping.py b/src/autoparser/create_mapping.py index 3c4ea93..ab2e7be 100644 --- a/src/autoparser/create_mapping.py +++ b/src/autoparser/create_mapping.py @@ -67,6 +67,7 @@ def __init__( @property def target_fields(self) -> list[str]: + """Returns a list of fields in the target schema""" try: return self._target_fields except AttributeError: @@ -75,6 +76,7 @@ def target_fields(self) -> list[str]: @property def target_types(self) -> dict[str, list[str]]: + """Returns the field types of the target schema""" try: return self._target_types except AttributeError: @@ -86,6 +88,7 @@ def target_types(self) -> dict[str, list[str]]: @property def target_values(self) -> pd.Series: + """Returns the enum values or boolean options for the target schema""" try: return self._target_values except AttributeError: @@ -106,6 +109,10 @@ def _value_options(f): @property def common_values(self) -> pd.Series: + """ + Returns the commonly repeated values in the source data. + Usually this indicates that the source field is an enum or boolean. + """ try: return self._common_values except AttributeError: diff --git a/src/autoparser/make_toml.py b/src/autoparser/make_toml.py index 1df1cee..a776edc 100644 --- a/src/autoparser/make_toml.py +++ b/src/autoparser/make_toml.py @@ -111,7 +111,8 @@ def __init__( ) @property - def parsed_choices(self): + def parsed_choices(self) -> pd.Series: + """Returns the mapped values for each target field""" try: return self._parsed_choices except AttributeError: @@ -124,7 +125,8 @@ def _parse_choices(s: str): return self._parsed_choices @property - def references_definitions(self): + def references_definitions(self) -> tuple[dict[str, str], dict[str, dict]]: + """Finds and returns the references and definitions for the mappings""" try: return self._references_definitions except AttributeError: @@ -158,11 +160,11 @@ def references_definitions(self): return self._references_definitions def schema_fields(self, table: str): - "Returns all the fields for `table` and their properties" + """Returns all the fields for `table` and their properties""" return self.schemas[table]["properties"] - def single_field_mapping(self, match: pd.core.frame.pandas) -> dict[str, Any]: - "Make a single field mapping from a single row of the mappings dataframe" + def single_field_mapping(self, match: pd.DataFrame) -> dict[str, Any]: + """Make a single field mapping from a single row of the mappings dataframe""" choices = self.parsed_choices[match.target_field] @@ -178,7 +180,7 @@ def single_field_mapping(self, match: pd.core.frame.pandas) -> dict[str, Any]: return out def make_toml_table(self, table: str) -> dict[str, Any]: - "Make single TOML table from mappings" + """Make single TOML table from mappings""" outmap = {} @@ -289,10 +291,11 @@ def create_parser( def main(): parser = argparse.ArgumentParser( - description="Make TOML from intermediate CSV file created by create_mapping.py" + description="Make TOML from intermediate CSV file created by create_mapping.py", + prog="autoparser create-parser", ) parser.add_argument("mappings", help="Mapping file to create TOML from", type=str) - parser.add_argument("schema", help="Path where schemas are located") + parser.add_argument("schema_path", help="Path where schemas are located") parser.add_argument( "-n", "--name",