From 48841121ec9386df8242d1be9d7b1d38ddd57fa1 Mon Sep 17 00:00:00 2001 From: Pip Liggins Date: Thu, 21 Nov 2024 16:32:52 +0000 Subject: [PATCH] Improvements to documentation (#2) Adds examples and API reference --- docs/api/create_mapping.md | 16 + docs/api/dict_writer.md | 20 + docs/api/index.md | 9 + docs/api/make_toml.md | 17 + docs/conf.py | 6 + docs/examples/cli_example.md | 47 +++ docs/examples/example.ipynb | 645 +++++++++++++++++++++++++++++++ docs/getting_started/index.md | 23 +- docs/index.md | 39 +- docs/usage/data_dict.md | 6 +- docs/usage/index.md | 8 + docs/usage/parser_generation.md | 6 +- src/autoparser/create_mapping.py | 7 + src/autoparser/make_toml.py | 19 +- 14 files changed, 844 insertions(+), 24 deletions(-) create mode 100644 docs/api/create_mapping.md create mode 100644 docs/api/dict_writer.md create mode 100644 docs/api/index.md create mode 100644 docs/api/make_toml.md create mode 100644 docs/examples/cli_example.md create mode 100644 docs/examples/example.ipynb create mode 100644 docs/usage/index.md diff --git a/docs/api/create_mapping.md b/docs/api/create_mapping.md new file mode 100644 index 0000000..452274a --- /dev/null +++ b/docs/api/create_mapping.md @@ -0,0 +1,16 @@ +# Mapping Functions + +The following functions can be used to create the intermediate mapping CSV required to generate a parser. + +```{eval-rst} +.. autofunction:: autoparser.create_mapping +``` + +## Class definitions + +You can also interact with the base class `Mapper`. + +```{eval-rst} +.. autoclass:: autoparser.Mapper + :members: +``` \ No newline at end of file diff --git a/docs/api/dict_writer.md b/docs/api/dict_writer.md new file mode 100644 index 0000000..15a5c5c --- /dev/null +++ b/docs/api/dict_writer.md @@ -0,0 +1,20 @@ +# Data Dictionary Functions + +The following functions can be used to create and add descriptions to a data dictionary. + +```{eval-rst} +.. autofunction:: autoparser.create_dict + :noindex: + +.. autofunction:: autoparser.generate_descriptions + :noindex: +``` + +## Class definitions + +You can also interact with the base class `DictWriter`. + +```{eval-rst} +.. autoclass:: autoparser.DictWriter + :members: +``` \ No newline at end of file diff --git a/docs/api/index.md b/docs/api/index.md new file mode 100644 index 0000000..4a0622e --- /dev/null +++ b/docs/api/index.md @@ -0,0 +1,9 @@ +# API + +This section describes the public API for AutoParser. + +```{toctree} +dict_writer +create_mapping +make_toml +``` \ No newline at end of file diff --git a/docs/api/make_toml.md b/docs/api/make_toml.md new file mode 100644 index 0000000..81b1fa6 --- /dev/null +++ b/docs/api/make_toml.md @@ -0,0 +1,17 @@ +# Parser Functions + +The following functions can be used to create the final TOML parser file. + +```{eval-rst} +.. autofunction:: autoparser.create_parser + :noindex: +``` + +## Class definitions + +You can also interact with the base class `ParserGenerator`. + +```{eval-rst}
+.. autoclass:: autoparser.ParserGenerator + :members: +``` \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index e3ae7ed..d1f3827 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -32,4 +32,10 @@ # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output html_theme = "sphinx_book_theme" html_logo = "images/logo.png" +html_title = "AutoParser" html_static_path = ["_static"] + +html_theme_options = { + "repository_url": "https://github.com/globaldothealth/autoparser", + "use_repository_button": True, +} diff --git a/docs/examples/cli_example.md b/docs/examples/cli_example.md new file mode 100644 index 0000000..eef22b4 --- /dev/null +++ b/docs/examples/cli_example.md @@ -0,0 +1,47 @@ +# CLI Parser construction + +This file describes how to run the same parser generation pipeline as described in the +[parser construction](example) notebook, but using the command line interface. It +constructs a parser file for an `animal_data.csv` file of test data, and assumes all commands +are run from the root of the `autoparser` package. + +Note: As a reminder, you will need an API key for OpenAI or Google. This example uses the OpenAI LLM. + +## Generate a data dictionary +In this example, we will generate a data dictionary with descriptions already added in one step. The CLI command follows this syntax: + + +```bash +autoparser create-dict data language [-d] [-k api_key] [-l llm_choice] [-c config_file] [-o output_name] +``` +so for the `animal_data.csv` data we will run this command to generate a data dictionary +with descriptions: + +```bash +autoparser create-dict tests/sources/animal_data.csv "fr" -d -k $OPENAI_API_KEY -c tests/test_config.toml -o "animal_dd" +``` +This creates an `animal_dd.csv` data dictionary to use in the next step. + +## Create intermediate mapping file +The next step is to create an intermediate CSV for you to inspect, mapping the fields and values in the raw data to the target schema. This is the CLI syntax: + +```bash +autoparser create-mapping dictionary schema language api_key [-l llm_choice] [-c config_file] [-o output_name] +``` +so we can run +```bash +autoparser create-mapping animal_dd.csv tests/schemas/animals.schema.json "fr" $OPENAI_API_KEY -c tests/test_config.toml -o animal_mapping +``` +to create the intermediate mapping file `animal_mapping.csv` for you to inspect for any errors. + +## Write the parser file +Finally, the parser file for ADTL should be written out based on the contents of `animal_mapping.csv`. Once you've made any changes to the mapping you want, use the `create-parser` command + +```bash +autoparser create-parser mapping schema_path [-n parser_name] [--description parser_description] [-c config_file] +``` +as +```bash +autoparser create-parser animal_mapping.csv tests/schemas -n animal_parser -c tests/test_config.toml +``` +which writes out the TOML parser as `animal_parser.toml` ready for use in ADTL. \ No newline at end of file diff --git a/docs/examples/example.ipynb b/docs/examples/example.ipynb new file mode 100644 index 0000000..e71f3fd --- /dev/null +++ b/docs/examples/example.ipynb @@ -0,0 +1,645 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Parser construction example" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This file demonstrates the process of constructing a parser file using `animal_data.csv` as a source dataset.\n", + "\n", + "Before you start: `autoparser` requires an OpenAI API key to function. 
You should add yours to your environment, as described [here](https://help.openai.com/en/articles/5112595-best-practices-for-api-key-safety). \n", + "Edit the `API_KEY` line below to match the name you gave yours." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import autoparser\n", + "import pandas as pd\n", + "import os\n", + "API_KEY = os.environ.get(\"OPENAI_API_KEY\")\n", + "\n", + "# The path to the configuration file to use\n", + "config_path = \"../../tests/test_config.toml\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IdentitéProvinceDateNotificationClassicficationNom completDate de naissanceAgeAnsAgeMoisSexeStatusCasDateDecContSoinsContHumain AutreAutreContHumainContactAnimalMicropucéAnimalDeCompagnie
0A001Equateur2024-01-01MammifèreLuna15/03/2022210fVivantNaNOuiNonNonOuiOuiOui
1B002Equateur2024-15-02FISHMax21/07/202134mDécédé2024-06-01NonOuiVoyageNonNONOui
2C003Equateur2024-03-10oiseauCoco10/02/2023111FVivantNaNOuiNonNonOuiOuiNon
3D004NaN2024-04-22amphibieBella05/11/202045mVivantNaNOuiNaNAutresNonNONNon
4E005NaN2024-05-30poissonCharlie18/05/201953FDécédé2024-07-01NaNNaNVoyageOuiOuiOui
\n", + "
" + ], + "text/plain": [ + " Identité Province DateNotification Classicfication Nom complet \\\n", + "0 A001 Equateur 2024-01-01 Mammifère Luna \n", + "1 B002 Equateur 2024-15-02 FISH Max \n", + "2 C003 Equateur 2024-03-10 oiseau Coco \n", + "3 D004 NaN 2024-04-22 amphibie Bella \n", + "4 E005 NaN 2024-05-30 poisson Charlie \n", + "\n", + " Date de naissance AgeAns AgeMois Sexe StatusCas DateDec \\\n", + "0 15/03/2022 2 10 f Vivant NaN \n", + "1 21/07/2021 3 4 m Décédé 2024-06-01 \n", + "2 10/02/2023 1 11 F Vivant NaN \n", + "3 05/11/2020 4 5 m Vivant NaN \n", + "4 18/05/2019 5 3 F Décédé 2024-07-01 \n", + "\n", + " ContSoins ContHumain Autre AutreContHumain ContactAnimal Micropucé \\\n", + "0 Oui Non Non Oui Oui \n", + "1 Non Oui Voyage Non NON \n", + "2 Oui Non Non Oui Oui \n", + "3 Oui NaN Autres Non NON \n", + "4 NaN NaN Voyage Oui Oui \n", + "\n", + " AnimalDeCompagnie \n", + "0 Oui \n", + "1 Oui \n", + "2 Non \n", + "3 Non \n", + "4 Oui " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data = pd.read_csv(\"../../tests/sources/animal_data.csv\")\n", + "data.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's generate a basic data dictionary from this data set. We want to use the configuration file set up for this dataset, located in the `tests` directory." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Field NameDescriptionField TypeCommon Values
0IdentitéNaNstringNaN
1ProvinceNaNchoiceEquateur, Orientale, Katanga, Kinshasa
2DateNotificationNaNstringNaN
3ClassicficationNaNchoiceFISH, amphibie, oiseau, Mammifère, poisson, RE...
4Nom completNaNstringNaN
\n", + "
" + ], + "text/plain": [ + " Field Name Description Field Type \\\n", + "0 Identité NaN string \n", + "1 Province NaN choice \n", + "2 DateNotification NaN string \n", + "3 Classicfication NaN choice \n", + "4 Nom complet NaN string \n", + "\n", + " Common Values \n", + "0 NaN \n", + "1 Equateur, Orientale, Katanga, Kinshasa \n", + "2 NaN \n", + "3 FISH, amphibie, oiseau, Mammifère, poisson, RE... \n", + "4 NaN " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "writer = autoparser.DictWriter(config_path)\n", + "data_dict = writer.create_dict(data)\n", + "data_dict.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The 'Common Values' column indicates fields where there are a limited number of unique values, suggesting mapping to a controlled terminology may have been done, or might be required in the parser. The list of common values is every unique value in the field.\n", + "\n", + "Notice that the Description column is empty. To proceed to the next step of the parser generation process, creating the mapping file linking source -> schema fields, this column must be filled. You can either do this by hand (the descriptions MUST be in english), or use autoparser's LLM functionality to do it for you, demonstrated below." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
source_fieldsource_descriptionsource_typecommon_values
0IdentitéIdentitystringNaN
1ProvinceProvincechoiceEquateur, Orientale, Katanga, Kinshasa
2DateNotificationNotification DatestringNaN
3ClassicficationClassificationchoiceFISH, amphibie, oiseau, Mammifère, poisson, RE...
4Nom completFull NamestringNaN
\n", + "
" + ], + "text/plain": [ + " source_field source_description source_type \\\n", + "0 Identité Identity string \n", + "1 Province Province choice \n", + "2 DateNotification Notification Date string \n", + "3 Classicfication Classification choice \n", + "4 Nom complet Full Name string \n", + "\n", + " common_values \n", + "0 NaN \n", + "1 Equateur, Orientale, Katanga, Kinshasa \n", + "2 NaN \n", + "3 FISH, amphibie, oiseau, Mammifère, poisson, RE... \n", + "4 NaN " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dd_described = writer.generate_descriptions(\"fr\", data_dict, key=API_KEY)\n", + "dd_described.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have a data dictionary with descriptions added, we can proceed to creating an intermediate mapping file:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/pipliggins/Documents/repos/autoparser/src/autoparser/create_mapping.py:258: UserWarning: The following schema fields have not been mapped: ['country_iso3', 'owner']\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
source_descriptionsource_fieldcommon_valuestarget_valuesvalue_mapping
target_field
identityIdentityIdentitéNaNNaNNaN
nameFull NameNom completNaNNaNNaN
loc_admin_1ProvinceProvinceEquateur, Orientale, Katanga, KinshasaNaNequateur=None, kinshasa=None, katanga=None, or...
country_iso3NoneNaNNaNNaNNaN
notification_dateNotification DateDateNotificationNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " source_description source_field \\\n", + "target_field \n", + "identity Identity Identité \n", + "name Full Name Nom complet \n", + "loc_admin_1 Province Province \n", + "country_iso3 None NaN \n", + "notification_date Notification Date DateNotification \n", + "\n", + " common_values target_values \\\n", + "target_field \n", + "identity NaN NaN \n", + "name NaN NaN \n", + "loc_admin_1 Equateur, Orientale, Katanga, Kinshasa NaN \n", + "country_iso3 NaN NaN \n", + "notification_date NaN NaN \n", + "\n", + " value_mapping \n", + "target_field \n", + "identity NaN \n", + "name NaN \n", + "loc_admin_1 equateur=None, kinshasa=None, katanga=None, or... \n", + "country_iso3 NaN \n", + "notification_date NaN " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mapper = autoparser.Mapper(\"../../tests/schemas/animals.schema.json\", dd_described, \"fr\", api_key=API_KEY, config=config_path)\n", + "mapping_dict = mapper.create_mapping(file_name='example_mapping.csv')\n", + "\n", + "mapping_dict.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "At this point, you should inspect the mapping file and look for fields/values that have been incorrectly mapped, and edit them where necessary.\n", + "The mapping file has been written out to [example_mapping.csv](example_mapping.csv). A good example is the 'loc_admin_1' field; the LLM often maps the common values provided to 'None' as the schema denotes this as a free-text field. Instead, delete these mapped values and the parsed data will contain the original free text.\n", + "Also note the warning above; the LLM should not have found fields to map to the 'country_iso3' or 'owner' fields. If the original data did contain an appropriate field for these, you should edit the mapping file accordingly." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once you have edited the mapping file to your satisfaction, we can go ahead and create the TOML parser file, `example_parser.toml`:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:root:Missing required field country_iso3 in animals schema. Adding empty field...\n" + ] + } + ], + "source": [ + "writer = autoparser.ParserGenerator(\"example_mapping.csv\", \"../../tests/schemas\", \"example\", config=config_path)\n", + "writer.create_parser(\"example_parser.toml\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can veiw/edit the created parser at [example_parser.toml](example_parser.toml), and try it out using [ADTL](https://github.com/globaldothealth/adtl)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/getting_started/index.md b/docs/getting_started/index.md index 54ad793..ac9ebff 100644 --- a/docs/getting_started/index.md +++ b/docs/getting_started/index.md @@ -16,4 +16,25 @@ uv sync . 
.venv/bin/activate ``` -To view and use the CLI, you can type `autoparser` into the command line to view the options available. +To view and use the CLI, you can type `autoparser` into the command line to view the +options available. + +## Other requirements + +AutoParser relies on LLMs to automatically map raw data fields to a target schema. +In order to use this tool, you will need an API key for either [OpenAI](https://platform.openai.com/docs/quickstart/create-and-export-an-api-key) +or Google's [Gemini](https://aistudio.google.com/apikey) [Dev note: work in progress!]. +AutoParser will use either OpenAI's `gpt-4o-mini`, or Google's `gemini-1.5-flash`. + +The LLM should *never* see your raw data; only the data dictionary, which contains +column headers and text descriptions of what each field should contain. + +### Supported file formats +AutoParser supports CSV and XLSX formats for raw data and data dictionary files, and either +JSON or TOML for the target schema. + +## Quickstart + +See the example notebook [here](../examples/example.ipynb) for a basic walk-through of the +functionality of AutoParser. + diff --git a/docs/index.md b/docs/index.md index 3a3f748..b1093e9 100644 --- a/docs/index.md +++ b/docs/index.md @@ -2,10 +2,7 @@ AutoParser is a tool for semi-automated data parser creation. The package allows you to generate a new data parser for converting your source data into a new format specified using a schema file, ready to use with the data transformation tool [adtl](https://adtl.readthedocs.io/en/latest/index.html). - -## Key Features -- Data Dictionary Creation: Automatically create a basic data dictionary framework -- Parser Generation: Generate data parsers to match a given schema +Both programmatic and CLI usage are supported. ## Framework ```{image} images/flowchart.png Flowchart showing the inputs (bright blue), outputs (green blocks) and functions ``` ## Documentation + +How to get started with AutoParser, and guides to its key usage patterns + ```{toctree} --- maxdepth: 2 -caption: Contents: +caption: Documentation --- -self getting_started/index -usage/data_dict -usage/parser_generation +usage/index ``` + +## Example pages + +Examples demonstrating usage of this tool + +```{toctree} +--- +maxdepth: 2 +caption: Examples +--- +examples/example +examples/cli_example +``` + +## API reference + +Documents the full API + +```{toctree} +--- +maxdepth: 2 +caption: API +--- +api/index +``` \ No newline at end of file diff --git a/docs/usage/data_dict.md b/docs/usage/data_dict.md index 56b94fb..92b70d6 100644 --- a/docs/usage/data_dict.md +++ b/docs/usage/data_dict.md @@ -7,13 +7,13 @@ It should contain, at minimum, a list of field/column names, and some kind of description of what data each field holds. This often takes the form of a textual description, plus a note of the data type (text, decimals, date, boolean...) and/or a set of expected values. -A data dictionary is required by AutoParser for (parser generation)[parser_generation]. +A data dictionary is required by AutoParser for [parser generation](parser_generation). This is to avoid having to send potentially sensitive or confidential data to an external body (in this case an externally hosted LLM); instead a *description* of what the data looks like from the dictionary can be sent to the LLM, which allows for mapping to occur without risking the unintentional release of data. 
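To make this concrete, here is a minimal sketch of the dictionary workflow in Python, mirroring the example notebook elsewhere in this PR (file and config paths are illustrative):

```python
import os

import pandas as pd

import autoparser

# The raw data never leaves your machine; only the dictionary built from it
# (field names, types, and descriptions) is later shared with the LLM.
data = pd.read_csv("animal_data.csv")  # illustrative path to your raw data

writer = autoparser.DictWriter("config.toml")  # your autoparser config file
data_dict = writer.create_dict(data)

# Fill the (initially empty) Description column using the LLM;
# "fr" is the language the raw data is written in.
dd_described = writer.generate_descriptions(
    "fr", data_dict, key=os.environ["OPENAI_API_KEY"]
)
dd_described.to_csv("animal_dd.csv", index=False)
```

The resulting CSV can be reviewed and hand-edited before it is used for parser generation.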
-Many data capture services such as (REDCaP)[https://projectredcap.org/] will generate +Many data capture services such as [REDCap](https://projectredcap.org/) will generate a data dictionary automatically when surveys are set up. However, where data is being captured either rapidly, or by individuals/small teams, a formal data dictionary may not have been created for a corresponding dataset. For this scenario, AutoParser provides @@ -54,10 +54,8 @@ to generate a data parser. ```{eval-rst} .. autofunction:: autoparser.create_dict - :noindex: .. autofunction:: autoparser.generate_descriptions - :noindex: ``` diff --git a/docs/usage/index.md b/docs/usage/index.md new file mode 100644 index 0000000..7b7293b --- /dev/null +++ b/docs/usage/index.md @@ -0,0 +1,8 @@ +# Usage + +These sections describe the key usage patterns for AutoParser. + +```{toctree} +data_dict +parser_generation +``` \ No newline at end of file diff --git a/docs/usage/parser_generation.md b/docs/usage/parser_generation.md index e5b17a8..e4a97ba 100644 --- a/docs/usage/parser_generation.md +++ b/docs/usage/parser_generation.md @@ -1,13 +1,13 @@ # Write a Data Parser -AutoParser assumes the use of Global.Health's (adtl)[https://github.com/globaldothealth/adtl] +AutoParser assumes the use of Global.Health's [adtl](https://github.com/globaldothealth/adtl) package to transform your source data into a standardised format. To do this, adtl requires a -(TOML)[https://toml.io/en/] specification file which describes how raw data should be +[TOML](https://toml.io/en/) specification file which describes how raw data should be converted into the new format, on a field-by-field basis. Every unique data file format (i.e. unique sets of fields and data types) should have a corresponding parser file. AutoParser exists to semi-automate the process of writing new parser files. This requires -a data dictionary (which can be created if it does not already exist, see [data_dict]), +a data dictionary (which can be created if it does not already exist, see '[Create Data dictionary](data_dict)'), and the JSON schema of the target format. Parser generation is a 2-step process. 
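In code, the two steps look like this; a minimal sketch following the example notebook, with illustrative file paths and names:

```python
import os

import pandas as pd

import autoparser

dd = pd.read_csv("animal_dd.csv")  # data dictionary with English descriptions

# Step 1: ask the LLM to draft the intermediate mapping CSV, then inspect
# and hand-correct that file before moving on.
mapper = autoparser.Mapper(
    "animals.schema.json",  # JSON schema of the target format
    dd,                     # data dictionary as a DataFrame
    "fr",                   # language of the source data
    api_key=os.environ["OPENAI_API_KEY"],
    config="config.toml",
)
mapper.create_mapping(file_name="animal_mapping.csv")

# Step 2: write the TOML parser from the checked mapping file.
generator = autoparser.ParserGenerator(
    "animal_mapping.csv",
    "schemas",       # folder containing the target schema(s)
    "animals",       # name of the parser
    config="config.toml",
)
generator.create_parser("animal_parser.toml")
```

The generated `animal_parser.toml` can then be run through adtl against the raw data.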
diff --git a/src/autoparser/create_mapping.py b/src/autoparser/create_mapping.py index 3c4ea93..ab2e7be 100644 --- a/src/autoparser/create_mapping.py +++ b/src/autoparser/create_mapping.py @@ -67,6 +67,7 @@ def __init__( @property def target_fields(self) -> list[str]: + """Returns a list of fields in the target schema""" try: return self._target_fields except AttributeError: @@ -75,6 +76,7 @@ def target_fields(self) -> list[str]: @property def target_types(self) -> dict[str, list[str]]: + """Returns the field types of the target schema""" try: return self._target_types except AttributeError: @@ -86,6 +88,7 @@ def target_types(self) -> dict[str, list[str]]: @property def target_values(self) -> pd.Series: + """Returns the enum values or boolean options for the target schema""" try: return self._target_values except AttributeError: @@ -106,6 +109,10 @@ def _value_options(f): @property def common_values(self) -> pd.Series: + """ + Returns the commonly repeated values in the source data. + Usually this indicates that the source field is an enum or boolean. + """ try: return self._common_values except AttributeError: diff --git a/src/autoparser/make_toml.py b/src/autoparser/make_toml.py index 1df1cee..a776edc 100644 --- a/src/autoparser/make_toml.py +++ b/src/autoparser/make_toml.py @@ -111,7 +111,8 @@ def __init__( ) @property - def parsed_choices(self): + def parsed_choices(self) -> pd.Series: + """Returns the mapped values for each target field""" try: return self._parsed_choices except AttributeError: @@ -124,7 +125,8 @@ def _parse_choices(s: str): return self._parsed_choices @property - def references_definitions(self): + def references_definitions(self) -> tuple[dict[str, str], dict[str, dict]]: + """Finds and returns the references and definitions for the mappings""" try: return self._references_definitions except AttributeError: @@ -158,11 +160,11 @@ def references_definitions(self): return self._references_definitions def schema_fields(self, table: str): - "Returns all the fields for `table` and their properties" + """Returns all the fields for `table` and their properties""" return self.schemas[table]["properties"] - def single_field_mapping(self, match: pd.core.frame.pandas) -> dict[str, Any]: - "Make a single field mapping from a single row of the mappings dataframe" + def single_field_mapping(self, match: pd.DataFrame) -> dict[str, Any]: + """Make a single field mapping from a single row of the mappings dataframe""" choices = self.parsed_choices[match.target_field] @@ -178,7 +180,7 @@ def single_field_mapping(self, match: pd.core.frame.pandas) -> dict[str, Any]: return out def make_toml_table(self, table: str) -> dict[str, Any]: - "Make single TOML table from mappings" + """Make single TOML table from mappings""" outmap = {} @@ -289,10 +291,11 @@ def create_parser( def main(): parser = argparse.ArgumentParser( - description="Make TOML from intermediate CSV file created by create_mapping.py" + description="Make TOML from intermediate CSV file created by create_mapping.py", + prog="autoparser create-parser", ) parser.add_argument("mappings", help="Mapping file to create TOML from", type=str) - parser.add_argument("schema", help="Path where schemas are located") + parser.add_argument("schema_path", help="Path where schemas are located") parser.add_argument( "-n", "--name",