diff --git a/CHANGELOG.md b/CHANGELOG.md index f9b73faa7..b57b4aec6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,15 @@ +## 0.13.0 (2025-01-24) + +### Feat + +- ✨ output everything from code cells in Reference docs (#997) + +## 0.12.2 (2025-01-24) + +### Refactor + +- :recycle: `path_package_properties()` -> `path_properties()` (#996) + ## 0.12.1 (2025-01-23) ### Refactor diff --git a/_quarto.yml b/_quarto.yml index c16512b4c..2fbadd8f8 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -52,9 +52,8 @@ website: - section: "Architecture" href: docs/design/architecture/index.qmd contents: - - docs/design/architecture/requirements.qmd - docs/design/architecture/naming.qmd - - docs/design/architecture/modular-design.qmd + - docs/design/architecture/input-data.qmd - section: "Interface" href: docs/design/interface/index.qmd contents: @@ -94,9 +93,9 @@ quartodoc: desc: "Functions to work with and manage data resources found within a data package." package: "seedcase_sprout.core" contents: - - write_resource_properties - create_resource_properties - create_resource_structure + - write_resource_properties - subtitle: "Property dataclasses" desc: "Dataclasses that support creating correct data package properties." 
diff --git a/_renderer.py b/_renderer.py index cac1d6803..c882a1e18 100644 --- a/_renderer.py +++ b/_renderer.py @@ -12,7 +12,20 @@ class Renderer(MdRenderer): - style = "fix-output-of-returns-and-raises-without-names" + style = "seedcase" + + @dispatch + def render_header(self, el: layout.Doc) -> str: + """Render the header of a docstring, including any anchors.""" + _str_dispname = el.name + + _anchor = f"{{ #{el.obj.path} }}" + + # For lvl 1 headers, add a yml header with the ipynb-shell-interactivity setting + # to get all output from the cell + if self.crnt_header_level == 1: + return f"---\nipynb-shell-interactivity: all\ntitle: {_str_dispname}\n---" + return f"{'#' * self.crnt_header_level} {_str_dispname} {_anchor}" # returns ---- diff --git a/docs/design/architecture/input-data.qmd b/docs/design/architecture/input-data.qmd new file mode 100644 index 000000000..bfc9ad048 --- /dev/null +++ b/docs/design/architecture/input-data.qmd @@ -0,0 +1,96 @@ +--- +title: "Input data" +--- + +The types of data we expect or anticipate to be input into Sprout are +described in this section. We design Sprout with these types of data and +formats in mind. + +## Domain-specific types of data + +Currently, we only have experience with health data, so we have a bias +towards that type of data. + +### Health research + +Health research data tends to consist of these types of data: + +- **Clinical**: This data is typically collected during patient visits + to doctors. Depending on the country or administrative region, there + will likely already be well-established data processing and storage + pipelines in place. +- **Register**: This type of data is highly dependent on the country + or region. Generally, this data is collected for national or + regional administrative purposes, such as recording employment + status, income, address, medication purchases, and diagnoses. 
Like + for routine clinical data, the pipelines in place for processing and + storing this data are usually very extensive and well + established. +- **Biological sample data**: This type of data is generated from + biological samples, like blood, saliva, semen, hair, or urine. Data + generated from sample analytic techniques often produce large + volumes of data per person. Samples may be generated in larger + established laboratories or in smaller research groups, depending on + what analytic technology is used and how new it is. The structure + and format of the generated data also tend to be highly variable + and depend heavily on the technology used, sometimes requiring + specialized software to process and output. +- **Survey or questionnaire**: This type of data is often collected based + on a given study's aims and research questions. There are hundreds + of different questionnaires that can have highly specific purposes + and uses for their data. They are also highly variable in the volume + of data collected based on the survey, and on the format of the + data. + +## File and data formats + +While we aim to handle a wide variety of data types, we will start with +the most common types of formats. We also have a limitation or +restriction that the data format needs to be open source and not +proprietary, since we cannot process it if we don't have the software to +read it. + +The file formats we expect to work with are text (`.txt`) files, various +forms of comma-separated value (`.csv`) files, Excel (`.xls` or `.xlsx`) +files (technically closed source but practically easy to read), images, +audio, XML, JSON, and potentially some SQL databases. + +## Flow or frequency of data collection + +In research (and even in most industry settings), we rarely encounter +truly real-time data collection. Most data collection is done in +"batches", with data being collected at irregular and inconsistent +intervals and then stored to be processed later. 
This batch +collection can be broken down into two categories based on its +frequency: + +- *Routine or continuous collection*, where data is collected at a + more regular interval and in smaller batches of "observational + units"[^1]. Ingestion or processing of this type of data may happen + on a more regular basis. Clinical data as well as survey or + questionnaire data will likely fall under this category. An example is + data collected on a few patients seen during the day at a clinic. +- *Grouped collection*, where data is collected from many observational + units during a short period of time at very irregular intervals or + potentially only once. Data ingestion or processing occurs some time + after all the data has been collected. Biological sample data + would fall under this category, since laboratories usually run + several samples at once and input data after internal quality + control checks and machine-specific data processing. While + register-based and clinical data usually get collected + continuously, direct access to them is only given on a batch and + infrequent basis, so they may also fall under this category. Survey + data may also come in batches, depending on the questionnaire and + software used for its collection. + +[^1]: Observational unit is the "entity" that the data was collected + from at a given point in time, such as a human participant in a + cohort study or a rat in an animal study at a specific time point. + +Regardless of the flow or frequency of data generation and collection, +the ability to automatically ingest the data into Sprout will vary wildly +based on the data source, the organization that generates the data, and +their technical expertise. Some data sources may have well-established, +but not always programmatic or automatic, workflows and processes. +Others may not have any workflow and it may be an extremely manual +process.
diff --git a/docs/design/architecture/naming.qmd b/docs/design/architecture/naming.qmd index 8ac6b8f47..7df57a192 100644 --- a/docs/design/architecture/naming.qmd +++ b/docs/design/architecture/naming.qmd @@ -63,7 +63,7 @@ We may also occasionally use "properties" to refer to the file itself. | Action | Description | |----------------------------|--------------------------------------------| | create | Create a new object. | -| construct | Construct or reconstruct a data file (like the Parquet file). | +| build | Build implies either creating a new object or recreating an existing one, e.g. (re-)build a file like the README or Parquet file. | | view | View details about an object. | | list | List basic details about many objects. | | edit | Edit an object, specifically the properties object. | diff --git a/docs/design/architecture/runtime-view.qmd b/docs/design/architecture/runtime-view.qmd deleted file mode 100644 index 855b44444..000000000 --- a/docs/design/architecture/runtime-view.qmd +++ /dev/null @@ -1,209 +0,0 @@ ---- -title: "Dynamic runtime behaviour" ---- - -This section describes the behaviour and interactions that individual -components of Sprout have with each other over time. For instance, what -is the sequence of chronological steps that happens when a user inputs -data until the final output of a structured database. "Runtime" in this -case refers to how the software works "in action". - -## Login and Authentication - -Almost all users will need to log into the Sprout-managed Data -Resources. The steps for logging in and having their permission levels -checked follows the sequence described in the figure below. - -![Login and authentication sequence of steps for a registered -user.](images/runtime/runtime-login-sequence.svg) - -For a more general discussion of our authentication strategy and -potential alternatives, see the -[Authentication](security.qmd#authentication) section on the Security -page. 
- -## Data Input - -The overall aim of this section is to describe the general path/sequence -of steps that data takes through a Data Resource, from raw input into -the final output. Specifically, these items are described as: - -- *Input*: Because we currently focus on health research, the type of - input data and metadata would be what is typically generated from - health studies. This could be in the form of e.g., CSV, Excel - sheets, or image files. -- *Output*: The final output is the input data stored together as a - single database, or at least multiple databases and files explicitly - linked in such a way that it conceptually represents a single - database. - -### Expected Type of Input Data - -Given our focus on health data as well as the team's expertise in -research using health data, we make some assumptions about the type of -data that will be input into Sprout. Health data tends to consist of -specific types of data: - -- **Clinical**: This data is typically collected during patient visits - to doctors. Depending on the country or administrative region, there - will likely already be well-established data processing and storage - pipelines in place. -- **Register**: This type of data is highly dependent on the country - or region. Generally, this data is collected for national or - regional administrative purposes, such as, recording employment - status, income, address, medication purchases, and diagnoses. Like - the routine clinical data, the pipelines in place for processing and - storage of this data are usually very extensive and well - established. -- **Biological sample data**: This type of data is generated from - biological samples, like blood, saliva, semen, hair, or urine. Data - generated from sample analytic techniques often produce large - volumes of data per person. Samples may be generated in larger - established laboratories or in smaller research groups, depending on - what analytic technology is used and how new it is. 
The structure - and format of the generated data also tends to be highly variable - and depends heavily on the technology used, sometimes requiring - specialized software to process and output. -- **Survey or questionnaire**: This type of data is often done based - on a given study's aims and research questions. There are hundreds - of different questionnaires that can have highly specific purposes - and uses for their data. They are also highly variable in the volume - of data collected based on the survey, and on the format of the - data. - -These types of input data are formatted in a wide variety of files, -including as text (`.txt`) files, comma-separated value (`.csv`) files, -Excel (`.xls` or `.xlsx`) files, as well as other proprietry formats. - -### Expected Flow of Input Data - -The above described data tends to fit into, mostly, two categories for -data input. - -- *Routine or continuous collection*, where ingested data into Sprout - would occur as soon as the data was collected from one - "observational unit"[^1] or very shortly afterwards. Clinical data - as well as survey or questionnaire data may likely fall under this - category. -- *Batch collection*, where ingested data occurs some time after the - data was collected and from multiple observational units. Biological - sample data would fall under this category, since laboratories - usually run several samples at once and input data after internal - quality control checks and machine-specific data processing. While - register-based and clinical data does get collected continuously, - direct access to it is only given on an batch and infrequent basis. - Survey data may also come in batches, depending on the questionnaire - and software used for its collection. - -[^1]: Observational unit is the "entity" that the data was collected - from at a given point in time, such as a human participant in a - cohort study or a rat in an animal study at a specific time point. 
- -For sources of data from routine collection with well-established data -input processes, the data input pipeline would likely involve -redirecting these data sources from their generation into Sprout via a -direct call to the API so the data continues on to the backend and -eventual data storage. - -Sources of data that don't have well-established data input processes, -such as data from hospitals or medical laboratories, would need to use -the Sprout data batch-input Web Portal. This Portal only accepts data -that is in a pre-defined format (as determined and created by the Data -Management Administrators) that includes documentation, and potentially -automation scripts on how to pre-process the data prior to uploading it. - -These uploaded files might be a variety of file types, like `.csv`, -`.xls`, or `.txt`). Only users with the correct permission levels are -allowed to upload data. It will be the Data Access Administrator who -will be doing the initial upload, as that will entail setting up table -schemas and allocating space in the raw data file storage. The second -way of getting data into the Data Resource is by manually entering it by -an authorized Data Contributor. - -Once the data is submitted through the Portal, it is sent in an -encrypted, legally-compliant format to a server and stored in the way -defined by the API and data model. - -### Upload Data to Sprout - -An approved user, i.e., a Data Access Administrator or a Data -Contributor, will open the login screen in the Web Portal. They will -enter their credentials which will be transmitted to the API layer. The -API Security layer will check with the list of users and permissions in -the database and confirm that the specific user has permission to enter -data into a specific table (or set of tables) in the database. - -Once this check is complete the frontend will receive permission from -the API Security layer to display the data entry/upload options for this -kind of user role. 
- -Before any of the actions described below can be done, it is expected -that appropriate table schemas or entry forms have been created by one -or more administrators of the system. This process is described -elsewhere. - - - -#### Batch Upload of Data - - - -The user has selected an existing table schema to use, and has uploaded -the file to the holding area. This prompts the system to check that the -data in the file match the schema in the database on headers and data -type. If this validation is successful then the system will inform the -user about how many rows of data it found and has validated. If the user -is in agreement, then the system will write the data into the relevant -table and display a confirmation back to the user. Should the user -disagree with the number of rows then they can cancel the upload and -investigate issues within the file, which is an action that happens -outside of Sprout. - -![Logged in user chooses to use the batch upload function with an -existing table -schema.](images/user-flow/user-flow-data-upload.svg){#fig-batch-data-entry} - - - -#### Manual Data Entry: Done in One Session - -The user completes all fields in the form and clicks "Save and Submit". -This sends the data to the API layer where it is confirmed as valid, -parcelled up and submitted to the database. The database will then write -the data into a new record in the table (or tables). Once done the -database will confirm successful entry of data to the API which will in -turn send the confirmation back to the user via the frontend. - -![Logged in user manually writes a new row to the Data -Resource.](images/runtime/runtime-manual-data-entry.svg){#fig-manual-data-entry} - -#### Manual Data Entry: Done in Multiple Sessions - -When a user can't finish inputting data in one session, they can save -the "state" of inputted data to return to it later. 
Much of the initial -workflow is the same as above, until the user is interrupted and selects -"Save" instead of "Save and Submit". This will send the data to the API -with a flag showing that fields may be incomplete, thus preventing the -API from rejecting the data due to NULL values. The API will submit the -data to the database along with the incomplete flag. - -When the Data Contributor goes back to the data entry at a later time, -they will be presented with the option of completing any incomplete -records as well as entering new data. If they click on "Complete -Records" they are shown the records that they have started but not -submitted. Once they select a partially completed record the frontend -will request the currently completed items from the database via the API -layer before displaying the entry form with the completed fields. - -Once the user has completed more data they can either click on "Save" or -"Save and Submit". The first option will put them back to the top of -this workflow, the second will send the data back to the API layer for -validation. Once the data is validated it will be submitted to the -database. The database will then write the data into a new record in the -table (or tables) and update the flag to show the record is complete. -Once done the database will confirm successful entry of data to the API -which will in turn send the confirmation back to the user via the front -end. 
- -![Logged in user enters data manually in more than one -session.](images/runtime/runtime-manual-data-update.svg){#fig-manual-data-update} diff --git a/docs/design/interface/images/core/create-package-structure.puml b/docs/design/interface/images/core/create-package-structure.puml deleted file mode 100644 index 169740872..000000000 --- a/docs/design/interface/images/core/create-package-structure.puml +++ /dev/null @@ -1,93 +0,0 @@ -@startuml create-package-structure -!theme seedcase from https://raw.githubusercontent.com/seedcase-project/seedcase-theme/main - -state "Input" as input { - input: - Use `path_*()` functions to help\ncreate correct paths -' Arguments ----- - path : - Path to where packages are stored -} - -state "Output" as output { - output : - The package parent folder is shown as / - state "/packages//datapackage.json" as properties_file - state "/packages//README.md" as readme -} - -state "create_package_structure()" as cps { - cps : - Required arg: path - - state arg <> - input --> arg - - state "verify_is_dir()" as vid - vid : - Required arg: path - vid : - Output: path or Error - arg --> vid - - state "get_ids()" as gi - gi : - Required arg: path - gi : - Output: list of existing IDs\n (otherwise an empty list) - vid --> gi - - state "create_next_id()" as cni - cni : - Required arg: list of existing IDs - cni : - Output: ID - gi --> cni - - state "create_id_path()" as cip - cip : - Required arg: path and ID to assign for directory - cip : - Output: path of created directory - cni --> cip - arg --> cip - - state "create_dirs()" as cd - cd : - Required arg: A list of paths - cd : - Output: A list of paths\n of created directories - cip --> cd - - state "create_default_package_properties()" as cdpp - cdpp : - Output: JSON object - cdpp : - Uses properties dataclasses\n internally and sets default values - state "create_readme_text()" as crt - crt : - Required arg: JSON object - crt : - Output: string - cdpp --> crt - - state "create_readme_path()" 
as crp - crp : - Required arg: path - crp : - Output: path to README - cd --> crp - - state "create_properties_path()" as cprop - cprop : - Required arg: path - cprop : - Output: path to properties - cd --> cprop - - state "write_file()" as wf - wf : - Required args: string (including JSON\n as a string) and path - wf : - Output: path of created file - wf : - Only one string and path at a time - wf : - Used multiple times for each write\n operation - - state readme_input <> - state prop_input <> - state as_list <> - state return <> - - cdpp --> prop_input - crt --> readme_input - cprop --> prop_input - crp --> readme_input - - prop_input --> wf : input to write\nproperties file - readme_input --> wf : input to write\nreadme file - - wf --> as_list : Convert to list - as_list --> return - return --> output - - crp -[hidden]> cdpp - cdpp-[hidden]> cprop -} - -@enduml diff --git a/docs/design/interface/images/core/create-package-structure.svg b/docs/design/interface/images/core/create-package-structure.svg deleted file mode 100644 index dca6cd89a..000000000 --- a/docs/design/interface/images/core/create-package-structure.svg +++ /dev/null @@ -1 +0,0 @@ -create_package_structure()- Required arg: pathargverify_is_dir()- Required arg: path- Output: path or Errorget_ids()- Required arg: path- Output: list of existing IDs(otherwise an empty list)create_next_id()- Required arg: list of existing IDs- Output: IDcreate_id_path()- Required arg: path and ID to assign for directory- Output: path of created directorycreate_dirs()- Required arg: A list of paths- Output: A list of pathsof created directoriescreate_default_package_properties()- Output: JSON object- Uses properties dataclassesinternally and sets default valuescreate_readme_text()- Required arg: JSON object- Output: stringcreate_readme_path()- Required arg: path- Output: path to READMEcreate_properties_path()- Required arg: path- Output: path to propertieswrite_file()- Required args: string (including JSONas a string) 
and path- Output: path of created file- Only one string and path at a time- Used multiple times for each writeoperationreturnInput- Use `path_*()` functions to helpcreate correct pathspath- Path to where packages are storedOutput- The package parent folder is shown as //packages/<id>/datapackage.json/packages/<id>/README.mdinput to writeproperties fileinput to writereadme fileConvert to list diff --git a/docs/design/interface/images/core/edit-package-properties.puml b/docs/design/interface/images/core/edit-package-properties.puml deleted file mode 100644 index 30388b426..000000000 --- a/docs/design/interface/images/core/edit-package-properties.puml +++ /dev/null @@ -1,57 +0,0 @@ -@startuml edit-package-properties -!theme seedcase from https://raw.githubusercontent.com/seedcase-project/seedcase-theme/main - -state "Input" as input { - input : - `path_*()` functions assist with giving correct paths -' Arguments ----- - path : - Path to `datapackage.json` file\n in the package `` folder - path : - User may or may not have directly\n edited the JSON file - properties: - JSON object of the package\n following the Data Package spec -} - -state "Output" as output { - state "JSON" as json_output - json_output : - JSON object of the package following\n the Data Package spec - json_output : - Use `write_package_properties()` to\n save back to `datapackage.json` -} - -state "edit_package_properties()" as cps { - cps : - Required arg: path and properties - cps : - Takes the current properties file and updates\n the properties with the given fields - - state arg_path <> - state arg_properties <> - - path --> arg_path : path object - properties --> arg_properties : JSON object - - state "verify_is_file()" as vif - vif : - Required arg: path - vif : - Output: path or Error - arg_path --> vif - - state "check_package_properties()" as cpp - cpp : - Required arg: properties - cpp : - Output: JSON or Error - cpp : - Checks that all fields\n and values match spec - cpp : - Checks that 
required\n fields are present - arg_properties --> cpp - - state "read_properties()" as rp - rp : - Required arg: path - rp : - Output: JSON - vif --> rp : path object - - state "join_properties()" as jp - jp : - Required args: current_properties\n and new_properties - jp : - Output: JSON - jp : - New properties overwrite old ones - rp --> jp : JSON object - cpp --> jp : JSON object - - state "return" <> - jp --> return : JSON object - return --> output -} - -@enduml diff --git a/docs/design/interface/images/core/edit-package-properties.svg b/docs/design/interface/images/core/edit-package-properties.svg deleted file mode 100644 index cba5086d4..000000000 --- a/docs/design/interface/images/core/edit-package-properties.svg +++ /dev/null @@ -1 +0,0 @@ -Input- `path_*()` functions assist with giving correct pathsedit_package_properties()- Required arg: path and properties- Takes the current properties file and updatesthe properties with the given fieldspath- Path to `datapackage.json` filein the package `<id>` folder- User may or may not have directlyedited the JSON fileproperties- JSON object of the packagefollowing the Data Package specarg_patharg_propertiesverify_is_file()- Required arg: path- Output: path or Errorcheck_package_properties()- Required arg: properties- Output: JSON or Error- Checks that all fieldsand values match spec- Checks that requiredfields are presentread_properties()- Required arg: path- Output: JSONjoin_properties()- Required args: current_propertiesand new_properties- Output: JSON- New properties overwrite old onesreturnOutputJSON- JSON object of the package followingthe Data Package spec- Use `write_package_properties()` tosave back to `datapackage.json`path objectJSON objectpath objectJSON objectJSON objectJSON object diff --git a/docs/design/interface/python-functions.qmd b/docs/design/interface/python-functions.qmd index f03ad36d4..43629dfe3 100644 --- a/docs/design/interface/python-functions.qmd +++ 
b/docs/design/interface/python-functions.qmd @@ -5,33 +5,32 @@ callout-appearance: "minimal" --- ::: {.callout-important appearance="default" icon="true"} -We created this document, and especially the function diagrams, mainly -as a way to help us as a team all understand and agree on what we're -making and what needs to be worked on. Which means that the descriptions -and explanations of these functions, in particular the diagrams, will -likely change quite a bit and may even be deleted later when they are no -longer needed. +We created this document mainly as a way to help us as a team all +understand and agree on what we're making and what needs to be worked +on. Which means that the descriptions and explanations of these +functions will likely change quite a bit and may even be deleted later +when they are no longer needed. ::: Based on the [naming scheme](/docs/design/architecture/naming.qmd) and the [Frictionless Data Package standard](https://datapackage.org), these -are the core external-facing functions in Sprout, which are stored in -`sprout/core/`. See the [Outputs](outputs.qmd) section for an overview -and explanation of the different outputs provided by Sprout. +are the core external-facing functions in Sprout. See the +[Outputs](outputs.qmd) section for an overview and explanation of the +different outputs provided by Sprout. There are some small differences between the naming scheme and the functions described here: 1. Whenever `view` is used, it is always for one object (either a package or a resource). Whenever `list` is used, it is always to - show basic details of all existing objects of a given type (packages - or resources). Both `view` and `list` only ever show the information + show basic details of all existing objects of a given type (e.g. + resources). Both `view` and `list` only ever show the information contained in the `datapackage.json` and never show actual data. 
We - don't provide the ability to view data directly since it is [out of - scope](index.qmd#goals) of Sprout and we want to minimise any - security and privacy risks. The IT admin and data package admin - users can always view the data directly (outside of Sprout) as they - will have the appropriate IT and legal permissions. + don't provide the ability for Sprout to show personally-identifiable + data directly since it is [out of scope](index.qmd#goals) of Sprout + and we want to minimise any security and privacy risks. Those with + at least read access to the filesystem will still be able to see the + data directly. 2. `edit` only ever edits the `datapackage.json` which contains the properties/metadata of the package or resource(s) but never edits the data itself. If there is a need for updates to the data itself, @@ -52,29 +51,23 @@ functions to get the correct path object for the specific function. It's designed this way to make it more flexible to where individual packages and resources are stored and to make it a bit easier to write tests for the functions. For a similar reason, most of the functions output either -a JSON file or path object to make them easier to test. +a `dict` Python object, a custom `Properties` dataclass, or a path +object to make them easier to test. -Several of the functions have an optional argument called `properties`. -The properties argument is a list of key-value pairs (as a JSON object) -that describe the package and resource(s) in the package. This metadata -is stored in the `datapackage.json` file and follows the Frictionless -Data specification. +Several of the functions have an argument called `properties`. The +properties argument is a list of key-value pairs (as a JSON-style `dict` +object), built using the `Properties` object, that describes the package +and resource(s) in the package. This metadata is stored in the +`datapackage.json` file and follows the Frictionless Data specification. 
-## Data package functions - -::: {.callout-note collapse="true"} -### `list_packages(path)` - -This function lists some basic details contained in the -`datapackage.json` files of all data packages found in the `path` -location. This would be a list, showing very basic information that -would be allowed by privacy and legal regulations (e.g. the package ID -as well as the name, description, and contact persons for each package). -Use `path_packages()` to provide the path to the place packages are -stored by default. Outputs a JSON object by default. +::: {.callout-important appearance="default" icon="true"} +Functions shown in orange are not yet implemented while those in blue +are implemented. ::: -::: {.callout-note collapse="true"} +## Data package functions + +::: {.callout-warning collapse="true"} ### `view_package_properties(path)` This will show the information contained within the `datapackage.json` @@ -84,57 +77,53 @@ and only basic details of the resources within the package. Use object. ::: -::: {.callout-note collapse="true"} -### `create_package_structure(path)` +::: {.callout-warning collapse="true"} +### `create_package_properties(properties)` This is the first function to use to create a new data package. It -assigns a package ID and then creates a package folder and all the -necessary files for a package (excluding the resources), as described in -the [Outputs](outputs.qmd) section. Creates the files and then outputs -the file paths for the created files. Use `path_packages()` to provide -the correct path location to create this structure. +generates a template for the `datapackage.json` file where the user +needs to fill in required fields by providing the `properties` argument +using the helper `PackageProperties` class. Outputs a full Properties +object. Use `write_package_properties()` to store the properties to the +`datapackage.json` file. 
+::: -![Diagram showing the internal function flow of the -`create_package_structure()` -function.](images/core/create-package-structure.svg){#fig-create-package-structure -fig-alt="A Plant UML schematic of the detailed code flow within the `create_package_structure()` function."} +::: {.callout-warning collapse="true"} +### `build_readme_text(properties)` + +Using a template, this will build a README file with the contents of the +properties object in a human-readable format. Outputs a text string. Use +`write_text()` to save the text to the `README.md` file. ::: ::: {.callout-note collapse="true"} ### `edit_package_properties(path, properties)` -See the help documentation with `help(edit_package_properties())` for more details. - -![Diagram showing the internal function flow of the -`edit_package_properties()` -function.](images/core/edit-package-properties.svg){#fig-edit-package-properties -fig-alt="A Plant UML schematic of the detailed code flow within the `edit_package_properties()` function."} +See the help documentation with `help(edit_package_properties())` for +more details. ::: -::: {.callout-note collapse="true"} +::: {.callout-warning collapse="true"} ### `delete_package(path, confirm)` Completely delete a specific package and all it's data resources. Because this action would be permanent, the `confirm` argument would default to `false` so that the user needs to explicitly provide `true` to the function argument as confirmation. This is done to prevent -accidental deletion. Use `path_package()` function in the `path` to get -the correct location. Outputs `true` if the deletion was successful. +accidental deletion. Outputs `true` if the deletion was successful. ::: -::: {.callout-note collapse="true"} +::: {.callout-warning collapse="true"} ### `write_package_properties(properties, path)` -Writes JSON object containing package properties (not including resource -properties) back to the `datapackage.json` file. 
The `path` argument is -the location of the `datapackage.json` file. Use the `path_properties()` -function to provide this path to the correct location. Returns the same -path object as given in the `path` argument. - -![Diagram showing the internal function flow of the -`write_package_properties()` -function.](images/core/write-package-properties.svg){#fig-write-package-properties -fig-alt="A Plant UML schematic of the detailed code flow within the `write_package_properties()` function."} +Writes the properties object containing package properties (with or +without resource properties) back to the `datapackage.json` file. The +`path` argument is the path to the folder that will have or currently +has the `datapackage.json` file. The `path` argument is to the existing +but empty folder or to a folder that doesn't exist but will be created +by this function. Will overwrite the `datapackage.json` file by default +if the file already exists. Returns the same path object as given in the +`path` argument. ::: ## Data resource functions @@ -428,6 +417,38 @@ print(path_resource_data(4, 2)) ``` ::: +## Properties dataclasses + +These dataclasses contain an explicit, structured set of official +properties defined within a data package. The main purpose of these is +to allow us to pass structured properties objects between functions. +They also enable users to create valid properties objects more easily +and get an overview of optional and required class fields. + +::: {.callout-note collapse="true"} +### `PackageProperties` + +See the help documentation with `help(PackageProperties())` for more +details on the properties. +::: + +## Properties functions + +::: {.callout-warning collapse="true"} +### `read_properties(path)` + +Reads the `datapackage.json` file, checks that it is correct, and then +outputs a `PackageProperties` object.
+::: + +::: {.callout-warning collapse="true"} +### `view_properties(path)` + +Reads the `datapackage.json` file, checks that it is correct, and then +outputs a human-readable version of the properties as a string. +::: + + ::: {.callout-note collapse="true"} ### `path_properties(package_id)` @@ -487,12 +508,13 @@ print(path_package(10)) ::: {.callout-note collapse="true"} ### `path_sprout_global()` -If the `SPROUT_GLOBAL` environment variable isn't provided, this function -will return the default path to where data packages will be stored. The -default locations are dependent on the operating system. This function -also creates the necessary directory if it doesn't exist. +If the `SPROUT_GLOBAL` environment variable isn't provided, this +function will return the default path to where data packages will be +stored. The default locations are dependent on the operating system. +This function also creates the necessary directory if it doesn't exist. -![Diagram showing the internal function flow of the `path_sprout_global()` +![Diagram showing the internal function flow of the +`path_sprout_global()` function.](images/core/path-sprout-global.svg){#fig-path-sprout-global fig-alt="A Plant UML schematic of the detailed code flow within the `path_sprout_global()` function."} @@ -507,130 +529,6 @@ print(path_sprout_global()) ``` [PosixPath('~/.sprout')] ``` -::: - -## Properties dataclasses - -These dataclasses contain an explicit, structured set of official -properties defined within a data package. The main purpose of these is -to allow us to pass structured properties objects between functions. -They also enable users to create valid properties objects more easily -and get an overview of optional and required class fields. - -::: {.callout-note collapse="true"} -### `PackageProperties(...)` - -Creates a dataclass object with all the necessary properties for the top -level metadata of a data package.
- -#### Example usage - -``` python -print(PackageProperties()) -``` - - - -``` -PackageProperties(title=None, description=None, licenses=None, contributors=None, resources=None) -``` - -``` python -print(PackageProperties(title="Diabetes Cohort")) -``` - -``` -PackageProperties(title="Diabetes Cohort", description=None, licenses=None, contributors=None, resources=None) -``` - -``` python -print(PackageProperties(licenses=[LicenseProperties(name="ODC-BY-1.0")])) -``` - -``` -PackageProperties(title=None, description=None, licenses=[LicenseProperties(name="ODC-BY-1.0")], contributors=None, resources=None) -``` -::: - -::: {.callout-note collapse="true"} -### `ResourceProperties(...)` - -Creates a dataclass object with all the necessary properties for a -resource, which would be given in the `resources` field of a -`PackageProperties` object. - -#### Example usage - -``` python -print(ResourceProperties()) -``` - -``` -ResourceProperties(name=None, description=None, path=None, schema=None) -``` - -``` python -print(ResourceProperties(name="Blood Samples")) -``` - -``` -ResourceProperties(name="Blood Samples", description=None, path=None, schema=None) -``` -::: - -::: {.callout-note collapse="true"} -### `ContributorProperties(...)` - -Creates a dataclass object with all the necessary properties for a -contributor. This would be given in the `contributors` field of a -`PackageProperties` object. - -#### Example usage - -``` python -print(ContributorProperties()) -``` - -``` -ContributorProperties(title=None, email=None, roles=None) -``` -::: - -::: {.callout-note collapse="true"} -### `LicenseProperties(...)` - -Creates a dataclass object with all the necessary properties for a -license, so that it can be added to the `licenses` field of a -`PackageProperties` object. 
- -#### Example usage - -``` python -print(LicenseProperties()) -``` - -``` -LicenseProperties(name=None, path=None, title=None) -``` -::: - -::: {.callout-note collapse="true"} -### `TableSchemaProperties(...)` - -Creates a dataclass object with all the necessary properties for a table -schema, so that it can be added to the `schema` field of a -`ResourceProperties` object. - -#### Example usage - -``` python -print(TableSchemaProperties()) -``` - -``` -TableSchemaProperties(fields=[], missingValues=[], primaryKey=[], foreignKeys=[]) -``` -::: ## Helper functions diff --git a/poetry.lock b/poetry.lock index 273518cd5..2141902dd 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "annotated-types" diff --git a/pyproject.toml b/pyproject.toml index c72743d78..608983b5c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "seedcase-sprout" -version = "0.12.1" +version = "0.13.0" description = "Grow your research data in a structured, modern way that follows best practices." authors = [ "Luke W. Johnston ", diff --git a/seedcase_sprout/core/__init__.py b/seedcase_sprout/core/__init__.py index 6a478c621..f3e6ffafa 100644 --- a/seedcase_sprout/core/__init__.py +++ b/seedcase_sprout/core/__init__.py @@ -23,8 +23,8 @@ # TODO: Consider having all these in one module. 
from .path_package_functions import ( path_package, - path_package_properties, path_packages, + path_properties, ) from .path_resource_functions import ( path_resource, @@ -94,7 +94,7 @@ # "delete_resource_properties", # Path ----- "path_package", - "path_package_properties", + "path_properties", "path_packages", "path_resource", "path_resource_data", diff --git a/seedcase_sprout/core/create_package_structure.py b/seedcase_sprout/core/create_package_structure.py index 21b3791bc..182245028 100644 --- a/seedcase_sprout/core/create_package_structure.py +++ b/seedcase_sprout/core/create_package_structure.py @@ -32,6 +32,21 @@ def create_package_structure(path: Path) -> list[Path]: NotADirectoryError: If the directory in the path doesn't exist. FileNotFoundError: If a file could not be created. TypeError: If `datapackage.json` could not be created. + + Examples: + ```{python} + import tempfile + from pathlib import Path + + import seedcase_sprout.core as sp + + # Create a temporary directory for the example + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + # Create a package structure + sp.create_package_structure(path=temp_path) + ``` """ check_is_dir(path) ids = get_ids(path) diff --git a/seedcase_sprout/core/create_resource_properties.py b/seedcase_sprout/core/create_resource_properties.py index 99ca160ef..c87ef71e4 100644 --- a/seedcase_sprout/core/create_resource_properties.py +++ b/seedcase_sprout/core/create_resource_properties.py @@ -39,6 +39,31 @@ def create_resource_properties( NotADirectoryError: If path does not point to a directory. NotPropertiesError: If properties are not correct Frictionless resource properties. 
+ + Examples: + ```{python} + import tempfile + from pathlib import Path + + import seedcase_sprout.core as sp + + # Create a temporary directory for the example + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + # Create a package and resource structure first + sp.create_package_structure(path=temp_path) + sp.create_resource_structure(path=temp_path / "1" / "resources") + + # Create resource properties + sp.create_resource_properties( + path=temp_path / "1" / "resources" / "1", + properties=sp.ResourceProperties( + name="new-resource-name", + path="data.parquet", + ), + ) + ``` """ properties = properties.compact_dict check_is_dir(path) diff --git a/seedcase_sprout/core/create_resource_structure.py b/seedcase_sprout/core/create_resource_structure.py index 711b3615c..2d18187cc 100644 --- a/seedcase_sprout/core/create_resource_structure.py +++ b/seedcase_sprout/core/create_resource_structure.py @@ -28,6 +28,24 @@ def create_resource_structure(path: Path) -> list[Path]: Raises: NotADirectoryError: If path is not an existing directory. + + Examples: + ```{python} + import tempfile + from pathlib import Path + + import seedcase_sprout.core as sp + + # Create a temporary directory for the example + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + # Create a package structure first + sp.create_package_structure(path=temp_path) + + # Create a resource structure + sp.create_resource_structure(path=temp_path / "1" / "resources") + ``` """ check_is_dir(path) diff --git a/seedcase_sprout/core/edit_package_properties.py b/seedcase_sprout/core/edit_package_properties.py index 6dbb1c82c..f8c533220 100644 --- a/seedcase_sprout/core/edit_package_properties.py +++ b/seedcase_sprout/core/edit_package_properties.py @@ -47,6 +47,31 @@ def edit_package_properties( ExceptionGroup: If there is an error in the current, incoming or resulting package properties. A group of `CheckError`s, one error for each failed check. 
+ + Examples: + ```{python} + import tempfile + from pathlib import Path + + import seedcase_sprout.core as sp + + # Create a temporary directory for the example + with tempfile.TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + + # Create a package structure first + sp.create_package_structure(path=temp_path) + + # Edit package properties + sp.edit_package_properties( + path=temp_path / "1" / "datapackage.json", + properties=sp.PackageProperties( + title="New Package Title", + name="new-package-name", + description="New Description", + ), + ) + ``` """ properties = properties.compact_dict diff --git a/seedcase_sprout/core/path_package_functions.py b/seedcase_sprout/core/path_package_functions.py index fdb19d7c5..1ff698cea 100644 --- a/seedcase_sprout/core/path_package_functions.py +++ b/seedcase_sprout/core/path_package_functions.py @@ -1,3 +1,9 @@ +"""This module contains functions to get the paths to data packages and their files. + +They are intended to be used in conjunction with other functions to read, write, and +edit the contents and properties of packages. +""" + from pathlib import Path from seedcase_sprout.core.check_is_dir import check_is_dir @@ -15,12 +21,31 @@ def path_package(package_id: int) -> Path: Returns: The absolute path to the specified package. + + Examples: + ```{python} + import os + import tempfile + from pathlib import Path + + import seedcase_sprout.core as sp + + # Create a temporary directory for the example + with tempfile.TemporaryDirectory() as temp_dir: + os.environ["SPROUT_GLOBAL"] = temp_dir + + # Create a package structure first + sp.create_package_structure(path=sp.path_packages()) + + # Get the path to the package + sp.path_package(package_id=1) + ``` """ path = path_packages() / str(package_id) return check_is_package_dir(path) -def path_package_properties(package_id: int) -> Path: +def path_properties(package_id: int) -> Path: """Gets the absolute path to the specified package's properties file. 
Args: @@ -28,6 +53,24 @@ def path_package_properties(package_id: int) -> Path: Returns: The absolute path to the specified package's properties file. + + Examples: + ```{python} + import os + import tempfile + + import seedcase_sprout.core as sp + + # Create a temporary directory for the example + with tempfile.TemporaryDirectory() as temp_dir: + os.environ["SPROUT_GLOBAL"] = temp_dir + + # Create a package structure first + sp.create_package_structure(path=sp.path_packages()) + + # Get the path to the package properties + sp.path_properties(package_id=1) + ``` """ path = path_package(package_id) / "datapackage.json" return check_is_file(path) @@ -41,6 +84,21 @@ def path_packages() -> Path: Raises: NotADirectoryError: If the packages folder doesn't exist. + + Examples: + ```{python} + import os + import tempfile + + import seedcase_sprout.core as sp + + # Create a temporary directory for the example + with tempfile.TemporaryDirectory() as temp_dir: + os.environ["SPROUT_GLOBAL"] = temp_dir + + # Get the path to the packages folder + sp.path_packages() + ``` """ path = path_sprout_global() / "packages" return create_dir(path) if not path.exists() else check_is_dir(path) diff --git a/seedcase_sprout/core/path_resource_functions.py b/seedcase_sprout/core/path_resource_functions.py index a3479bd25..85ca34dd4 100644 --- a/seedcase_sprout/core/path_resource_functions.py +++ b/seedcase_sprout/core/path_resource_functions.py @@ -1,3 +1,9 @@ +"""This module contains functions to get the paths to data resources and their files. + +They are intended to be used in conjunction with other functions to read, write, and +edit the contents and properties of resources. +""" + from pathlib import Path from seedcase_sprout.core import path_package @@ -15,6 +21,25 @@ def path_resource(package_id: int, resource_id: int) -> Path: Returns: The absolute path to the specified resource.
+ + Examples: + ```{python} + import os + import tempfile + + import seedcase_sprout.core as sp + + # Create a temporary directory for the example + with tempfile.TemporaryDirectory() as temp_dir: + os.environ["SPROUT_GLOBAL"] = temp_dir + + # Create a package and resource structure first + sp.create_package_structure(path=sp.path_packages()) + sp.create_resource_structure(path=sp.path_resources(package_id=1)) + + # Get the path to the resource + sp.path_resource(package_id=1, resource_id=1) + ``` """ path = path_resources(package_id) / str(resource_id) return check_is_resource_dir(path) @@ -29,6 +54,35 @@ def path_resource_data(package_id: int, resource_id: int) -> Path: Returns: The absolute path the specified resource's data file. + + Examples: + ```{python} + import os + import tempfile + + import seedcase_sprout.core as sp + + # Create a temporary directory for the example + with tempfile.TemporaryDirectory() as temp_dir: + os.environ["SPROUT_GLOBAL"] = temp_dir + + # Create a package and resource structure first + sp.create_package_structure(path=sp.path_packages()) + sp.create_resource_structure(path=sp.path_resources(package_id=1)) + + # TODO: Add data to resource + # sp.write_resource_data_to_raw( + # package_id=1, + # resource_id=1, + # data="path/to/data.csv") + + # sp.write_resource_parquet( + # raw_files=sp.path_resource_raw_files(package_id=1, resource_id=1), + # path=sp.path_resource_data(package_id=1, resource_id=1)) + + # Get the path to the resource data + # sp.path_resource_data(package_id=1, resource_id=1) + ``` """ path = path_resource(package_id, resource_id) / "data.parquet" return check_is_file(path) @@ -43,6 +97,25 @@ def path_resource_raw(package_id: int, resource_id: int) -> Path: Returns: The absolute path to the specified resource's raw folder. 
+ + Examples: + ```{python} + import os + import tempfile + + import seedcase_sprout.core as sp + + # Create a temporary directory for the example + with tempfile.TemporaryDirectory() as temp_dir: + os.environ["SPROUT_GLOBAL"] = temp_dir + + # Create a package and resource structure first + sp.create_package_structure(path=sp.path_packages()) + sp.create_resource_structure(path=sp.path_resources(package_id=1)) + + # Get the path to the resource's raw folder + sp.path_resource_raw(package_id=1, resource_id=1) + ``` """ path = path_resource(package_id, resource_id) / "raw" return check_is_dir(path) @@ -61,6 +134,30 @@ def path_resource_raw_files(package_id: int, resource_id: int) -> list[Path]: Raises: NotADirectoryError: If the package_id doesn't exist or the resource_id doesn't exist within the package. + + Examples: + ```{python} + import os + import tempfile + + import seedcase_sprout.core as sp + + # Create a temporary directory for the example + with tempfile.TemporaryDirectory() as temp_dir: + os.environ["SPROUT_GLOBAL"] = temp_dir + + # Create a package and resource structure first + sp.create_package_structure(path=sp.path_packages()) + sp.create_resource_structure(path=sp.path_resources(package_id=1)) + + # TODO: Add data/raw files to resource + # sp.write_resource_data_to_raw( + # path=sp.path_resource_raw(package_id=1, resource_id=1), + # data="path/to/data.csv") + + # Get the path to the resource's raw files + sp.path_resource_raw_files(package_id=1, resource_id=1) + ``` """ return list(path_resource_raw(package_id, resource_id).iterdir()) @@ -73,6 +170,24 @@ def path_resources(package_id: int) -> Path: Returns: The absolute path to the resources within the specified package. 
+ + Examples: + ```{python} + import os + import tempfile + + import seedcase_sprout.core as sp + + # Create a temporary directory for the example + with tempfile.TemporaryDirectory() as temp_dir: + os.environ["SPROUT_GLOBAL"] = temp_dir + + # Create a package structure first + sp.create_package_structure(path=sp.path_packages()) + + # Get the path to the package's resources folder + sp.path_resources(package_id=1) + ``` """ path = path_package(package_id) / "resources" check_is_dir(path) diff --git a/seedcase_sprout/core/path_sprout_global.py b/seedcase_sprout/core/path_sprout_global.py index 8020fe948..bfef2348f 100644 --- a/seedcase_sprout/core/path_sprout_global.py +++ b/seedcase_sprout/core/path_sprout_global.py @@ -9,5 +9,20 @@ def path_sprout_global() -> Path: Returns: The path to Sprout's global directory. + + Examples: + ```{python} + import tempfile + import os + + import seedcase_sprout.core as sp + + # Create a temporary directory for the example + with tempfile.TemporaryDirectory() as temp_dir: + os.environ["SPROUT_GLOBAL"] = temp_dir + + # Get the path to Sprout's global directory + sp.path_sprout_global() + ``` """ return get_sprout_global_envvar() or create_sprout_global_path() diff --git a/seedcase_sprout/core/properties.py b/seedcase_sprout/core/properties.py index b481a789a..794a0a762 100644 --- a/seedcase_sprout/core/properties.py +++ b/seedcase_sprout/core/properties.py @@ -1,3 +1,9 @@ +"""This module includes the dataclasses for the properties of a data package. + +The properties are based on the Frictionless Data Package specification. They are used +as input for creating and editing the properties of data packages and data resources and +are intended to help users with the correct structure and content of the properties. +""" # NOTE: This content is modified from the auto-generated # `generate_properties/generated_properties.py` file. Update the auto-generated # properties file to add more dataclasses and move them into this file.
@@ -13,22 +19,24 @@ class Properties(ABC): - """An abstract base class for all *Properties classes holding common logic.""" + """An abstract base class for all `*Properties` classes holding common logic.""" @classmethod @abstractmethod def default(cls: type[Self]) -> Self: - """Creates an instance with default values.""" + """Creates a dataclass `*Properties` object with default values.""" pass @property def compact_dict(self) -> dict: - """Converts the object to a dictionary, removing any keys with None values. + """Converts the dataclass `*Properties` object to a dictionary. - Applies recursively to nested `*Properties` objects. + Applies recursively to nested `*Properties` objects. Also removes any keys with + None values. Returns: - A dictionary representation of the object with only non-None values. + A dictionary representation of the `*Properties` object with only non-None + values. """ return asdict( obj=self, @@ -39,14 +47,14 @@ def compact_dict(self) -> dict: @classmethod def from_dict(cls: type[Self], data: dict) -> Self: - """Creates an instance populated with data from a dictionary. + """Creates a dataclass `*Properties` object filled with data from a dictionary. Args: - cls: The class to create an instance of. - data: The data to populate the instance with. + cls: The class to create the `*Properties` object from. + data: The data to fill the `*Properties` object with. Returns: - An instance of the class with the properties from the dictionary. + A `*Properties` object with the properties from the dictionary. """ return from_dict(data_class=cls, data=data) @@ -55,6 +63,10 @@ def from_dict(cls: type[Self], data: dict) -> Self: class ContributorProperties(Properties): """The people or organizations who contributed to this data package. + Creates a dataclass object with all the necessary properties for a + contributor. This would be given in the `contributors` field of a + `PackageProperties` object. 
+ Attributes: title (str | None): The name of the contributor. path (str | None): A fully qualified URL pointing to a relevant @@ -68,6 +80,13 @@ class ContributorProperties(Properties): contributor. roles (list[str] | None): An array of strings describing the roles of the contributor. + + Examples: + ```{python} + import seedcase_sprout.core as sp + print(sp.ContributorProperties()) + print(sp.ContributorProperties(title="Amir Smith")) + ``` """ title: str | None = None @@ -80,10 +99,10 @@ class ContributorProperties(Properties): @classmethod def default(cls: type[Self]) -> Self: - """Creates an instance with default values. + """Creates a `ContributorProperties` object with default values. Returns: - A ContributorProperties object with default values + A `ContributorProperties` object with default values. """ return cls( title="", @@ -100,11 +119,21 @@ def default(cls: type[Self]) -> Self: class LicenseProperties(Properties): """The license(s) under which the package or resource is provided. + Creates a dataclass object with all the necessary properties for a + license, so that it can be added to the `licenses` field of a + `PackageProperties` object. + Attributes: name (str | None): Must be an Open Definition license identifier, see http://licenses.opendefinition.org/ path (str | None): A fully qualified URL, or a POSIX file path. title (str | None): A human-readable title. + + Examples: + ```{python} + import seedcase_sprout.core as sp + print(sp.LicenseProperties()) + ``` """ name: str | None = None @@ -113,10 +142,10 @@ class LicenseProperties(Properties): @classmethod def default(cls: type[Self]) -> Self: - """Creates an instance with default values. + """Creates a `LicenseProperties` object with default values. Returns: - A LicenseProperties object with default values + A `LicenseProperties` object with default values. 
""" return cls(name="", path="", title="") @@ -139,10 +168,10 @@ class SourceProperties(Properties): @classmethod def default(cls: type[Self]) -> Self: - """Creates an instance with default values. + """Creates a `SourceProperties` dataclass with default values. Returns: - A SourceProperties object with default values + A `SourceProperties` object with default values. """ return cls(title="", path="", email="", version="") @@ -215,10 +244,10 @@ class TableDialectProperties(Properties): @classmethod def default(cls: type[Self]) -> Self: - """Creates an instance with default values. + """Creates a `TableDialectProperties` dataclass with default values. Returns: - A TableDialectProperties object with default values + A `TableDialectProperties` dataclass with default values. """ return cls( header=True, @@ -259,10 +288,10 @@ class ReferenceProperties(Properties): @classmethod def default(cls: type[Self]) -> Self: - """Creates an instance with default values. + """Creates a `ReferenceProperties` dataclass with default values. Returns: - A ReferenceProperties object with default values + A `ReferenceProperties` dataclass with default values. """ return cls(resource="", fields=[]) @@ -287,10 +316,10 @@ class TableSchemaForeignKeyProperties(Properties): @classmethod def default(cls: type[Self]) -> Self: - """Creates an instance with default values. + """Creates a `TableSchemaForeignKeyProperties` dataclass with default values. Returns: - A TableSchemaForeignKeyProperties object with default values + A `TableSchemaForeignKeyProperties` dataclass with default values. """ return cls(fields=[]) @@ -309,10 +338,10 @@ class MissingValueProperties(Properties): @classmethod def default(cls: type[Self]) -> Self: - """Creates an instance with default values. + """Creates a `MissingValueProperties` dataclass with default values. Returns: - A MissingValueProperties object with default values + A `MissingValueProperties` dataclass with default values. 
""" return cls(value="", label="") @@ -340,7 +369,11 @@ def default(cls: type[Self]) -> Self: @dataclass class ConstraintsProperties(Properties): - """A class that expresses constraints for validating field values. + """A dataclass that expresses constraints for validating field values. + + A constraint is a rule that dictates the given values, or range of values, + that a variable or column can have in a dataset. For instance, a constraint + for an "age" column could be that it must be greater than 0 but less than 120. Attributes: required (bool | None): Indicates whether a property must have a @@ -381,10 +414,10 @@ class ConstraintsProperties(Properties): @classmethod def default(cls: type[Self]) -> Self: - """Creates an instance with default values. + """Creates a `ConstraintsProperties` dataclass with default values. Returns: - A ConstraintsProperties object with default values + A `ConstraintsProperties` dataclass with default values. """ return cls( required=False, @@ -436,10 +469,10 @@ class FieldProperties(Properties): @classmethod def default(cls: "type[Self]") -> Self: - """Creates an instance with default values. + """Creates a `FieldProperties` dataclass with default values. Returns: - A FieldProperties object with default values + A `FieldProperties` dataclass with default values. """ return cls( name="", @@ -483,6 +516,12 @@ class TableSchemaProperties(Properties): table (resource). missing_values (list[str] | list[MissingValueProperties] | None): Values that, when encountered in the source, should be considered as not present. + + Examples: + ```{python} + import seedcase_sprout.core as sp + print(sp.TableSchemaProperties(primary_key="id")) + ``` """ fields: list[FieldProperties] | None = None @@ -494,10 +533,10 @@ class TableSchemaProperties(Properties): @classmethod def default(cls: type[Self]) -> Self: - """Creates an instance with default values. + """Creates a `TableSchemaProperties` dataclass with default values. 
Returns: - A TableSchemaProperties object with default values + A `TableSchemaProperties` dataclass with default values. """ return cls( fields=[], @@ -518,6 +557,10 @@ class ResourceProperties(Properties): the data it describes. A range of other properties can be declared to provide a richer set of metadata. + Creates a dataclass object with all the necessary properties for a resource, + which would be given in the `resources` field of a `PackageProperties` + dataclass. + Attributes: name (str | None): A simple name or identifier to be used for this resource. Should consist only of lowercase English alphanumeric characters plus @@ -542,6 +585,13 @@ class ResourceProperties(Properties): data. schema (TableSchemaProperties | None): A table schema for the resource data, compliant with the table schema specification. + + Examples: + ```{python} + import seedcase_sprout.core as sp + print(sp.ResourceProperties()) + print(sp.ResourceProperties(name="Blood samples", path="data.csv")) + ``` """ name: str | None = None @@ -561,10 +611,10 @@ class ResourceProperties(Properties): @classmethod def default(cls: "type[Self]") -> Self: - """Creates an instance with default values. + """Creates a `ResourceProperties` dataclass with default values. Returns: - A ResourceProperties object with default values + A `ResourceProperties` dataclass with default values. """ return cls( name="", @@ -584,12 +634,14 @@ def default(cls: "type[Self]") -> Self: @dataclass class PackageProperties(Properties): - """A data package. + """Properties for a data package. A simple container format for describing a coherent collection of data in a single "package". It provides the basis for convenient delivery, installation and management of datasets. + Creates a dataclass object with all the necessary properties for the package. + Attributes: name (str | None): A simple name or identifier to be used for this package.
Should consist only of lowercase English alphanumeric characters plus @@ -610,6 +662,26 @@ class PackageProperties(Properties): in this data package, each compliant with the data resource specification. sources (list[SourceProperties] | None): The raw sources for this data package. + + Examples: + ```{python} + import seedcase_sprout.core as sp + print(sp.PackageProperties()) + print(sp.PackageProperties(name="diabetes-cohort", title="Diabetes Cohort")) + print(sp.PackageProperties(licenses=[sp.LicenseProperties(name="ODC-BY-1.0")])) + + # To allow multiline strings, use dedent. + from textwrap import dedent + print(sp.PackageProperties( + title="Birds of North America", + description=dedent(''' + # Markdown header + + A dataset of bird sightings. With some **bolding**. + ''' + ) + )) + ``` """ name: str | None = None @@ -628,10 +700,10 @@ class PackageProperties(Properties): @classmethod def default(cls: type[Self]) -> Self: - """Creates an instance with default values. + """Creates a `PackageProperties` dataclass with default values. Returns: - A PackageProperties object with default values + A `PackageProperties` dataclass with default values. """ return cls( name="", diff --git a/seedcase_sprout/core/write_resource_properties.py b/seedcase_sprout/core/write_resource_properties.py index c40bb281a..eaf57c081 100644 --- a/seedcase_sprout/core/write_resource_properties.py +++ b/seedcase_sprout/core/write_resource_properties.py @@ -31,6 +31,43 @@ def write_resource_properties( NotPropertiesError: If the resource or package properties are not correct, i.e., they are incomplete or don't follow the Data Package specification. JSONDecodeError: If the `datapackage.json` file couldn't be read. 
+ + Examples: + ```{python} + import tempfile + from pathlib import Path + + import seedcase_sprout.core as sp + + # Create a temporary directory for the example + temp_dir = Path(tempfile.TemporaryDirectory().name) + temp_dir.mkdir() + + # Create package and resource structure first + sp.create_package_structure(path=temp_dir) + sp.create_resource_structure(path=temp_dir / "1" / "resources") + + # TODO: Write package properties that passes checks + # Write package properties + # sp.write_package_properties( + # path=temp_dir / "1" / "datapackage.json", + # package_properties=sp.PackageProperties( + # title="New Package Title", + # name="new-package-name", + # description="New Description", + # ), + + # Write resource properties + # sp.write_resource_properties( + # path=temp_dir / "1" / "datapackage.json", + # resource_properties=sp.ResourceProperties( + # name="new-resource-name", + # title="New resource name", + # description="This is a new resource", + # path="data.parquet", + # ), + # ) + ``` """ resource_properties = resource_properties.compact_dict check_is_file(path) diff --git a/tests/core/test_path_package_functions.py b/tests/core/test_path_package_functions.py index 00a2d4fe5..f35617e34 100644 --- a/tests/core/test_path_package_functions.py +++ b/tests/core/test_path_package_functions.py @@ -4,8 +4,8 @@ from seedcase_sprout.core import ( path_package, - path_package_properties, path_packages, + path_properties, ) from tests.core.directory_structure_setup import ( create_test_package_structure, @@ -26,7 +26,7 @@ def tmp_sprout_global(monkeypatch, tmp_path): "function, expected_path", [ (path_package, "packages/1"), - (path_package_properties, "packages/1/datapackage.json"), + (path_properties, "packages/1/datapackage.json"), ], ) def test_path_package_functions_return_expected_path( @@ -39,7 +39,7 @@ def test_path_package_functions_return_expected_path( @mark.parametrize( "function", - [path_package, path_package_properties], + [path_package, 
path_properties], ) def test_path_package_functions_raise_error_if_package_id_does_not_exist( tmp_sprout_global, function