diff --git a/.readthedocs.yaml b/.readthedocs.yaml index ee62bcb5e..c3cfbbe07 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -5,11 +5,11 @@ build: tools: python: "3.8" -sphinx: - configuration: docs/source/conf.py - python: install: - requirements: docs/requirements.txt +mkdocs: + fail_on_warning: false + formats: [pdf] \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 9138821e2..281d3b068 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,21 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - The `merlin monitor` command will now keep an allocation up if the queues are empty and workers are still processing tasks - Add the restart keyword to the specification docs
+### Changed
+- The entire documentation has been ported to MkDocs and re-organized
+    - *Dark Mode*
+    - New "Getting Started" example for a simple setup tutorial
+    - More detail on configuration instructions
+    - There's now a full page on installation instructions
+    - More detail on explaining the spec file
+    - More detail with the CLI page
+    - New "Running Studies" page to explain different ways to run studies, restart them, and accomplish command line substitution
+    - New "Interpreting Output" page to help users understand how the output workspace is generated in more detail
+    - New "Examples" page has been added
+    - Updated "FAQ" page to include more links to helpful locations throughout the documentation
+    - Set up a place to store API docs
+    - New "Contact" page with info on reaching Merlin devs
+
 ## [1.11.1] ### Fixed - Typo in `batch.py` that caused lsf launches to fail (`ALL_SGPUS` changed to `ALL_GPUS`) diff --git a/docs/Makefile b/docs/Makefile deleted file mode 100644 index 662696c6f..000000000 --- a/docs/Makefile +++ /dev/null @@ -1,31 +0,0 @@
-# Minimal makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line.
-SPHINXOPTS    =
-SPHINXBUILD   = sphinx-build
-SPHINXPROJ    = Merlin
-SOURCEDIR     = source
-BUILDDIR      = build
-
-# Put it first so that "make" without argument is like "make help".
-help:
-	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-
-.PHONY: help Makefile
-
-code-docs:
-	sphinx-apidoc -f -o source/ ../merlin/
-
-view: code-docs html
-	firefox -new-instance build/html/index.html
-
-# Catch-all target: route all unknown targets to Sphinx using the new
-# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
-%: Makefile
-	pip install -r requirements.txt
-	echo $(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-
-clean:
-	rm -rf build/
diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 000000000..cbddc54a4 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,44 @@
+# Guide to Merlin Documentation
+
+Merlin uses [MkDocs](https://www.mkdocs.org/) to generate its documentation and [Read the Docs](https://about.readthedocs.com/?ref=readthedocs.com) to host it. This README will detail important information on handling the documentation.
+
+## How to Build the Documentation
+
+Ensure you're at the root of the Merlin repository:
+
+```bash
+cd /path/to/merlin/
+```
+
+Install the documentation requirements with:
+
+```bash
+pip install -r docs/requirements.txt
+```
+
+Build and serve the documentation with:
+
+```bash
+mkdocs serve
+```
+
+Once up and running, MkDocs should provide a message telling you the address where the docs are being served (this is typically `http://127.0.0.1:8000/`).
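+If port `8000` is already taken on your machine, `mkdocs serve` can bind to a different address via its `-a`/`--dev-addr` flag, for example:
+
+```bash
+mkdocs serve -a localhost:8001
+```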
If you're using VSCode, you should be able to `ctrl+click` on the address to open the browser window. An example is shown below: + +```bash +(venv_name) [user@machine:merlin]$ mkdocs serve +INFO - Building documentation... +INFO - Cleaning site directory +WARNING - Excluding 'README.md' from the site because it conflicts with 'index.md'. +WARNING - A relative path to 'api_reference/' is included in the 'nav' configuration, which is not found in the documentation files. +INFO - Documentation built in 3.24 seconds +INFO - [09:16:00] Watching paths for changes: 'docs', 'mkdocs.yml' +INFO - [09:16:00] Serving on http://127.0.0.1:8000/ +``` + +## Configuring the Documentation + +MkDocs relies on an `mkdocs.yml` file for almost everything to do with configuration. See [their Configuration documentation](https://www.mkdocs.org/user-guide/configuration/) for more information. + +## How Do API Docs Work? + +Coming soon... diff --git a/docs/api_reference/index.md b/docs/api_reference/index.md new file mode 100644 index 000000000..6707457de --- /dev/null +++ b/docs/api_reference/index.md @@ -0,0 +1,5 @@ +# Merlin API Reference + +Coming soon! + + diff --git a/docs/assets/images/hello-samples-tree.png b/docs/assets/images/hello-samples-tree.png new file mode 100644 index 000000000..8893874bf Binary files /dev/null and b/docs/assets/images/hello-samples-tree.png differ diff --git a/docs/assets/images/interpreting_output/basic-merlin-info-workspace.png b/docs/assets/images/interpreting_output/basic-merlin-info-workspace.png new file mode 100644 index 000000000..e4e272533 Binary files /dev/null and b/docs/assets/images/interpreting_output/basic-merlin-info-workspace.png differ diff --git a/docs/assets/images/interpreting_output/basic-step-workspace.png b/docs/assets/images/interpreting_output/basic-step-workspace.png new file mode 100644 index 000000000..e3765dbd0 Binary files /dev/null and b/docs/assets/images/interpreting_output/basic-step-workspace.png differ diff --git a/docs/assets/images/interpreting_output/merlin-info-with-samples.png b/docs/assets/images/interpreting_output/merlin-info-with-samples.png new file mode 100644 index 000000000..eafdf2c27 Binary files /dev/null and b/docs/assets/images/interpreting_output/merlin-info-with-samples.png differ diff --git a/docs/assets/images/interpreting_output/modified-hierarchy-structure.png b/docs/assets/images/interpreting_output/modified-hierarchy-structure.png new file mode 100644 index 000000000..24d5002c6 Binary files /dev/null and b/docs/assets/images/interpreting_output/modified-hierarchy-structure.png differ diff --git a/docs/assets/images/interpreting_output/two-level-sample-hierarchy.png b/docs/assets/images/interpreting_output/two-level-sample-hierarchy.png new file mode 100644 index 000000000..7514aa97e Binary files /dev/null and b/docs/assets/images/interpreting_output/two-level-sample-hierarchy.png differ diff --git a/docs/assets/images/interpreting_output/workspace-with-params-and-samples.png b/docs/assets/images/interpreting_output/workspace-with-params-and-samples.png new file mode 100644 index 000000000..3925975b1 Binary files /dev/null and b/docs/assets/images/interpreting_output/workspace-with-params-and-samples.png differ diff --git a/docs/assets/images/interpreting_output/workspace-with-params.png b/docs/assets/images/interpreting_output/workspace-with-params.png new file mode 100644 index 000000000..9315c51f5 Binary files /dev/null and b/docs/assets/images/interpreting_output/workspace-with-params.png differ diff --git 
a/docs/assets/images/interpreting_output/workspace-with-samples.png b/docs/assets/images/interpreting_output/workspace-with-samples.png new file mode 100644 index 000000000..a204772d0 Binary files /dev/null and b/docs/assets/images/interpreting_output/workspace-with-samples.png differ diff --git a/docs/images/merlin_arch.png b/docs/assets/images/merlin_arch.png similarity index 100% rename from docs/images/merlin_arch.png rename to docs/assets/images/merlin_arch.png diff --git a/docs/images/merlin.png b/docs/assets/images/merlin_banner.png similarity index 100% rename from docs/images/merlin.png rename to docs/assets/images/merlin_banner.png diff --git a/docs/assets/images/merlin_banner_white.png b/docs/assets/images/merlin_banner_white.png new file mode 100644 index 000000000..91cc56d08 Binary files /dev/null and b/docs/assets/images/merlin_banner_white.png differ diff --git a/docs/images/merlin_icon.png b/docs/assets/images/merlin_icon.png similarity index 100% rename from docs/images/merlin_icon.png rename to docs/assets/images/merlin_icon.png diff --git a/docs/assets/images/running_studies/current-node-launch.png b/docs/assets/images/running_studies/current-node-launch.png new file mode 100644 index 000000000..47ffa1561 Binary files /dev/null and b/docs/assets/images/running_studies/current-node-launch.png differ diff --git a/docs/assets/images/running_studies/iterative-diagram.png b/docs/assets/images/running_studies/iterative-diagram.png new file mode 100644 index 000000000..cf763c906 Binary files /dev/null and b/docs/assets/images/running_studies/iterative-diagram.png differ diff --git a/docs/assets/images/running_studies/merlin-run-diagram.png b/docs/assets/images/running_studies/merlin-run-diagram.png new file mode 100644 index 000000000..b0f82e2a1 Binary files /dev/null and b/docs/assets/images/running_studies/merlin-run-diagram.png differ diff --git a/docs/assets/images/running_studies/parallel-launch.png b/docs/assets/images/running_studies/parallel-launch.png new file mode 100644 index 000000000..cb907cde6 Binary files /dev/null and b/docs/assets/images/running_studies/parallel-launch.png differ diff --git a/docs/assets/images/running_studies/producer-consumer-model.png b/docs/assets/images/running_studies/producer-consumer-model.png new file mode 100644 index 000000000..a1bda7c11 Binary files /dev/null and b/docs/assets/images/running_studies/producer-consumer-model.png differ diff --git a/docs/assets/images/running_studies/worker-server-communication.png b/docs/assets/images/running_studies/worker-server-communication.png new file mode 100644 index 000000000..86da1b99f Binary files /dev/null and b/docs/assets/images/running_studies/worker-server-communication.png differ diff --git a/docs/source/modules/advanced_topics/cumulative_results.png b/docs/assets/images/tutorial/advanced_topics/cumulative_results.png similarity index 100% rename from docs/source/modules/advanced_topics/cumulative_results.png rename to docs/assets/images/tutorial/advanced_topics/cumulative_results.png diff --git a/docs/source/modules/hello_world/dag1.png b/docs/assets/images/tutorial/hello_world/dag1.png similarity index 100% rename from docs/source/modules/hello_world/dag1.png rename to docs/assets/images/tutorial/hello_world/dag1.png diff --git a/docs/source/modules/hello_world/dag2.png b/docs/assets/images/tutorial/hello_world/dag2.png similarity index 100% rename from docs/source/modules/hello_world/dag2.png rename to docs/assets/images/tutorial/hello_world/dag2.png diff --git 
a/docs/source/modules/hello_world/dag3.png b/docs/assets/images/tutorial/hello_world/dag3.png similarity index 100% rename from docs/source/modules/hello_world/dag3.png rename to docs/assets/images/tutorial/hello_world/dag3.png diff --git a/docs/source/modules/hello_world/dag4.png b/docs/assets/images/tutorial/hello_world/dag4.png similarity index 100% rename from docs/source/modules/hello_world/dag4.png rename to docs/assets/images/tutorial/hello_world/dag4.png diff --git a/docs/source/modules/hello_world/merlin_output.png b/docs/assets/images/tutorial/hello_world/merlin_output.png similarity index 100% rename from docs/source/modules/hello_world/merlin_output.png rename to docs/assets/images/tutorial/hello_world/merlin_output.png diff --git a/docs/source/modules/hello_world/merlin_output2.png b/docs/assets/images/tutorial/hello_world/merlin_output2.png similarity index 100% rename from docs/source/modules/hello_world/merlin_output2.png rename to docs/assets/images/tutorial/hello_world/merlin_output2.png diff --git a/docs/images/central_coordination.png b/docs/assets/images/tutorial/introduction/central_coordination.png similarity index 100% rename from docs/images/central_coordination.png rename to docs/assets/images/tutorial/introduction/central_coordination.png diff --git a/docs/images/external_coordination.png b/docs/assets/images/tutorial/introduction/external_coordination.png similarity index 100% rename from docs/images/external_coordination.png rename to docs/assets/images/tutorial/introduction/external_coordination.png diff --git a/docs/images/internal_coordination.png b/docs/assets/images/tutorial/introduction/internal_coordination.png similarity index 100% rename from docs/images/internal_coordination.png rename to docs/assets/images/tutorial/introduction/internal_coordination.png diff --git a/docs/images/merlin_run.png b/docs/assets/images/tutorial/introduction/merlin_run.png similarity index 100% rename from docs/images/merlin_run.png rename to docs/assets/images/tutorial/introduction/merlin_run.png diff --git a/docs/images/task_creation_rate.png b/docs/assets/images/tutorial/introduction/task_creation_rate.png similarity index 100% rename from docs/images/task_creation_rate.png rename to docs/assets/images/tutorial/introduction/task_creation_rate.png diff --git a/docs/source/modules/run_simulation/lid-driven-stable.png b/docs/assets/images/tutorial/run_simulation/lid-driven-stable.png similarity index 100% rename from docs/source/modules/run_simulation/lid-driven-stable.png rename to docs/assets/images/tutorial/run_simulation/lid-driven-stable.png diff --git a/docs/source/modules/run_simulation/openfoam_dag.png b/docs/assets/images/tutorial/run_simulation/openfoam_dag.png similarity index 100% rename from docs/source/modules/run_simulation/openfoam_dag.png rename to docs/assets/images/tutorial/run_simulation/openfoam_dag.png diff --git a/docs/source/modules/run_simulation/openfoam_wf_output.png b/docs/assets/images/tutorial/run_simulation/openfoam_wf_output.png similarity index 100% rename from docs/source/modules/run_simulation/openfoam_wf_output.png rename to docs/assets/images/tutorial/run_simulation/openfoam_wf_output.png diff --git a/docs/source/modules/run_simulation/prediction.png b/docs/assets/images/tutorial/run_simulation/prediction.png similarity index 100% rename from docs/source/modules/run_simulation/prediction.png rename to docs/assets/images/tutorial/run_simulation/prediction.png diff --git a/docs/source/modules/run_simulation/setup.png 
b/docs/assets/images/tutorial/run_simulation/setup.png similarity index 100% rename from docs/source/modules/run_simulation/setup.png rename to docs/assets/images/tutorial/run_simulation/setup.png diff --git a/docs/assets/javascripts/swap_lp_image.js b/docs/assets/javascripts/swap_lp_image.js new file mode 100644 index 000000000..f809bb505 --- /dev/null +++ b/docs/assets/javascripts/swap_lp_image.js @@ -0,0 +1,26 @@ +/* + * This script is for swapping the landing page image between + * one with black text (for light mode) and one with white + * text (for dark mode). + */ + +function swapLandingPageImage() { + // Get the current theme (should be "slate" or "default") + const colorSchemeValue = document.body.getAttribute("data-md-color-scheme"); + + // Get the image element + const imageElement = document.getElementById("landing-page-image"); + + // Paths for light/dark mode images + const lightModeImgPath = "assets/images/merlin_banner.png"; + const darkModeImgPath = "assets/images/merlin_banner_white.png"; + + // Set the image source based on the color scheme + imageElement.src = colorSchemeValue == "slate" ? darkModeImgPath : lightModeImgPath; +} + +// Set up an observer to watch for theme changes +const observer = new MutationObserver(swapLandingPageImage); +const targetNode = document.body; +const config = {attributes: true, childList: true}; +observer.observe(targetNode, config); \ No newline at end of file diff --git a/docs/assets/stylesheets/extra.css b/docs/assets/stylesheets/extra.css new file mode 100644 index 000000000..0001866e9 --- /dev/null +++ b/docs/assets/stylesheets/extra.css @@ -0,0 +1,8 @@ +[data-md-color-scheme=slate] { + --md-default-bg-color: hsla(var(--md-hue),15%,22%,1); +} + +[data-md-color-primary=black] .md-button--primary { + background-color: hsla(202, 97%, 34%, 1); /* This is the same as #036BA9 (the hex color of Merlin's hat logo) */ + border-color: hsla(202, 97%, 34%, 1); +} diff --git a/docs/contact.md b/docs/contact.md new file mode 100644 index 000000000..bb543007f --- /dev/null +++ b/docs/contact.md @@ -0,0 +1,21 @@ +--- +hide: + - navigation +--- + +# Contact Us + +!!! question "Have a Question?" + + :material-email: [merlin@llnl.gov](mailto:merlin@llnl.gov) + + :simple-microsoftteams: LC Teams Channels: + + - [Merlin Support Page](https://teams.microsoft.com/l/team/19%3a8efc6658663c46aca4bfca4f11df7eb3%40thread.skype/conversations?groupId=7c7f88bc-2485-460f-90e2-4f6880b5ed3e&tenantId=a722dec9-ae4e-4ae3-9d75-fd66e2680a63) + + - [WEAVE's Merlin Support Page](https://teams.microsoft.com/l/channel/19%3aa677f3ebd1d9463b9128e9609a5b4177%40thread.tacv2/Merlin?groupId=a56ae26c-fe55-4e8e-8773-dd7975f648a4&tenantId=a722dec9-ae4e-4ae3-9d75-fd66e2680a63) + + +!!! note "Submit an Issue" + + :simple-github: [Report an Issue on GitHub](https://github.com/LLNL/merlin/issues) \ No newline at end of file diff --git a/docs/examples/feature_demo.md b/docs/examples/feature_demo.md new file mode 100644 index 000000000..0c2d5d2b1 --- /dev/null +++ b/docs/examples/feature_demo.md @@ -0,0 +1,3 @@ +# Feature Demo Examples + +Coming soon! \ No newline at end of file diff --git a/docs/examples/flux.md b/docs/examples/flux.md new file mode 100644 index 000000000..7cfff0dfa --- /dev/null +++ b/docs/examples/flux.md @@ -0,0 +1,3 @@ +# Flux Examples + +Coming soon! 
\ No newline at end of file diff --git a/docs/examples/hello.md b/docs/examples/hello.md new file mode 100644 index 000000000..d981e0b83 --- /dev/null +++ b/docs/examples/hello.md @@ -0,0 +1,734 @@
+# Hello World Examples
+
+On this page we'll cover everything related to the [Hello](#the-hello-example) and [Hello Samples](#the-hello-samples-example) examples. You can obtain all of the files related to these examples by running:
+
+=== "hello"
+    ```bash
+    merlin example hello
+    ```
+
+=== "hello samples"
+    ```bash
+    merlin example hello_samples
+    ```
+
+After running either of those commands, a folder called `hello/` should be created that contains all of the files related to these examples. The directory structure should look like:
+
+```bash
+  hello
+  |-- hello_samples.yaml
+  |-- hello.yaml
+  |-- make_samples.py
+  |-- my_hello.yaml
+  `-- requirements.txt
+```
+
+Here, `hello.yaml` is the spec file for a simple "hello world" workflow. Similarly, `hello_samples.yaml` also represents a "hello world" workflow, but it adds additional complexity by introducing samples. The last spec here, `my_hello.yaml`, is a skeleton workflow that is intended to be filled out by following [Step 3 of the Tutorial](../tutorial/3_hello_world.md).
+
+## The Hello Example
+
+The hello example is a simple workflow that runs an echo command twice in order to say hello in two different languages. We can break this example down by looking at each block in the spec file.
+
+### Spec Breakdown
+
+For this example, we'll only need three blocks for our spec file: `description`, `global.parameters`, and `study`. Let's dive into what each of these blocks does.
+
+#### Block: `description`
+
+The `description` block always contains the `name` and a `description` of your study. This block is required in Merlin, so you'll need it in every spec file you create.
+
+```yaml
+description:
+    name: hello
+    description: a very simple merlin workflow
+```
+
+#### Block: `global.parameters`
+
+!!! info
+
+    You can replace the `global.parameters` block by using the Parameter Generator (pgen) functionality. You can see the [Feature Demo](./feature_demo.md) for an instance of this or you can view [Maestro's Documentation on pgen](https://maestrowf.readthedocs.io/en/latest/Maestro/parameter_specification.html#parameter-generator-pgen) for more information on this feature.
+
+The `global.parameters` block is where we place all of the parameters we'd like to vary in a study. Here we have two parameters named `GREET` and `WORLD` that each contain two values and a label. It's important to note that each parameter must contain the same number of values.
+
+The `%%` in each label is a placeholder that's replaced with the parameter's value as it's processed. For instance, on the first iteration of any step using these parameters, the labels will be `GREET.hello` and `WORLD.world`. Then on the second iteration they'd change to `GREET.hola` and `WORLD.mundo`.
+
+```yaml
+global.parameters:
+    GREET:
+        values : ["hello","hola"]
+        label  : GREET.%%
+    WORLD:
+        values : ["world","mundo"]
+        label  : WORLD.%%
+```
+
+#### Block: `study`
+
+The final block to look at here is the `study` block, which contains the definitions of how to run each step in our workflow.
+
+Each step is denoted by a hyphen and must contain three keys: `name`, `description`, and `run`. The `name` and `description` keys always represent the name and description of the step.
The `run` key defines the command to run for the step, any steps it depends on, and the shell to use when running it.
+
+When Merlin reads in this block, it will generate a directed acyclic graph (DAG) by interpreting which steps depend on which other steps. This determines the order in which steps are run.
+
+```yaml
+study:
+    - name: step_1
+      description: say hello
+      run:
+          cmd: echo "$(GREET), $(WORLD)!"
+
+    - name: step_2
+      description: print a success message
+      run:
+          cmd: print("Hurrah, we did it!")
+          depends: [step_1_*]
+          shell: /usr/bin/env python3
+```
+
+In `step_1`, we provide a command to run that will echo a message based on the parameters we defined in the `global.parameters` block. Since there are two values in each parameter (i.e. two parameter sets), this step will be run twice in total.
+
+In `step_2`, we show that you don't have to use `/bin/bash` as your shell for a step. Here, we choose to use Python 3 as our shell instead. Notice we now have to use `print` rather than `echo` to achieve essentially the same functionality as in `step_1`.
+
+In `step_2` we also provide another key, `depends`, which tells Merlin not to run this step until the previous step(s) have finished. Notice how we add the `_*` to `step_1` here. What could that be doing? Well, if we omitted this and instead said `depends: [step_1]`, we'd be telling `step_2` to run for each parameter set, which in this case seems redundant. If you're more of a visual learner, the DAG generated by our step definitions would look like this:
+
+<figure markdown>
+  ![2-Step DAG without _*](../assets/images/tutorial/hello_world/dag2.png)
+  <figcaption>2-Step DAG without "_*"</figcaption>
+</figure>
+
+By adding on the `_*`, we're telling `step_2` to wait for both parameterized versions of `step_1` to complete before running. Our DAG now funnels so that `step_2` is only run once:
+
+<figure markdown>
+  ![2-Step DAG with _*](../assets/images/tutorial/hello_world/dag3.png)
+  <figcaption>2-Step DAG with "_*"</figcaption>
+</figure>
+ +#### The Full Spec + +By combining the three blocks discussed above, our full spec file becomes: + +???+ abstract "Full Hello Spec" + + ```yaml title="hello.yaml" + description: + name: hello + description: a very simple merlin workflow + + global.parameters: + GREET: + values : ["hello","hola"] + label : GREET.%% + WORLD: + values : ["world","mundo"] + label : WORLD.%% + + study: + - name: step_1 + description: say hello + run: + cmd: echo "$(GREET), $(WORLD)!" + + - name: step_2 + description: print a success message + run: + cmd: print("Hurrah, we did it!") + depends: [step_1_*] + shell: /usr/bin/env python3 + ``` + +### Running the Study + +When running a study we can either run it locally or in a distributed manner. + +#### Running Locally + +To run the study locally, we can use the command: + +```bash +merlin run --local hello.yaml +``` + +If everything ran properly, your output at the command line will look something like this: + +???+ success + + ``` + * + *~~~~~ + *~~*~~~* __ __ _ _ + / ~~~~~ | \/ | | (_) + ~~~~~ | \ / | ___ _ __| |_ _ __ + ~~~~~* | |\/| |/ _ \ '__| | | '_ \ + *~~~~~~~ | | | | __/ | | | | | | | + ~~~~~~~~~~ |_| |_|\___|_| |_|_|_| |_| + *~~~~~~~~~~~ + ~~~*~~~* Machine Learning for HPC Workflows + + + + [2023-12-19 17:41:02: INFO] Loading specification from path: /path/to/hello.yaml + [2023-12-19 17:41:02: WARNING] Workflow specification missing + encouraged 'merlin' section! Run 'merlin example' for examples. + Using default configuration with no sampling. + [2023-12-19 17:41:02: INFO] OUTPUT_PATH: hello + [2023-12-19 17:41:02: INFO] Study workspace is '/path/to/hello_20231219-174102'. + [2023-12-19 17:41:02: INFO] Reading app config from file /path/to/.merlin/app.yaml + [2023-12-19 17:41:02: INFO] Overriding default celery config with 'celery.override' in 'app.yaml': + visibility_timeout: 86400 + [2023-12-19 17:41:02: INFO] Calculating task groupings from DAG. + [2023-12-19 17:41:02: INFO] Converting graph to tasks. + [2023-12-19 17:41:02: INFO] Launching tasks. + WARNING:celery.backends.redis: + Setting ssl_cert_reqs=CERT_NONE when connecting to redis means that celery will not validate the identity of the redis broker when connecting. This leaves you vulnerable to man in the middle attacks. + + [2023-12-19 17:41:02: INFO] Executing step 'step_1_GREET.hello.WORLD.world' in '/path/to/hello_20231219-174102/step_1/GREET.hello.WORLD.world'... + [2023-12-19 17:41:02: INFO] Execution returned status OK. + [2023-12-19 17:41:02: INFO] Step 'step_1_GREET.hello.WORLD.world' in '/path/to/hello_20231219-174102/step_1/GREET.hello.WORLD.world' finished successfully. + [2023-12-19 17:41:02: INFO] Executing step 'step_1_GREET.hola.WORLD.mundo' in '/path/to/hello_20231219-174102/step_1/GREET.hola.WORLD.mundo'... + [2023-12-19 17:41:02: INFO] Execution returned status OK. + [2023-12-19 17:41:02: INFO] Step 'step_1_GREET.hola.WORLD.mundo' in '/path/to/hello_20231219-174102/step_1/GREET.hola.WORLD.mundo' finished successfully. + [2023-12-19 17:41:02: INFO] Executing step 'step_2' in '/path/to/hello_20231219-174102/step_2'... + [2023-12-19 17:41:02: INFO] Execution returned status OK. + [2023-12-19 17:41:02: INFO] Step 'step_2' in '/path/to/hello_20231219-174102/step_2' finished successfully. + ``` + +See the [Expected Output](#expected-output) section below for details on the file tree that was created by running this. + +#### Running in a Distributed Manner + +To run your workflow in a distributed manner, first make sure running `merlin info` doesn't show any errors. 
If it does, you'll need to set up your [Configuration](../user_guide/configuration/index.md).
+
+Once your configuration is set up properly, we'll need to accomplish two things to run our study: sending tasks to the external server and starting workers to manage and run those tasks. Let's send the tasks to the server first:
+
+```bash
+merlin run hello.yaml
+```
+
+This command will process the steps defined in the `study` block in order to create the DAG that determines the order in which steps need to run. Additionally, as mentioned before, it will queue tasks in the central server that our workers will pull from once we start them. If this ran properly, you should see output similar to this:
+
+???+ success
+
+    ```
+           *
+       *~~~~~
+      *~~*~~~*      __  __           _ _
+     /   ~~~~~     |  \/  |         | (_)
+         ~~~~~     | \  / | ___ _ __| |_ _ __
+        ~~~~~*     | |\/| |/ _ \ '__| | | '_ \
+       *~~~~~~~    | |  | |  __/ |  | | | | | |
+     ~~~~~~~~~~    |_|  |_|\___|_|  |_|_|_| |_|
+      *~~~~~~~~~~~
+       ~~~*~~~*    Machine Learning for HPC Workflows
+
+
+
+    [2023-12-19 17:41:02: INFO] Loading specification from path: /path/to/hello.yaml
+    [2023-12-19 17:41:02: WARNING] Workflow specification missing
+    encouraged 'merlin' section! Run 'merlin example' for examples.
+    Using default configuration with no sampling.
+    [2023-12-19 17:41:02: INFO] OUTPUT_PATH: hello
+    [2023-12-19 17:41:02: INFO] Study workspace is '/path/to/hello_20231219-174102'.
+    [2023-12-19 17:41:02: INFO] Reading app config from file /path/to/.merlin/app.yaml
+    [2023-12-19 17:41:02: INFO] Overriding default celery config with 'celery.override' in 'app.yaml':
+            visibility_timeout: 86400
+    [2023-12-19 17:41:02: INFO] Calculating task groupings from DAG.
+    [2023-12-19 17:41:02: INFO] Converting graph to tasks.
+    [2023-12-19 17:41:02: INFO] Launching tasks.
+    WARNING:celery.backends.redis:
+    Setting ssl_cert_reqs=CERT_NONE when connecting to redis means that celery will not validate the identity of the redis broker when connecting. This leaves you vulnerable to man in the middle attacks.
+
+    [2023-12-19 17:41:02: INFO] Executing step 'step_1_GREET.hello.WORLD.world' in '/path/to/hello_20231219-174102/step_1/GREET.hello.WORLD.world'...
+    [2023-12-19 17:41:02: INFO] Execution returned status OK.
+    [2023-12-19 17:41:02: INFO] Step 'step_1_GREET.hello.WORLD.world' in '/path/to/hello_20231219-174102/step_1/GREET.hello.WORLD.world' finished successfully.
+    [2023-12-19 17:41:02: INFO] Executing step 'step_1_GREET.hola.WORLD.mundo' in '/path/to/hello_20231219-174102/step_1/GREET.hola.WORLD.mundo'...
+    [2023-12-19 17:41:02: INFO] Execution returned status OK.
+    [2023-12-19 17:41:02: INFO] Step 'step_1_GREET.hola.WORLD.mundo' in '/path/to/hello_20231219-174102/step_1/GREET.hola.WORLD.mundo' finished successfully.
+    [2023-12-19 17:41:02: INFO] Executing step 'step_2' in '/path/to/hello_20231219-174102/step_2'...
+    [2023-12-19 17:41:02: INFO] Execution returned status OK.
+    [2023-12-19 17:41:02: INFO] Step 'step_2' in '/path/to/hello_20231219-174102/step_2' finished successfully.
+    ```
+
+See the [Expected Output](#expected-output) section below for details on the file tree that was created by running this.
+
+#### Running in a Distributed Manner
+
+To run your workflow in a distributed manner, first make sure running `merlin info` doesn't show any errors. If it does, you'll need to set up your [Configuration](../user_guide/configuration/index.md).
+
+Once your configuration is set up, let's send the tasks to the server:
+
+```bash
+merlin run hello.yaml
+```
+
+If this ran properly, you should see output similar to this:
+
+???+ success "Output From Sending Tasks to the Server"
+
+    ```
+           *
+       *~~~~~
+      *~~*~~~*      __  __           _ _
+     /   ~~~~~     |  \/  |         | (_)
+         ~~~~~     | \  / | ___ _ __| |_ _ __
+        ~~~~~*     | |\/| |/ _ \ '__| | | '_ \
+       *~~~~~~~    | |  | |  __/ |  | | | | | |
+     ~~~~~~~~~~    |_|  |_|\___|_|  |_|_|_| |_|
+      *~~~~~~~~~~~
+       ~~~*~~~*    Machine Learning for HPC Workflows
+
+
+
+    [2023-12-19 17:45:36: INFO] Loading specification from path: /path/to/hello.yaml
+    [2023-12-19 17:45:36: WARNING] Workflow specification missing
+    encouraged 'merlin' section! Run 'merlin example' for examples.
+    Using default configuration with no sampling.
+    [2023-12-19 17:45:36: INFO] OUTPUT_PATH: hello
+    [2023-12-19 17:45:36: INFO] Study workspace is '/path/to/hello_20231219-174536'.
+    [2023-12-19 17:45:36: INFO] Reading app config from file /path/to/.merlin/app.yaml
+    [2023-12-19 17:45:36: INFO] Overriding default celery config with 'celery.override' in 'app.yaml':
+            visibility_timeout: 86400
+    [2023-12-19 17:45:36: INFO] Calculating task groupings from DAG.
+    [2023-12-19 17:45:36: INFO] Converting graph to tasks.
+    [2023-12-19 17:45:36: INFO] Launching tasks.
+    WARNING:celery.backends.redis:
+    Setting ssl_cert_reqs=CERT_NONE when connecting to redis means that celery will not validate the identity of the redis broker when connecting. This leaves you vulnerable to man in the middle attacks.
+    ```
+
+Now that our tasks are queued, we need to start a worker to complete them. First, open a new terminal window. When we start the workers, the process will run continuously until we tell the workers to stop. In order to be able to run commands to interact with these workers, we'll need two terminal windows: one with the workers running and another for whatever else we need to do.
+
+Once we run the next command and the worker spins up, it will immediately start processing the tasks that we sent to the queue with `merlin run`. Let's start the workers now:
+
+```bash
+merlin run-workers hello.yaml
+```
+
+If this ran properly, you should see the following output plus additional log statements from Celery to show that tasks are being processed:
+
+???+ success "Output From Running Workers"
+
+    ```
+           *
+       *~~~~~
+      *~~*~~~*      __  __           _ _
+     /   ~~~~~     |  \/  |         | (_)
+         ~~~~~     | \  / | ___ _ __| |_ _ __
+        ~~~~~*     | |\/| |/ _ \ '__| | | '_ \
+       *~~~~~~~    | |  | |  __/ |  | | | | | |
+     ~~~~~~~~~~    |_|  |_|\___|_|  |_|_|_| |_|
+      *~~~~~~~~~~~
+       ~~~*~~~*    Machine Learning for HPC Workflows
+
+
+
+    [2023-12-19 17:46:46: INFO] Loading specification from path: /path/to/hello.yaml
+    [2023-12-19 17:46:46: WARNING] Workflow specification missing
+    encouraged 'merlin' section! Run 'merlin example' for examples.
+    Using default configuration with no sampling.
+ [2023-12-19 17:46:46: INFO] Launching workers from '/path/to/hello.yaml' + [2023-12-19 17:46:46: INFO] Starting workers + [2023-12-19 17:46:46: INFO] Reading app config from file /path/to/.merlin/app.yaml + + + -------------- celery@worker_name.%machine770 v5.3.4 (emerald-rush) + --- ***** ----- + -- ******* ---- Linux-4.18.0-513.9.1.1toss.t4.x86_64-x86_64-with-glibc2.28 2023-12-19 17:46:49 + - *** --- * --- + - ** ---------- [config] + - ** ---------- .> app: merlin:0x2aaab20619e8 + - ** ---------- .> transport: amqps://user:**@server:5671//user + - ** ---------- .> results: redis://user:**@server:6379/0 + - *** --- * --- .> concurrency: 36 (prefork) + -- ******* ---- .> task events: OFF (enable -E to monitor tasks in this worker) + --- ***** ----- + -------------- [queues] + .> [merlin]_merlin exchange=[merlin]_merlin(direct) key=[merlin]_merlin + + + [tasks] + . merlin.common.tasks.add_merlin_expanded_chain_to_chord + . merlin.common.tasks.expand_tasks_with_samples + . merlin.common.tasks.merlin_step + . merlin:chordfinisher + . merlin:queue_merlin_study + . merlin:shutdown_workers + + [2023-12-19 17:46:47,549: INFO] Connected to amqps://user:**@server:5671//user + [2023-12-19 17:46:47,599: INFO] mingle: searching for neighbors + [2023-12-19 17:46:48,807: INFO] mingle: sync with 2 nodes + [2023-12-19 17:46:48,807: INFO] mingle: sync complete + [2023-12-19 17:46:48,835: INFO] celery@worker_name.%machine770 ready. + ``` + +After all of the tasks in the workflow finish processing it will look like the celery workers are still up and running, and that's because they are. However, that doesn't mean that they're still processing tasks. To check if there are still tasks running, you can use the `status` command: + +```bash +merlin status hello.yaml +``` + +If there are no tasks in the queues then our workers are done processing this study and are ready to be stopped: + +```bash +merlin stop-workers +``` + +!!! note + + This is always necessary unless you'd like to keep your celery workers alive and listening for tasks constantly. + +### Expected Output + +Running a study will always produce an output directory containing the commands that were run in each step and the outputs produced. To view the entire output directory structure you can use: + +```bash +tree hello_ +``` + +The result should look like so: + +!!! success "Successful Directory Structure" + + ```bash + hello_/ + |-- merlin_info + | |-- hello.expanded.yaml + | |-- hello.orig.yaml + | `-- hello.partial.yaml + |-- step_1 + | |-- GREET.hello.WORLD.world + | | |-- MERLIN_FINISHED + | | |-- step_1_GREET.hello.WORLD.world.err + | | |-- step_1_GREET.hello.WORLD.world.out + | | `-- step_1_GREET.hello.WORLD.world.sh + | `-- GREET.hola.WORLD.mundo + | |-- MERLIN_FINISHED + | |-- step_1_GREET.hola.WORLD.mundo.err + | |-- step_1_GREET.hola.WORLD.mundo.out + | `-- step_1_GREET.hola.WORLD.mundo.sh + `-- step_2 + |-- MERLIN_FINISHED + |-- step_2.err + |-- step_2.out + `-- step_2.sh + ``` + +The `merlin_info` directory contains three different forms of the spec file we provided: + +1. `hello.expanded.yaml`: The entire spec with all the variables expanded +2. `hello.orig.yaml`: An exact copy of the spec we ran +3. `hello.partial.yaml`: A copy of the original spec plus all of the default values for each block that Merlin will fill in for you if omitted. + +In the `step_1` directory notice that a subdirectory is created for each parameter set. 
Within each parameter set, and similarly for `step_2`, you'll find four files:
+
+- A `MERLIN_FINISHED` file, which is created when the step completes. If this file isn't there, then either your step hasn't finished or there was a problem running this step, and you should check the `.err` file.
+- A `.sh` file containing the command that was run for this step.
+- A `.out` file containing the stdout generated from the command.
+- A `.err` file containing any errors generated from the command (hopefully none). This is one of the most useful places to look for debugging.
+
+Let's check the outputs from our study:
+
+```bash
+cat hello_/step_1/*/*.out hello_/step_2/*.out
+```
+
+If everything ran properly, the output should look like so:
+
+!!! success
+
+    ```bash
+    hello, world!
+    hola, mundo!
+    Hurrah, we did it!
+    ```
+
+Congratulations, you've successfully run a Merlin study! The next section will cover how to add samples to this example.
+
+## The Hello Samples Example
+
+!!! note
+
+    Before running this example, make sure to install the requirements with:
+
+    ```bash
+    pip install -r requirements.txt
+    ```
+
+The `hello samples` example is very similar to the `hello` example discussed above, but it has one key difference: the use of samples. While parameters are static, samples are generated dynamically and can be more complex data types.
+
+For this example, instead of echoing variations of "hello world", this time we'll say hello to a couple friends! We'll generate our friends' names (our samples) using the `make_samples.py` script.
+
+### The Sample Generation Script
+
+The `make_samples.py` script will be used in this example to generate a list of sample names and write them to a CSV file for use in our workflow. This section will serve as a section-by-section breakdown of what this script entails.
+
+```py title="make_samples.py" linenums="1"
+import argparse
+
+import names
+import numpy as np
+```
+
+This first section just handles the imports of the libraries that we'll need:
+
+- The `argparse` library will handle how we pass arguments to this script.
+
+- The `names` library will be what we use to generate names.
+
+- The `numpy` library will be used to randomize our selection of generated names. We'll alias this library as `np`, which is standard practice.
+
+```py linenums="7"
+# argument parsing
+parser = argparse.ArgumentParser(description="Make some samples (names of people).")
+parser.add_argument("--number", type=int, action="store", help="the number of samples you want to make")
+parser.add_argument("--filepath", type=str, help="output file")
+args = parser.parse_args()
+```
+
+The second section is where we utilize the `argparse` library to set up our arguments. The arguments to this script will be `--number` to represent the number of names to generate and `--filepath` to set the name of the file to store these samples in. This makes the usage of this script:
+
+```bash
+python make_samples.py [-h] [--number NUMBER] [--filepath FILEPATH]
+```
+
+```py linenums="13"
+# sample making
+all_names = np.loadtxt(names.FILES["first:female"], dtype=str, usecols=0)
+selected_names = np.random.choice(all_names, size=args.number)
+```
+
+The third section is where we use the `names` and `numpy` libraries to generate our sample names and randomly select them.
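+!!! note
+
+    `np.random.choice` samples *with* replacement by default, so the same name can occasionally be selected more than once. If you'd rather have unique names, one small tweak (a suggestion on our part, not part of the shipped script) is to pass `replace=False`:
+
+    ```py
+    # Draw without replacement; requires args.number <= len(all_names)
+    selected_names = np.random.choice(all_names, size=args.number, replace=False)
+    ```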
+
+```py linenums="17"
+result = ""
+name_list = list(selected_names)
+result = "\n".join(name_list)
+
+with open(args.filepath, "w") as f:
+    f.write(result)
+```
+
+The final section is where we concatenate the names that we've selected into a string with one name per line. We then take this string and write it to our file.
+
+### Spec Breakdown
+
+There are three big differences between this spec and the spec used for the hello example:
+
+1. Switching the `WORLD` variable from a parameter to a column label
+2. The addition of the `env` block
+3. The addition of the `merlin` block
+
+We need to remove the `WORLD` parameter from the `global.parameters` block here since we're going to make it a sample column label instead. Since our samples here are randomly generated names, the value of `WORLD` will go from being "world" or "mundo" to a name.
+
+Removing the `WORLD` parameter makes our `global.parameters` block become:
+
+```yaml
+global.parameters:
+    GREET:
+        values : ["hello","hola"]
+        label  : GREET.%%
+```
+
+Now let's take a look at the two new blocks that we're introducing: `env` and `merlin`.
+
+#### Block: `env`
+
+The `env` block is where we define values that will be substituted into the workflow. In the case of this example, we define one variable, `N_SAMPLES`, to represent the number of samples to generate.
+
+```yaml
+env:
+    variables:
+        N_SAMPLES: 3
+```
+
+#### Block: `merlin`
+
+The `merlin` block is where we can define sample generation and celery worker specifics. When samples and parameters are used together, every sample that we generate will be run for each parameter set. In this example we'll just use the default worker, so we can ignore the celery worker specifics and just focus on sample generation:
+
+```yaml
+merlin:
+    samples:
+        generate:
+            cmd: python3 $(SPECROOT)/make_samples.py --filepath=$(MERLIN_INFO)/samples.csv --number=$(N_SAMPLES)
+        file: $(MERLIN_INFO)/samples.csv
+        column_labels: [WORLD]
+```
+
+In the command to generate the samples, we run our Python script and provide it with an output filepath to store the samples we generate. Additionally, we hand it the number of samples to generate. We then use the `file` key to tell Merlin where the generated samples will exist and the `column_labels` key to label the samples we just created.
+
+!!! note
+
+    For simplicity we left the column label as `WORLD`, just like in [The Hello Example](#the-hello-example). Best practice would likely be to rename this `NAME` and modify the `$(WORLD)` statement in `step_1` to be `$(NAME)`, but we'll keep it as `WORLD` here.
+
+The `SPECROOT` and `MERLIN_INFO` variables are reserved variables in Merlin. To see more about them, consult the [Reserved Variables](../user_guide/variables.md#reserved-variables) section of the user guide.
+
+#### The DAG and Full Spec
+
+The introduction of samples makes the DAG generated by this workflow slightly more complex than the `hello` example's. Every sample that's generated in Merlin will run for each parameter set. So, since we have one parameter `GREET` with two values `hello` and `hola` (two parameter sets), and three sample names, we'll get six different runs of `step_1`:
+
+<figure markdown>
+  ![DAG With Samples](../assets/images/tutorial/hello_world/dag4.png)
+  <figcaption>DAG With Samples</figcaption>
+</figure>
+
+With the modifications to the `global.parameters` block and the additions of the `env` and `merlin` blocks, the full `hello_samples.yaml` spec becomes:
+
+???+ abstract "Full Hello Samples Spec"
+
+    ```yaml title="hello_samples.yaml"
+    description:
+        name: hello_samples
+        description: a very simple merlin workflow, with samples
+
+    env:
+        variables:
+            N_SAMPLES: 3
+
+    global.parameters:
+        GREET:
+            values : ["hello","hola"]
+            label  : GREET.%%
+
+    study:
+        - name: step_1
+          description: say hello
+          run:
+              cmd: echo "$(GREET), $(WORLD)!"
+
+        - name: step_2
+          description: print a success message
+          run:
+              cmd: print("Hurrah, we did it!")
+              depends: [step_1_*]
+              shell: /usr/bin/env python3
+
+    merlin:
+        samples:
+            generate:
+                cmd: python3 $(SPECROOT)/make_samples.py --filepath=$(MERLIN_INFO)/samples.csv --number=$(N_SAMPLES)
+            file: $(MERLIN_INFO)/samples.csv
+            column_labels: [WORLD]
+    ```
+
+### Running the Study
+
+As with the `hello` example (and any Merlin workflow), this study can be run locally. However, here we'll be running in a distributed manner, as that's the most common way to run Merlin workflows. Just as with the `hello` example, we'll need to queue up our tasks on the server and start up workers to manage these tasks.
+
+To queue up the tasks:
+
+```bash
+merlin run hello_samples.yaml
+```
+
+If this ran successfully, you should see the following output:
+
+???+ success "Output From Sending Tasks to the Server"
+
+    ```
+           *
+       *~~~~~
+      *~~*~~~*      __  __           _ _
+     /   ~~~~~     |  \/  |         | (_)
+         ~~~~~     | \  / | ___ _ __| |_ _ __
+        ~~~~~*     | |\/| |/ _ \ '__| | | '_ \
+       *~~~~~~~    | |  | |  __/ |  | | | | | |
+     ~~~~~~~~~~    |_|  |_|\___|_|  |_|_|_| |_|
+      *~~~~~~~~~~~
+       ~~~*~~~*    Machine Learning for HPC Workflows
+
+
+
+    [2023-12-07 09:26:35: INFO] Loading specification from path: /hello/hello_samples.yaml
+    [2023-12-07 09:26:35: INFO] OUTPUT_PATH: hello
+    [2023-12-07 09:26:35: INFO] Study workspace is '/hello/hello_samples_'.
+    [2023-12-07 09:26:35: INFO] Reading app config from file /app.yaml
+    [2023-12-07 09:26:35: INFO] Overriding default celery config with 'celery.override' in 'app.yaml':
+            visibility_timeout: 86400
+    [2023-12-07 09:26:36: INFO] Generating samples...
+    [2023-12-07 09:26:36: INFO] Generating samples complete!
+    [2023-12-07 09:26:36: INFO] Loading samples from 'samples.csv'...
+    [2023-12-07 09:26:36: INFO] 3 samples loaded.
+    [2023-12-07 09:26:36: INFO] Calculating task groupings from DAG.
+    [2023-12-07 09:26:36: INFO] Converting graph to tasks.
+    [2023-12-07 09:26:36: INFO] Launching tasks.
+    WARNING:celery.backends.redis:
+    Setting ssl_cert_reqs=CERT_NONE when connecting to redis means that celery will not validate the identity of the redis broker when connecting. This leaves you vulnerable to man in the middle attacks.
+    ```
+
+Now, just like with the `hello` example, we'll want to make sure a second terminal window is open for when we start our workers. Once you have a second window open, we can start the workers with:
+
+```bash
+merlin run-workers hello_samples.yaml
+```
+
+The workers will immediately start processing the tasks that we sent to the server with the `merlin run` command.
+
+Once the workers are up and running, you should see output similar to the following, with some additional messages from Celery about task execution:
+
+???+ success "Output From Running Workers"
+
+    ```
+           *
+       *~~~~~
+      *~~*~~~*      __  __           _ _
+     /   ~~~~~     |  \/  |         | (_)
+         ~~~~~     | \  / | ___ _ __| |_ _ __
+        ~~~~~*     | |\/| |/ _ \ '__| | | '_ \
+       *~~~~~~~    | |  | |  __/ |  | | | | | |
+     ~~~~~~~~~~    |_|  |_|\___|_|  |_|_|_| |_|
+      *~~~~~~~~~~~
+       ~~~*~~~*    Machine Learning for HPC Workflows
+
+
+
+    [2023-12-07 10:21:30: INFO] Loading specification from path: /hello/hello_samples.yaml
+    [2023-12-07 10:21:30: INFO] Launching workers from '/hello/hello_samples.yaml'
+    [2023-12-07 10:21:30: INFO] Starting workers
+    [2023-12-07 10:21:30: INFO] Reading app config from file /app.yaml
+    [2023-12-07 10:21:33,446: WARNING] Setting ssl_cert_reqs=CERT_NONE when connecting to redis means that celery will not validate the identity of the redis broker when connecting. This leaves you vulnerable to man in the middle attacks.
+
+
+     -------------- celery@default_worker.%ruby965 v5.3.4 (emerald-rush)
+    --- ***** -----
+    -- ******* ---- Linux-4.18.0-477.27.1.1toss.t4.x86_64-x86_64-with-glibc2.28 2023-12-07 10:21:33
+    - *** --- * ---
+    - ** ---------- [config]
+    - ** ---------- .> app:         merlin:0x15553ee391e0
+    - ** ---------- .> transport:   amqps://:**@:/
+    - ** ---------- .> results:     rediss://:**@:/
+    - *** --- * --- .> concurrency: 56 (prefork)
+    -- ******* ---- .> task events: OFF (enable -E to monitor tasks in this worker)
+    --- ***** -----
+     -------------- [queues]
+                    .> [merlin]_merlin  exchange=[merlin]_merlin(direct) key=[merlin]_merlin
+
+
+    [tasks]
+      . merlin.common.tasks.add_merlin_expanded_chain_to_chord
+      . merlin.common.tasks.expand_tasks_with_samples
+      . merlin.common.tasks.merlin_step
+      . merlin:chordfinisher
+      . merlin:queue_merlin_study
+      . merlin:shutdown_workers
+    ```
+
+After all of the tasks in the workflow finish processing, it will look like the celery workers are still up and running, and that's because they are. However, that doesn't mean that they're still processing tasks. To check if there are still tasks running, you can use the `status` command:
+
+```bash
+merlin status hello_samples.yaml
+```
+
+If there are no tasks in the queues, then our workers are done processing this study and are ready to be stopped:
+
+```bash
+merlin stop-workers
+```
+
+!!! note
+
+    This is always necessary unless you'd like to keep your celery workers alive and listening for tasks constantly.
+
+### Expected Output
+
+Since we introduced samples in this example, the output directory structure will contain slightly more files than the `hello` example. The differences will be shown in `merlin_info/` and `step_1/`; `step_2` was not modified and therefore will have the same behavior as in the `hello` example.
+
+Let's start by investigating the differences in the `merlin_info/` directory:
+
+```bash
+tree hello_samples_/merlin_info/
+```
+
+If everything ran properly, you should see an output directory structure like so:
+
+!!! success "Successful `merlin_info/` Directory Structure"
+
+    ```bash
+    hello_samples_/merlin_info/
+    ├── cmd.err
+    ├── cmd.out
+    ├── cmd.sh
+    ├── hello_samples.expanded.yaml
+    ├── hello_samples.orig.yaml
+    ├── hello_samples.partial.yaml
+    └── samples.csv
+    ```
+
+As in the `hello` example from before, we have the three forms of our spec file in the `merlin_info/` directory. However, now we have four new files:
+
+1. `cmd.sh`: This is the file containing the bash commands necessary for generating our samples.
The contents of this file come directly from the `samples` section of the `merlin` block in our spec file.
+2. `cmd.out`: This file will contain any stdout generated by the `cmd.sh` script.
+3. `cmd.err`: This file will contain any stderr generated by the `cmd.sh` script (hopefully none). If your workflow had trouble generating samples, then this should be the first place you look for more information on why an error occurred.
+4. `samples.csv`: This is the file containing the samples that we generated. We provided this file name in the `samples` section of the `merlin` block in our spec file.
+
+Now let's take a look at the `step_1` directory. If the study ran successfully, the directory should look like so:
+
+<figure markdown>
+  ![Successful Step 1 Directory Structure](../assets/images/tutorial/hello_world/merlin_output2.png)
+  <figcaption>Successful Step 1 Directory Structure</figcaption>
+</figure>
+
+Numerically named directories like `00/`, `01/`, and `02/` are sample directories. Notice that each sample ran for both of our parameter sets. In other words, the parameter set where `GREET=hello` and the parameter set where `GREET=hola` both ran all three samples that we generated.
+
+Finally, let's check the outputs of our study:
+
+```bash
+cat hello_samples_/step_1/*/*/*.out hello_samples_/step_2/*.out
+```
+
+The outputs should look similar to this (the names presented here will vary):
+
+!!! success
+
+    ```bash
+    hello, TILDA!
+    hello, EUN!
+    hello, LOVE!
+    hola, TILDA!
+    hola, EUN!
+    hola, LOVE!
+    Hurrah, we did it!
+    ```
+
+Congratulations, you've successfully run a Merlin study using samples! See the other examples for more features that Merlin offers.
diff --git a/docs/examples/hpc.md b/docs/examples/hpc.md new file mode 100644 index 000000000..4b8cd069b --- /dev/null +++ b/docs/examples/hpc.md @@ -0,0 +1,3 @@
+# HPC Examples
+
+Coming soon!
\ No newline at end of file
diff --git a/docs/examples/index.md b/docs/examples/index.md new file mode 100644 index 000000000..00d4bf237 --- /dev/null +++ b/docs/examples/index.md @@ -0,0 +1,48 @@
+# Examples
+
+The Merlin package provides a few example workflows. These may be useful in seeing how the software works, and in designing your own workflow. This section provides documentation on running these Merlin workflow examples.
+
+## Overview
+
+There are countless ways to write workflows using Merlin, but it helps to have some examples as a starting point. This is made even easier with the `merlin example` command. If you provide this command the name of a built-in Merlin example, it will download a folder with everything needed to run that example.
+
+To see a list of all the built-in Merlin examples, run:
+
+```bash
+merlin example list
+```
+
+Each example will contain at least one `.yaml` file. These are known as Merlin specifications, and are foundational to defining a workflow.
+
+The Merlin team is working on adding a more diverse array of example workflows like these.
+
+
+
+
diff --git a/docs/examples/iterative.md b/docs/examples/iterative.md new file mode 100644 index 000000000..7421e453e --- /dev/null +++ b/docs/examples/iterative.md @@ -0,0 +1,3 @@
+# Iterative Examples
+
+Coming soon!
\ No newline at end of file
diff --git a/docs/examples/lsf.md b/docs/examples/lsf.md new file mode 100644 index 000000000..5b3d55edb --- /dev/null +++ b/docs/examples/lsf.md @@ -0,0 +1,3 @@
+# LSF Examples
+
+Coming soon!
\ No newline at end of file
diff --git a/docs/examples/restart.md b/docs/examples/restart.md new file mode 100644 index 000000000..7cafc9f68 --- /dev/null +++ b/docs/examples/restart.md @@ -0,0 +1,3 @@
+# Restart Examples
+
+Coming soon!
\ No newline at end of file
diff --git a/docs/examples/slurm.md b/docs/examples/slurm.md new file mode 100644 index 000000000..402453be0 --- /dev/null +++ b/docs/examples/slurm.md @@ -0,0 +1,3 @@
+# Slurm Examples
+
+Coming soon!
\ No newline at end of file
diff --git a/docs/faq.md b/docs/faq.md new file mode 100644 index 000000000..0b0397a45 --- /dev/null +++ b/docs/faq.md @@ -0,0 +1,453 @@
+---
+hide:
+  - navigation
+---
+
+# Frequently Asked Questions
+
+## General
+
+### What is Merlin?
+
+Merlin is a distributed task queue system designed to facilitate the large-scale execution of HPC ensembles, like those needed to build machine learning models of complex simulations.
+
+Read more on what Merlin is and how Merlin works in the [User Guide](./user_guide/index.md).
+
+### Where can I get help with Merlin?
+
+In addition to this [documentation](./index.md), you can reach the Merlin developers by email, on Microsoft Teams, or via GitHub. See the [Contact](./contact.md) page for more information.
+
+### Where can I learn more about Merlin?
+
+Check out the rest of the [Merlin Documentation](./index.md) and [our paper on arXiv](https://arxiv.org/abs/1912.02892).
+
+## Setup & Installation
+
+### How can I build Merlin?
+
+Merlin can be installed via [pip in a Python virtual environment](./user_guide/installation.md#installing-with-virtual-environments-pip) or via [Spack](./user_guide/installation.md#installing-with-spack).
+
+See the [Installation](./user_guide/installation.md) page for full installation instructions.
+
+### Do I have to build Merlin?
+
+If you're at LLNL and want to run on LC, you can use the WEAVE team's common environment to run Merlin. For more information, see [WEAVE's Common Environment Docs](https://lc.llnl.gov/weave/environment.html).
+
+### What are the setup instructions at LLNL?
+
+See *[Do I have to build Merlin?](#do-i-have-to-build-merlin)* above or visit the [Installation](./user_guide/installation.md) and [Configuration](./user_guide/configuration/index.md) pages of the [User Guide](./user_guide/index.md).
+
+### How do I reconfigure for different servers?
+
+The server configuration is set in `~/.merlin/app.yaml`. Details can be found on the [Configuration](./user_guide/configuration/index.md) page, specifically at [Configuring the Broker and Results Backend](./user_guide/configuration/index.md#configuring-the-broker-and-results-backend).
+
+## Component Technology
+
+### What underlying libraries does Merlin use?
+
+- Celery: see *[What is Celery?](#what-is-celery)* below for more information
+- Maestro: see *[What is Maestro?](#what-is-maestro)* below for more information
+
+### What security features are in Merlin?
+
+Merlin encrypts network traffic of step results, meaning that all results are encrypted with a unique user-based key, which is auto-generated and placed in the `~/.merlin/` directory. This allows multiple users to share a results database. This is important since some backends, like Redis, do not allow for multiple distinct users.
+
+### What is Celery?
+
+Celery is an asynchronous task/job queue based on distributed message passing. It is focused on real-time operation, but supports scheduling as well. See [Celery's GitHub page](https://github.com/celery/celery) and [Celery's documentation](http://www.celeryproject.org/) for more details.
+
+### What is Maestro?
+
+Maestro is a tool and library for specifying and conducting general workflows. See [Maestro's GitHub page](https://github.com/LLNL/maestrowf) and [Maestro's documentation](https://maestrowf.readthedocs.io/en/latest/index.html) for more details.
+
+## Designing and Building Workflows
+
+Most of these questions can be answered by reading through the docs on [The Specification File](./user_guide/specification.md) and/or by running through the [Examples](./examples/index.md) that Merlin provides.
+
+### Where are some example workflows?
+
+You can see all of Merlin's built-in example workflows with:
+
+```bash
+merlin example list
+```
+
+See the docs on these [Examples](./examples/index.md) for more information.
+
+### How do I launch a workflow?
+
+To launch a workflow locally, use the [`merlin run`](./user_guide/command_line.md#run-merlin-run) command:
+
+```bash
+merlin run --local <spec>
+```
+
+To launch a distributed workflow, first visit the [Configuration](./user_guide/configuration/index.md) page and configure your broker and results servers. Then use both the [`merlin run`](./user_guide/command_line.md#run-merlin-run) and the [`merlin run-workers`](./user_guide/command_line.md#run-workers-merlin-run-workers) commands in any order you choose:
+
+=== "Run the Workers"
+
+    ```bash
+    merlin run-workers <spec>
+    ```
+
+=== "Launch the Tasks"
+
+    ```bash
+    merlin run <spec>
+    ```
+
+### How do I describe workflows in Merlin?
+
+A Merlin workflow is described with [The Specification File](./user_guide/specification.md).
+
+### What is a DAG?
+
+DAG is an acronym for 'directed acyclic graph'. This is the way your workflow steps are represented as tasks.
+
+### What if my workflow can't be described by a DAG?
+
+There are certain workflows that cannot be explicitly defined by a single DAG; however, in our experience, many can. Furthermore, those workflows that cannot usually still employ DAG sub-components. You can probably gain much of the functionality you want by combining a DAG with control logic return features (like step restart and additional calls to [`merlin run`](./user_guide/command_line.md#run-merlin-run)).
+
+### How do I implement workflow looping/iteration?
+
+**Single Step Looping:**
+
+Combining `exit $(MERLIN_RETRY)` with `max_retries` can allow you to loop a single step.
+
+**Entire Workflow Looping/Iteration:**
+
+Entire workflow looping/iteration can be accomplished by finishing off your DAG with a final step that makes another call to [`merlin run`](./user_guide/command_line.md#run-merlin-run). See the [Iterative Demo](./examples/iterative.md) for a detailed example of an iterative workflow:
+
+```bash
+merlin example iterative_demo
+```
+
+### Can steps be restarted?
+
+Yes. To build this into a workflow, use `exit $(MERLIN_RETRY)` within a step to retry a failed `cmd` section. The max number of retries in a given step can be specified with the `max_retries` field.
+
+Alternatively, use `exit $(MERLIN_RESTART)` to run the optional `.run.restart` section.
+
+To delay a retry or restart directive, add the `retry_delay` field to the step.
+
+!!! note
+
+    `retry_delay` only works in server mode (i.e. not `--local` mode).
+
+To restart failed steps after a workflow is done running, see *[How do I re-run failed steps in a workflow?](#how-do-i-re-run-failed-steps-in-a-workflow)*
+
+### How do I put a time delay in before a restart or retry?
+
+Add the `retry_delay` field to the step. This specifies how many seconds to wait before the task is run again after a restart. Set this value large enough for your problem to finish.
+
+See the [restart_delay example](./examples/restart.md) for syntax:
+
+```bash
+merlin example restart_delay
+```
+
+!!! note
+
+    `retry_delay` only works in server mode (i.e. not `--local` mode).
+
+### I have a long-running batch task that needs to restart, what should I do?
+
+Before your allocation ends, use `$(MERLIN_RESTART)` or `$(MERLIN_RETRY)`, but with a `retry_delay` on your step that's longer than the time your allocation has left. The server will hold onto the step for that long (in seconds) before releasing it, allowing your batch allocation to end without the worker grabbing the step right away.
+
+!!! example
+
+**Entire Workflow Looping/Iteration:**
+
+Entire workflow looping/iteration can be accomplished by finishing off your DAG with a final step that makes another call to [`merlin run`](./user_guide/command_line.md#run-merlin-run). See the [Iterative Demo](./examples/iterative.md) for a detailed example of an iterative workflow:
+
+```bash
+merlin example iterative_demo
+```
+
+### Can steps be restarted?
+
+Yes. To build this into a workflow, use `exit $(MERLIN_RETRY)` within a step to retry a failed `cmd` section. The max number of retries in a given step can be specified with the `max_retries` field.
+
+Alternatively, use `exit $(MERLIN_RESTART)` to run the optional `<step_name>.run.restart` section.
+
+To delay a retry or restart directive, add the `retry_delay` field to the step.
+
+!!! note
+
+    `retry_delay` only works in server mode (i.e. not `--local` mode).
+
+To restart failed steps after a workflow is done running, see *[How do I re-run failed steps in a workflow?](#how-do-i-re-run-failed-steps-in-a-workflow)*
+
+### How do I put a time delay in before a restart or retry?
+
+Add the `retry_delay` field to the step. This specifies how many seconds to wait before the task is run again after a restart. Set this value large enough for your problem to finish.
+
+See the [restart_delay example](./examples/restart.md) for syntax:
+
+```bash
+merlin example restart_delay
+```
+
+!!! note
+
+    `retry_delay` only works in server mode (i.e. not `--local` mode).
+
+### I have a long running batch task that needs to restart, what should I do?
+
+Before your allocation ends, use `$(MERLIN_RESTART)` or `$(MERLIN_RETRY)`, but with a `retry_delay` on your step that's longer than the time your allocation has left. The server will hold onto the step for that long (in seconds) before releasing it, allowing your batch allocation to end without a worker grabbing the step right away.
+
+!!! example
+
+    Here's an example study step that terminates a long running task 1 minute before the allocation ends, then tells Merlin to wait a full 2 minutes before retrying this step. The allocation will then end prior to the step being restarted.
+
+    ```yaml
+    study:
+        - name: batch_task
+          description: A long running task that needs to restart
+          run:
+            cmd: |
+              # Run my code, but end 60 seconds before my allocation
+              my_code --end_early 60s
+              if [ -e restart_needed_flag ]; then
+                exit $(MERLIN_RESTART)
+              fi
+            retry_delay: 120  # wait at least 2 minutes before restarting
+    ```
+
+### How do I mark a step failure?
+
+Each step is ultimately designated as:
+
+- a success `$(MERLIN_SUCCESS)` -- writes a `MERLIN_FINISHED` file to the step's workspace directory
+- a soft failure `$(MERLIN_SOFT_FAIL)` -- allows the workflow to continue
+- a hard failure `$(MERLIN_HARD_FAIL)` -- stops the whole workflow by shutting down all workers on that step
+
+Normally this happens behind the scenes, so you don't need to worry about it. To hard-code this into your step logic, use a shell command such as `exit $(MERLIN_HARD_FAIL)`.
+
+!!! note
+
+    The `$(MERLIN_HARD_FAIL)` exit code will shut down all workers connected to the queue associated with the failed step. To shut down *all* workers, use the `$(MERLIN_STOP_WORKERS)` exit code.
+
+To rerun all failed steps in a workflow, see *[How do I re-run failed steps in a workflow?](#how-do-i-re-run-failed-steps-in-a-workflow)* If you really want a previously successful step to be re-run, you can manually remove the `MERLIN_FINISHED` file prior to running [`merlin restart`](./user_guide/command_line.md#restart-merlin-restart).
+
+### What fields can be added to steps?
+
+Steps have a `name`, `description`, and `run` field, as shown below.
+
+```yaml
+study:
+    - name:
+      description:
+      run:
+        cmd:
+```
+
+Also under `run`, the following fields are optional:
+
+```yaml
+run:
+    depends:
+    task_queue:
+    shell:
+    max_retries:
+    retry_delay:
+    nodes:
+    procs:
+```
+
+For more details on the options that can be used for steps, see [The Specification File](./user_guide/specification.md).
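+
+For instance, a step that uses several of these optional fields might look like the following sketch (the queue name and script are placeholders):
+
+```yaml
+study:
+    - name: post_process
+      description: Post-process simulation results on 1 node with 4 procs
+      run:
+        cmd: $(LAUNCHER) python3 post_process.py  # hypothetical script
+        depends: [run_sim_*]
+        task_queue: post_queue
+        shell: /bin/bash
+        max_retries: 3
+        retry_delay: 60
+        nodes: 1
+        procs: 4
+```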
+
+### How do I specify the language used in a step?
+
+You can add the field `shell` under the `run` portion of your step to change the language you write your step in. The default is `/bin/bash`, but you can use things like `/usr/bin/env python` as well.
+
+Use the [feature_demo example](./examples/feature_demo.md) to see an example of this:
+
+```bash
+merlin example feature_demo
+```
+
+## Running Workflows
+
+Workflows can be run with the [`merlin run`](./user_guide/command_line.md#run-merlin-run) and the [`merlin run-workers`](./user_guide/command_line.md#run-workers-merlin-run-workers) commands:
+
+=== "Send Tasks to the Broker"
+
+    ```bash
+    merlin run <spec_file>
+    ```
+
+=== "Start Workers to Execute the Tasks"
+
+    ```bash
+    merlin run-workers <spec_file>
+    ```
+
+See the docs on all available [Merlin Commands](./user_guide/command_line.md) for more info on what Merlin is capable of.
+
+### How do I set up a workspace without executing step scripts?
+
+Use [Merlin's Dry Run](./user_guide/command_line.md#dry-run) capability:
+
+=== "Locally"
+
+    ```bash
+    merlin run --local --dry <spec_file>
+    ```
+
+=== "Distributed"
+
+    ```bash
+    merlin run --dry <spec_file> ; merlin run-workers <spec_file>
+    ```
+
+### How do I start workers?
+
+Ensure you have a stable connection to your broker and results servers with the [`merlin info`](./user_guide/command_line.md#info-merlin-info) command:
+
+```bash
+merlin info
+```
+
+This should show an "OK" message next to both servers. If instead you see "ERROR" next to either server, visit the [Configuring the Broker and Results Backend](./user_guide/configuration/index.md#configuring-the-broker-and-results-backend) documentation to get your servers set up properly.
+
+Once a connection to your servers is established, use the [`merlin run-workers`](./user_guide/command_line.md#run-workers-merlin-run-workers) command to start your workers:
+
+```bash
+merlin run-workers <spec_file>
+```
+
+### How do I see what workers are connected?
+
+You can query which workers are active with the [`merlin query-workers`](./user_guide/command_line.md#query-workers-merlin-query-workers) command:
+
+```bash
+merlin query-workers
+```
+
+This command gives you fine control over which workers you're looking for via a regex on their name, the queue names associated with workers, or even by providing the name of a spec file where workers are defined, as in the sketches below.
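+
+The flag names in this sketch (`--workers`, `--queues`, `--spec`) reflect the filtering described above, but treat them as assumptions and check `merlin query-workers --help` for the exact options on your version:
+
+```bash
+# Only show workers whose names match a regex
+merlin query-workers --workers ".*demo_worker.*"
+
+# Only show workers watching specific task queues
+merlin query-workers --queues hello_queue
+
+# Only show workers defined in a given spec file
+merlin query-workers --spec hello_samples.yaml
+```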
+
+### How do I stop workers?
+
+**Interactively:**
+
+Outside of a workflow (e.g. at the command line), you can stop workers with the [`merlin stop-workers`](./user_guide/command_line.md#stop-workers-merlin-stop-workers) command:
+
+```bash
+merlin stop-workers
+```
+
+This gives you fine control over which kinds of workers to stop, for instance via a regex on their name, or the queue names you'd like to stop.
+
+**Within a step:**
+
+From within a step, you can exit with the `$(MERLIN_STOP_WORKERS)` code, which will issue a time-delayed call to stop all of the workers, or with the `$(MERLIN_HARD_FAIL)` directive, which will stop all workers connected to the current step. This helps prevent the *suicide race condition* where a worker could kill itself before removing the step from the workflow, causing the command to be left there for the next worker and creating a really bad loop.
+
+You can of course call `merlin stop-workers` from within a step, but be careful to make sure the worker executing it won't be stopped too.
+
+### How do I re-run failed steps in a workflow?
+
+Workflows can be restarted with the [`merlin restart`](./user_guide/command_line.md#restart-merlin-restart) command:
+
+```bash
+merlin restart <workspace_dir>
+```
+
+This will only re-run steps in your workflow that do not contain a `MERLIN_FINISHED` file in their output directory.
+
+### What tasks are in my queue?
+
+Tasks are created by Merlin as it processes the [DAG](#what-is-a-dag) that is defined by the steps in your workflow. For each step, Merlin will create multiple [Celery tasks](https://docs.celeryq.dev/en/stable/userguide/tasks.html) in order to accomplish what you've defined. These tasks are what are sent to the queue on your broker and eventually executed by the workers that you spin up.
+
+### How do I purge tasks?
+
+!!! warning
+
+    Tasks that are currently being executed by workers will *not* be purged with the `merlin purge` command.
+
+    It's best to use the [`merlin stop-workers`](./user_guide/command_line.md#stop-workers-merlin-stop-workers) command prior to running `merlin purge`.
+
+To remove tasks from your queue, you can use the [`merlin purge`](./user_guide/command_line.md#purge-merlin-purge) command:
+
+```bash
+merlin purge <spec_file>
+```
+
+### Why is stuff still running after I purge?
+
+You probably have workers executing tasks. Purging removes tasks from the server queue, but any currently running or reserved tasks are being held by the workers. You need to shut down these workers first with the [`merlin stop-workers`](./user_guide/command_line.md#stop-workers-merlin-stop-workers) command:
+
+```bash
+merlin stop-workers
+```
+
+...and then run the [`merlin purge`](./user_guide/command_line.md#purge-merlin-purge) command:
+
+```bash
+merlin purge <spec_file>
+```
+
+### Why am I running old tasks?
+
+You might have old tasks in your queues. Try [purging your queues](#how-do-i-purge-tasks).
+
+You might also have rogue workers. Try [checking which workers are connected](#how-do-i-see-what-workers-are-connected).
+
+### Where do tasks get run?
+
+When you [spin up workers](#how-do-i-start-workers) with Merlin, these worker processes will live on the node(s) in your allocation. When these workers pull tasks from your broker's queue, the tasks will be executed in the worker processes. Therefore, the tasks are run on the node(s) in your allocation.
+
+### Can I run different steps from my workflow on different machines?
+
+Yes. Under the [`merlin` block](./user_guide/specification.md#the-merlin-block) you can specify which machines your workers are allowed on. In order for this to work, you must then use [`merlin run-workers`](./user_guide/command_line.md#run-workers-merlin-run-workers) separately on each of the specified machines.
+
+```yaml
+merlin:
+    resources:
+        workers:
+            worker_name:
+                machines: [hostA, hostB, hostC]
+```
+
+### What is Slurm?
+
+A job scheduler. See the [Slurm documentation](https://slurm.schedmd.com/documentation.html) for more info.
+
+### What is LSF?
+
+A job scheduler. See [IBM's LSF documentation](https://www.ibm.com/support/knowledgecenter/en/SSWRJV_10.1.0/lsf_welcome/lsf_welcome.html) for more info.
+
+### What is Flux?
+
+Flux is a hierarchical scheduler and launcher for parallel simulations. It allows the user to specify the same launch command that will work on different HPC clusters with different default schedulers, such as [Slurm](#what-is-slurm) or [LSF](#what-is-lsf). Merlin versions earlier than 1.9.2 used the cluster's native (non-Flux) scheduler to launch a Flux instance. Subsequent Merlin versions can launch the Merlin workers using a native Flux scheduler.
+
+More information can be found at the [Flux web page](http://flux-framework.org/docs/home/).
+
+Older versions of Flux may need the `--mpi=none` argument if Flux is launched on a system using the Slurm scheduler. This argument can be added to the `launch_args` variable in the [`batch` block](./user_guide/specification.md#the-batch-block) of your spec file.
+
+!!! example
+
+    ```yaml
+    batch:
+        type: flux
+        launch_args: --mpi=none
+    ```
+
+### What is PBS?
+
+!!! note
+
+    The PBS functionality is only available to launch a [Flux scheduler](#what-is-flux).
+
+A job scheduler. See [Portable Batch System](https://en.wikipedia.org/wiki/Portable_Batch_System) for more info.
+
+### How do I use Flux on LC?
+
+The `--mpibind=off` option is currently required when using Flux with a Slurm launcher on LC toss3 systems. Set this in the [`batch` block](./user_guide/specification.md#the-batch-block) of your spec as shown in the example below.
+
+!!! example
+
+    ```yaml
+    batch:
+        type: flux
+        launch_args: --mpibind=off
+    ```
+
+### What is `LAUNCHER`?
+
+`LAUNCHER` is a reserved variable that may be used in a step command. It serves as an abstraction to launch a job with parallel schedulers like [Slurm](#what-is-slurm), [LSF](#what-is-lsf), and [Flux](#what-is-flux).
+
+See [The `LAUNCHER` and `VLAUNCHER` Variables](./user_guide/variables.md#the-launcher-and-vlauncher-variables) section for more information.
+
+### How do I use `LAUNCHER`?
+
+Instead of this:
+
+```yaml
+run:
+    cmd: srun -N 1 -n 3 python script.py
+```
+
+Do something like this:
+
+```yaml
+batch:
+    type: slurm
+
+run:
+    cmd: $(LAUNCHER) python script.py
+    nodes: 1
+    procs: 3
+```
+
+See [The `LAUNCHER` and `VLAUNCHER` Variables](./user_guide/variables.md#the-launcher-and-vlauncher-variables) and the [Scheduler Specific Properties](./user_guide/specification.md#scheduler-specific-properties) sections for more information.
+
+### What is `level_max_dirs`?
+
+`level_max_dirs` is an optional field that goes under the `merlin.samples` section of a yaml spec. It caps the number of sample directories that can be generated at a single level of a study's sample hierarchy. This is useful for getting around filesystem constraints when working with massive amounts of data.
+
+Defaults to 25.
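+
+For example, to cap each level of the sample hierarchy at 100 directories (a sketch; the sample-generation details here are placeholders):
+
+```yaml
+merlin:
+    samples:
+        generate:
+            cmd: python3 $(SPECROOT)/make_samples.py --filepath=$(MERLIN_INFO)/samples.csv
+        file: $(MERLIN_INFO)/samples.csv
+        column_labels: [WORLD]
+        level_max_dirs: 100
+```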
+
+### What is `pgen`?
+
+`pgen` stands for "parameter generator". It's a way to override the parameters in the [`global.parameters` block](./user_guide/specification.md#the-globalparameters-block) of the spec, instead generating them programmatically with a Python script. Merlin offers the same pgen functionality as [Maestro](https://maestrowf.readthedocs.io/en/latest/).
+
+See [Maestro's pgen guide](https://maestrowf.readthedocs.io/en/latest/Maestro/parameter_specification.html#parameter-generator-pgen) for details on using `pgen`. It's a Maestro doc, but the exact same flags can be used in conjunction with [`merlin run`](./user_guide/command_line.md#run-merlin-run).
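+
+As a sketch of what that looks like, a pgen script defines a `get_custom_generator` function that returns a Maestro `ParameterGenerator`. The file name and parameter values below are made up for illustration; see Maestro's pgen guide for the authoritative interface:
+
+```python
+# my_pgen.py -- a hypothetical parameter generator
+from maestrowf.datastructures.core import ParameterGenerator
+
+
+def get_custom_generator(env, **kwargs):
+    p_gen = ParameterGenerator()
+    # Programmatically build the values that would otherwise live in
+    # the global.parameters block of the spec
+    p_gen.add_parameter("GREET", ["hello", "hola", "bonjour"], "GREET.%%")
+    return p_gen
+```
+
+You would then point `merlin run` at it:
+
+```bash
+merlin run <spec_file> --pgen my_pgen.py
+```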
diff --git a/docs/gen_ref_pages.py b/docs/gen_ref_pages.py
new file mode 100644
index 000000000..e46c0eb93
--- /dev/null
+++ b/docs/gen_ref_pages.py
@@ -0,0 +1,40 @@
+"""Generate the code reference pages."""
+
+from pathlib import Path
+
+import mkdocs_gen_files
+
+nav = mkdocs_gen_files.Nav()
+
+# print(sorted(Path("merlin").rglob("*.py")))
+
+for path in sorted(Path("merlin").rglob("*.py")):
+    if "merlin/examples" in str(path):
+        continue
+    module_path = path.relative_to("merlin").with_suffix("")
+    doc_path = path.relative_to("merlin").with_suffix(".md")
+    full_doc_path = Path("api_reference", doc_path)
+
+    parts = list(module_path.parts)
+
+    if parts[-1] == "__init__":  #
+        parts = parts[:-1]
+        doc_path = doc_path.with_name("index.md")
+        full_doc_path = full_doc_path.with_name("index.md")
+        if len(parts) == 0:
+            continue
+    elif parts[-1] == "__main__":
+        continue
+
+    nav[parts] = doc_path.as_posix()
+
+    with mkdocs_gen_files.open(full_doc_path, "w") as fd:
+        identifier = ".".join(parts)
+        print("::: " + identifier, file=fd)
+
+    mkdocs_gen_files.set_edit_path(full_doc_path, path)
+
+
+# NOTE: SUMMARY.md has to be the name of the nav file
+with mkdocs_gen_files.open("api_reference/SUMMARY.md", "w") as nav_file:
+    nav_file.writelines(nav.build_literate_nav())
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 000000000..46466494f
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,226 @@
+---
+hide:
+  - navigation
+---
+
+![Merlin Banner](./assets/images/merlin_banner_white.png){ width="75%" id="landing-page-image" }
+
+----
+
+# Merlin
+
+Empower your projects with Merlin, a cloud-based workflow manager designed to facilitate scalable and reproducible workflows, particularly suited for running many simulations and iterative procedures.
+
+[On GitHub :fontawesome-brands-github:](https://github.com/LLNL/merlin){: .md-button .md-button--primary }
+
+----
+
+## Why Merlin?
+
+Workflows, applications, and machines are becoming more complex, but subject matter experts need to devote time and attention to their applications and often require fine command-line level control. Furthermore, they rarely have the time to devote to learning workflow systems.
+
+With the expansion of data-driven computing, the HPC scientist needs to be able to run more simulations through complex multi-component workflows.
+
+**Merlin targets HPC workflows that require many simulations**.[^1]
+
+## Goals and Motivations
+
+Merlin was created with the intention of providing flexible and reproducible workflows to users at a scale that could be much larger than [Maestro](https://maestrowf.readthedocs.io/en/latest/). Since Merlin is built as an extension of Maestro, we wanted to maintain [Maestro's Goals and Motivations](https://maestrowf.readthedocs.io/en/latest/#maestros-goals-and-motivations) while at the same time providing users the ability to become their own big-data generator.
+
+In the pursuit of flexible and reproducible workflows, Merlin places a paramount emphasis on workflow provenance. We recognize the importance of understanding how workflows evolve, ensuring that every decision, parameter adjustment, and execution is meticulously documented. Workflow provenance is not just a feature for us; it's a fundamental element that contributes to the reliability and trustworthiness of your studies.
+
+Merlin understands the dynamic nature of your work, especially when dealing with large-scale simulations.
+Our goal is to provide a platform that seamlessly scales to accommodate the computational demands of extensive simulations, ensuring that your workflows remain efficient and effective, even in the face of substantial computational requirements.
+
+## Getting Started
+
+### Install Merlin
+
+Merlin can be [installed](./user_guide/installation.md) via pip in your own virtual environment.
+
+1. First, create a virtual environment:
+
+    ```bash
+    python -m venv merlin_venv
+    ```
+
+2. Now activate the virtual environment:
+
+    === "bash"
+
+        ```bash
+        source merlin_venv/bin/activate
+        ```
+
+    === "csh"
+
+        ```csh
+        source merlin_venv/bin/activate.csh
+        ```
+
+3. Finally, install Merlin with pip:
+
+    ```bash
+    pip install merlin
+    ```
+
+### Create a Containerized Server
+
+First, let's create a folder to store our server files and our examples, and move into it:
+
+```bash
+mkdir merlin_examples ; cd merlin_examples/
+```
+
+Now let's set up a [containerized server](./user_guide/configuration/merlin_server.md) that Merlin can connect to.
+
+1. Initialize the server files:
+
+    ```bash
+    merlin server init
+    ```
+
+2. Start the server:
+
+    ```bash
+    merlin server start
+    ```
+
+3. Copy the `app.yaml` configuration file from `merlin_server/` to your current directory:
+
+    ```bash
+    cp merlin_server/app.yaml .
+    ```
+
+4. Check that your server connection is working properly:
+
+    ```bash
+    merlin info
+    ```
+
+    Your broker and results server should both show an `OK` status:
+
+    !!! success
+
+        ```bash
+        .
+        .
+        .
+        Checking server connections:
+        ----------------------------
+        broker server connection: OK
+        results server connection: OK
+        .
+        .
+        .
+        ```
+
+### Run an Example Workflow
+
+Let's download Merlin's built-in ["Hello, World!" example](./examples/hello.md):
+
+```bash
+merlin example hello
+```
+
+Now that we've downloaded the example, enter the `hello/` directory:
+
+```bash
+cd hello/
+```
+
+In this directory there are files named `hello.yaml` and `hello_samples.yaml`. These are what are known as Merlin [specification (spec) files](./user_guide/specification.md). The `hello.yaml` spec is a very basic example that will also work with [Maestro](https://maestrowf.readthedocs.io/en/latest/). We'll focus on `hello_samples.yaml` here, as it has more Merlin-specific features:
+
+```yaml
+description: # (1)
+    name: hello_samples
+    description: a very simple merlin workflow, with samples
+
+env:
+    variables: # (2)
+        N_SAMPLES: 3
+
+global.parameters:
+    GREET: # (3)
+        values : ["hello","hola"]
+        label : GREET.%%
+
+study:
+    - name: step_1
+      description: say hello
+      run: # (4)
+          cmd: |
+              echo "$(GREET), $(WORLD)!"
+
+    - name: step_2
+      description: print a success message
+      run: # (5)
+          cmd: print("Hurrah, we did it!")
+          depends: [step_1_*] # (6)
+          shell: /usr/bin/env python3
+
+merlin:
+    resources:
+        workers: # (7)
+            demo_worker:
+                args: -l INFO --concurrency=1
+                steps: [all]
+    samples: # (8)
+        generate:
+            cmd: python3 $(SPECROOT)/make_samples.py --filepath=$(MERLIN_INFO)/samples.csv --number=$(N_SAMPLES)
+        file: $(MERLIN_INFO)/samples.csv
+        column_labels: [WORLD]
+```
+
+1. Mandatory name and description fields to encourage well-documented workflows
+2. Define single-valued variable tokens for use in your workflow steps
+3. Define parameter tokens of the form `$(NAME)` and lists of values to use in your steps, such that Merlin can parameterize them for you
+4. Here, `cmd` is a multiline string written in bash to harness the robust existing ecosystem of tools users are already familiar with
+5. Here, `cmd` is a single-line string written in Python. Merlin allows users to modify the `shell` that `cmd` uses to execute a step
+6. Specify step dependencies using steps' `name` values to control execution order
+7. Define custom workers to process your workflow in the most efficient manner
+8. Generate samples to be used throughout your workflow. These can be used similarly to parameters; use the [`$(SAMPLE_NAME)` syntax](./user_guide/variables.md#token-syntax) (as can be seen in `step_1`); a sketch of the sample-generating script follows below
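+
+The `samples` section above calls a `make_samples.py` script that ships with the example. As a rough sketch of what such a script does (this is an illustration, not the exact file), it writes `--number` sample values into the CSV at `--filepath`:
+
+```python
+import argparse
+import random
+
+# Hypothetical stand-in for the example's make_samples.py: write
+# --number random "world" names into the CSV that Merlin reads from.
+WORLDS = ["world", "mundo", "monde", "welt", "mondo"]
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--filepath", required=True)
+parser.add_argument("--number", type=int, default=3)
+args = parser.parse_args()
+
+with open(args.filepath, "w") as csv_file:
+    for _ in range(args.number):
+        csv_file.write(f"{random.choice(WORLDS)}\n")
+```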
+
+We have two ways to run the `hello_samples.yaml` example:
+
+=== "In a Distributed Manner"
+
+    Send tasks to the broker:
+
+    ```bash
+    merlin run hello_samples.yaml
+    ```
+
+    Start the workers to execute the tasks:
+
+    ```bash
+    merlin run-workers hello_samples.yaml
+    ```
+
+=== "Locally"
+
+    Execute the tasks locally without needing to communicate with the containerized server we just established:
+
+    ```bash
+    merlin run --local hello_samples.yaml
+    ```
+
+Running the workflow will first convert your steps into a task execution graph and then create a workspace directory with the results of running your study.
+
+The directed acyclic graph (DAG) that's created for the `hello_samples.yaml` example will look like so:
+
+![DAG for hello_samples](./assets/images/tutorial/hello_world/dag4.png)
+
+If run successfully, a workspace for your run should've been created with the name `hello_samples_<timestamp>/`. Below shows the expected contents of this workspace:
+
+!!! success "Contents of `hello_samples_<timestamp>`"
+
+    ![Contents of hello_samples Workspace](./assets/images/hello-samples-tree.png)
+
+## Release
+
+Merlin is released under an MIT license. For more information, please see the [LICENSE](https://github.com/LLNL/merlin/blob/develop/LICENSE).
+
+``LLNL-CODE-797170``
+
+[^1]: See [*Enabling Machine Learning-Ready HPC Ensembles with Merlin*](https://arxiv.org/abs/1912.02892) for a paper that mentions a study with up to 40 million simulations.
\ No newline at end of file
diff --git a/docs/make.bat b/docs/make.bat
deleted file mode 100644
index 3479181c8..000000000
--- a/docs/make.bat
+++ /dev/null
@@ -1,36 +0,0 @@
-@ECHO OFF
-
-pushd %~dp0
-
-REM Command file for Sphinx documentation
-
-if "%SPHINXBUILD%" == "" (
-	set SPHINXBUILD=sphinx-build
-)
-set SOURCEDIR=source
-set BUILDDIR=build
-set SPHINXPROJ=Merlin
-
-if "%1" == "" goto help
-
-%SPHINXBUILD% >NUL 2>NUL
-if errorlevel 9009 (
-	echo.
-	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
-	echo.installed, then set the SPHINXBUILD environment variable to point
-	echo.to the full path of the 'sphinx-build' executable. Alternatively you
-	echo.may add the Sphinx directory to PATH.
-	echo.
-	echo.If you don't have Sphinx installed, grab it from
-	echo.http://sphinx-doc.org/
-	exit /b 1
-)
-
-%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
-goto end
-
-:help
-%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
-
-:end
-popd
diff --git a/docs/requirements.in b/docs/requirements.in
deleted file mode 100644
index 268785121..000000000
--- a/docs/requirements.in
+++ /dev/null
@@ -1,4 +0,0 @@
-# This file will list all requirements for the docs so we can freeze a version of them for release.
-# To freeze the versions run: -# pip-compile requirements.in -sphinx \ No newline at end of file diff --git a/docs/requirements.txt b/docs/requirements.txt index 5d3faecfe..e4144dee5 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1 +1,9 @@ -sphinx>=5.3.0 +markdown-grid-tables +mkdocs +mkdocs-codeinclude-plugin +mkdocs-gen-files +mkdocs-glightbox +mkdocs-literate-nav +mkdocs-material +mkdocstrings +mkdocstrings-python diff --git a/docs/source/.gitignore b/docs/source/.gitignore deleted file mode 100644 index 8cdea50f2..000000000 --- a/docs/source/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -# auto generated files from sphinx-autoapi -merlin.rst -merlin.*.rst - diff --git a/docs/source/_static/custom.css b/docs/source/_static/custom.css deleted file mode 100644 index b89e9d889..000000000 --- a/docs/source/_static/custom.css +++ /dev/null @@ -1,39 +0,0 @@ -/* Copy buttons */ -button.copybtn { - webkit-transition: opacity .3s ease-in-out; - -o-transition: opacity .3s ease-in-out; - transition: opacity .3s ease-in-out; - opacity: 0; - padding: 2px 6px; - position: absolute; - right: 4px; - top: 4px; -} -div.highlight:hover .copybtn, div.highlight .copybtn:focus { - opacity: .3; -} -div.highlight .copybtn:hover { - opacity: 1; -} -div.highlight { - position: relative; -} -div.sphinxsidebar { - max-height: 100%; - overflow-y: auto; -} -td { - max-width: 300px; -} -@media screen and (min-width: 875px) { - .sphinxsidebar { - background-color: #fff; - margin-left: 0; - z-index: 1; - height: 100vh; - top: 0px; - } -} -.underline { - text-decoration: underline; -} diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js deleted file mode 100644 index 6bdf11c7e..000000000 --- a/docs/source/_static/custom.js +++ /dev/null @@ -1,25 +0,0 @@ -function addCopyButtonToCode(){ - // get all code elements - var allCodeBlocksElements = $( "div.highlight pre" ); - - // For each element, do the following steps - allCodeBlocksElements.each(function(ii) { - // define a unique id for this element and add it - var currentId = "codeblock" + (ii + 1); - $(this).attr('id', currentId); - - // create a button that's configured for clipboard.js - // point it to the text that's in this code block - // add the button just after the text in the code block w/ jquery - var clipButton = ''; - $(this).after(clipButton); - }); - - // tell clipboard.js to look for clicks that match this query - new Clipboard('.btn'); - } - - $(document).ready(function () { - // Once the DOM is loaded for the page, attach clipboard buttons - addCopyButtonToCode(); - }); diff --git a/docs/source/app_config/app_amqp.yaml b/docs/source/app_config/app_amqp.yaml deleted file mode 100644 index f7de950c8..000000000 --- a/docs/source/app_config/app_amqp.yaml +++ /dev/null @@ -1,58 +0,0 @@ - -celery: - # directory where Merlin looks for the following: - # mysql-ca-cert.pem rabbit-client-cert.pem rabbit-client-key.pem redis.pass - certs: /path/to/celery/config - -broker: - # can be rabbitmq, redis, rediss, or redis+sock - name: rabbitmq - #username: # defaults to your username unless changed here - password: ~/.merlin/rabbit-password - # server URL - server: server.domain.com - - ### for rabbitmq connections ### - #vhost: # defaults to your username unless changed here - - ### for redis+sock connections ### - #socketname: the socket name your redis connection can be found on. - #path: The path to the socket. 
- - ### for redis/rediss connections ### - #port: The port number redis is listening on (default 6379) - #db_num: The data base number to connect to. - - # ssl security - #keyfile: /var/ssl/private/client-key.pem - #certfile: /var/ssl/amqp-server-cert.pem - #ca_certs: /var/ssl/myca.pem - # This is optional and can be required, optional or none - # (required is the default) - #cert_reqs: required - - -results_backend: - # Can be redis,rediss, mysql, db+ or memcached server - # Only a few of these are directly configured by merlin - name: redis - - dbname: dbname - username: username - # name of file where redis password is stored. - password: redis.pass - server: server.domain.com - # merlin will generate this key if it does not exist yet, - # and will use it to encrypt all data over the wire to - # your redis server. - encryption_key: ~/.merlin/encrypt_data_key - port: 6379 - db_num: 0 - - # ssl security - #keyfile: /var/ssl/private/client-key.pem - #certfile: /var/ssl/amqp-server-cert.pem - #ca_certs: /var/ssl/myca.pem - # This is optional and can be required, optional or none - # (required is the default) - #cert_reqs: required diff --git a/docs/source/celery_overview.rst b/docs/source/celery_overview.rst deleted file mode 100644 index 07f589dc1..000000000 --- a/docs/source/celery_overview.rst +++ /dev/null @@ -1,125 +0,0 @@ -Celery -====== - -Merlin uses `Celery `_, a Python based -distributed task management system. Merlin uses Celery to queue work which -is processed by Celery workers. - -Merlin queues tasks to the broker which receives and routes tasks. Merlin by -default is configured to use `RabbitMQ `_. - -Celery has many functions, it defines the interface to the task broker, the -backend results database and the workers that will run the tasks. - -The broker and backend are configured through the app.yaml file. A -configuration for the rabbit ampq server is shown below. - -.. literalinclude:: app_config/app_amqp.yaml - - -The default location for the app.yaml is in the merlin repo under the -config directory. This default can be overridden by files in one of two -other locations. The current working directory is first checked for the -app.yaml file, then the user's ``~/.merlin`` directory is checked. - -The celery command needs application configuration for the specific module that -includes celery, this is specified using the ``-A `` syntax. All celery -commands should include the ``-A`` argument. - -.. code-block:: python - - celery -A merlin - -The merlin run command will define the tasks from the steps in the yaml file -and then send them to the broker through the celery broker interface. If these -tasks are no longer needed or are incorrect, they can be purged by using one of -these commands: - -.. code-block:: python - - celery -A merlin -Q purge - # This is the equivalent of merlin purge - e.g. - celery -A merlin -Q merlin,queue2,queue3 purge - - or with rabbitmq: - celery -A merlin amqp queue.purge - e.g. - celery -A merlin amqp queue.purge merlin - - a third option with rabbitmq is deleting the queue - celery -A merlin amqp queue.delete - e.g. - celery -A merlin amqp queue.delete merlin - - -.. _celery-config: - -Configuring celery workers -__________________________ - -The common configurations used for the celery workers in the -`celery workers guide `_ -are not the best for HPC applications. Here are some parameters you -may want to use for HPC specific workflows. 
- -These options can be altered by setting the args for an entry of type -worker in the merlin resources section. - -The number of threads to use on each node of the HPC allocation is set -through the ``--concurrency`` keyword. A good choice for this is the number of -simulations that can be run per node. - -.. code-block:: bash - - celery -A merlin worker --concurrency - - e.g. - # If the HPC simulation is a simple 1D short running sim - # then on Lassen you might want to use all Hardware threads. - celery -A merlin worker --concurrency 160 - - # If the HPC simulation will take the whole node you may want - # to limit this to only a few threads. - celery -A merlin worker --concurrency 2 - - -The ``--prefetch-multiplier`` argument sets how many tasks are requested from -the task server per worker thread. If ``--concurrency`` is 2 and -``--prefetch-multiplier`` is 3, then 6 tasks will be requested from the task -server by the worker threads. Since HPC tasks are generally not short -running tasks, the recommendation is to set this to 1. - -.. code-block:: bash - - celery -A merlin worker --prefetch-multiplier - e.g. - celery -A merlin worker --prefetch-multiplier 1 - -The ``-O fair`` option is another parameter used for long running celery -tasks. With this set, celery will only send tasks to threads that are -available to run them. - -.. code-block:: bash - - celery -A merlin worker -O fair - -The ``-n`` option allows the workers to be given a unique name so multiple -workers running tasks from different queues may share the allocation -resources. The names are automatically set to ``.%h``, where -```` is from the task_queue config or merlin (default) and ``%h`` -will resolve to the hostname of the compute node. - -.. code-block:: bash - - celery -A merlin worker -n - e.g. - celery -A merlin worker -n merlin.%h - or - celery -A merlin worker -n queue_1.%h - -On the toss3 nodes, the CPU affinity can be set for the worker processes. -This is enabled by setting the environment variable ``CELERY_AFFINIITY`` to the -number of CPUs to skip. -e.g. ``export CELERY_AFFINIITY=4`` -This will skip 4 CPUs between each celery worker thread. diff --git a/docs/source/conf.py b/docs/source/conf.py deleted file mode 100644 index b578e8672..000000000 --- a/docs/source/conf.py +++ /dev/null @@ -1,183 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Configuration file for the Sphinx documentation builder. -# -# This file does only contain a selection of the most common options. For a -# full list see the documentation: -# http://www.sphinx-doc.org/en/master/config - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. 
-# -from datetime import date -import os -import sys - -sys.path.insert(0, os.path.abspath("../..")) - -MERLIN_VERSION = __import__("merlin").VERSION - -# -- Project information ----------------------------------------------------- - -_year = date.today().year - -project = "Merlin" -copyright = "{}, LLNL: LLNL-CODE-797170".format(_year) -author = "Lawrence Livermore National Laboratory" - -# The short X.Y version -version = MERLIN_VERSION -# The full version, including alpha/beta/rc tags -release = MERLIN_VERSION - - -# -- General configuration --------------------------------------------------- - -# If your documentation needs a minimal Sphinx version, state it here. -# -# needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -# extensions = [ -# 'sphinx.ext.autodoc', -# 'sphinx.ext.intersphinx', -# ] -extensions = ["sphinx.ext.autodoc"] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ["_templates"] - -# The suffix(es) of source filenames. -# You can specify multiple suffix as a list of string: -# -# source_suffix = ['.rst', '.md'] -source_suffix = ".rst" - -# The master toctree document. -master_doc = "index" - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. -language = "en" - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path . -exclude_patterns = [] - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = "sphinx" - - -# -- Options for HTML output ------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_theme = "alabaster" - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -# -html_theme_options = { - "description": "Machine learning for HPC workflows", - "github_user": "LLNL", - "github_repo": "merlin", - "fixed_sidebar": True, - "show_relbars": True, -} - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ["_static"] - -html_css_files = ["custom.css"] - -# Custom sidebar templates, must be a dictionary that maps document names -# to template names. -# -# The default sidebars (for documents that don't match any pattern) are -# defined by theme itself. Builtin themes are using these templates by -# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', -# 'searchbox.html']``. -# -# html_sidebars = {} - - -# -- Options for HTMLHelp output --------------------------------------------- - -# Output file base name for HTML help builder. -htmlhelp_basename = "Merlindoc" - - -# -- Options for LaTeX output ------------------------------------------------ - -latex_engine = "pdflatex" -latex_elements = { - # The paper size ('letterpaper' or 'a4paper'). 
- # - 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). - # - 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. - # - 'preamble': '', - # Latex figure (float) alignment - # - 'figure_align': 'htbp', -} - -latex_logo = "../images/merlin.png" - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). -latex_documents = [ - (master_doc, "Merlin.tex", "Merlin Documentation", "The Merlin Development Team", "manual"), -] - - -# -- Options for manual page output ------------------------------------------ - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [(master_doc, "merlin", "Merlin Documentation", [author], 1)] - - -# -- Options for Texinfo output ---------------------------------------------- - -# Grouping the document tree into Texinfo files. List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -texinfo_documents = [ - (master_doc, "Merlin", "Merlin Documentation", author, "Merlin", "One line description of project.", "Miscellaneous"), -] - - -# -- Extension configuration ------------------------------------------------- - -primary_domain = "py" - -highlight_language = "bash" - -intersphinx_mapping = {"python": ("https://docs.python.org/3", None)} - - -def setup(app): - try: - app.add_javascript("custom.js") - app.add_javascript("https://cdn.jsdelivr.net/npm/clipboard@1/dist/clipboard.min.js") - except AttributeError: - app.add_js_file("custom.js") - app.add_js_file("https://cdn.jsdelivr.net/npm/clipboard@1/dist/clipboard.min.js") diff --git a/docs/source/docker.rst b/docs/source/docker.rst deleted file mode 100644 index f5c3b164a..000000000 --- a/docs/source/docker.rst +++ /dev/null @@ -1,93 +0,0 @@ -Docker -====== - -Merlin has a simple Dockerfile description for running a container -with all requirements installed. - - -Build the container -******************* - -The docker container can be built by building in the top level -merlin directory. - -.. code:: bash - - docker build -t merlin . - - -This will create a merlin:latest image in your docker image -collection with a user "merlinu" and a WORKDIR set to /home/merlinu. - -.. code:: bash - - docker images - - -Run the container -***************** - -The container can be run in detached mode to provide both the ``merlin`` -and ``celery`` commands - -.. code:: bash - - docker run --rm -td --name my-merlin merlin - alias merlin="docker exec my-merlin merlin" - alias celery="docker exec my-merlin celery" - -Examples can be run through docker containers by first starting a server -for the broker and backend. The server can be a redis or rabbitmq , for this -demonstration a redis server will be used. The backend will always be a -redis server. - -.. code:: bash - - docker pull redis - docker run -d -p 6379:6379 --name my-redis redis - -A local output directory can be defined -by using the ``--volume`` docker arguments. It is -recommended that a fixed directory be used for the ``--volume`` argument. -The merlin docker container is linked to the redis server above by using -the ``--link`` option. - -.. 
code:: bash - - # Create local working directory - mkdir $HOME/merlinu - cd $HOME/merlinu - - docker pull llnl/merlin - docker run --rm -td --name my-merlin --link my-redis --volume "$HOME/merlinu":/home/merlinu llnl/merlin - - alias merlin="docker exec my-merlin merlin" - alias celery="docker exec my-merlin celery" - - # Create the $HOME/merlinu/.merlin/app.yaml using redis - merlin config --broker redis - - - - # Copy an example to the local dir - merlin example feature_demo - - # Run a test run without workers - merlin run feature_demo/feature_demo.yaml --dry --local - - # Define the tasks and load them on the broker - merlin run feature_demo/feature_demo.yaml - - # Start workers to pull tasks from the server and run them in the container - merlin run-workers feature_demo/feature_demo.yaml - - -A shell can started in the container by using the -``--entrypoint`` command. If the user would like to examine the container -contents, they can use a shell as the entry point. - -.. code:: bash - - docker run --rm -ti --volume "$HOME/merlinu":/home/merlinu --entrypoint="/bin/bash" merlin - - diff --git a/docs/source/faq.rst b/docs/source/faq.rst deleted file mode 100644 index 3632aab6c..000000000 --- a/docs/source/faq.rst +++ /dev/null @@ -1,476 +0,0 @@ -.. _faq: - -.. role:: underline - :class: underline - -FAQ -=== -.. contents:: Frequently Asked Questions - :local: - -General -------- -What is Merlin? -~~~~~~~~~~~~~~~ -Merlin is a distributed task queue system -designed to facilitate the large scale -execution of HPC ensembles, like those -needed to build machine learning models -of complex simulations. - -Where can I get help with Merlin? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -In addition to this :doc:`documentation <./index>`, -the Merlin developers can be reached at -merlin@llnl.gov. -You can also reach out to the merlin user -group mailing list: merlin-users@listserv.llnl.gov. - -Setup & Installation --------------------- - -How can I build Merlin? -~~~~~~~~~~~~~~~~~~~~~~~ -Merlin can be installed via -`pip `_ in a python -:doc:`virtual environment <./virtualenv>` -or via :doc:`spack <./spack>`. - -See :doc:`Getting started <./getting_started>`. - -Do I have to build Merlin? -~~~~~~~~~~~~~~~~~~~~~~~~~~ -If you're at LLNL and want to run on LC, you -can use one of the public deployments. -For more information, check out the LLNL access page -in confluence. - -What are the setup instructions at LLNL? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -See "Do I have to build Merlin" - -How do I reconfigure for different servers? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The server configuration is set in ``~/.merlin/app.yaml``. -Details can be found :doc:`here <./merlin_config>`. - -Component Technology --------------------- -What underlying libraries does Merlin use? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -* Celery - * :ref:`what-is-celery` -* Maestro - * :ref:`what-is-maestro` - -What security features are in Merlin? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Merlin encrypts network traffic of step results, -implying that all results are encrypted with a -unique user-based key, which is auto-generated -and placed in ``~/.merlin/``. This allows -for multiple users to share a results database. -This is important since some backends, like -redis do not allow for multiple distinct users. - -.. _what-is-celery: - -What is celery? -~~~~~~~~~~~~~~~ -Celery is an asynchronous task/job queue based on distributed message passing. -It is focused on real-time operation, but supports scheduling as well. 
-See `Celery's GitHub page -`_ -and `Celery's website -`_ for more details. - -.. _what-is-maestro: - -What is maestro? -~~~~~~~~~~~~~~~~ -Maestro is a tool and library for specifying and conducting -general workflows. -See `Maestro's GitHub page -`_ -for more details. - -Designing and Building Workflows --------------------------------- -:doc:`yaml specification file <./merlin_specification>` - -Where are some example workflows? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code:: bash - - $ merlin example list - -How do I launch a workflow? -~~~~~~~~~~~~~~~~~~~~~~~~~~~ -To launch a workflow locally, use ``merlin run --local ``. -To launch a distributed workflow, use ``merlin run-workers ``, -and ``merlin run ``. -These may be done in any order. - -How do I describe workflows in Merlin? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -A Merlin workflow is described with a :doc:`yaml specification file <./merlin_specification>`. - -What is a DAG? -~~~~~~~~~~~~~~ -DAG is an acronym for 'directed acyclic graph'. -This is the way your workflow steps are represented as tasks. - -What if my workflow can't be described by a DAG? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -There are certain workflows that cannot be explicitly defined by a single DAG; however, in our experience, many can. -Furthermore, those workflows that cannot usually do employ DAG sub-components. -You probably can gain much of the functionality you want by combining a DAG with control logic return features (like step restart and additional calls to ``merlin run``). - - -How do I implement workflow looping / iteration? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Combining ``exit $(MERLIN_RETRY)`` with ``max_retries`` can allow you to loop a single step. -Entire workflow looping / iteration can be accomplished by finishing off your DAG with a final step that makes another call to ``merlin run``. - - -Can steps be restarted? -~~~~~~~~~~~~~~~~~~~~~~~ -Yes. To build this into a workflow, use ``exit $(MERLIN_RETRY)`` within a step to retry a failed ``cmd`` section. -The max number of retries in given step can be specified with the ``max_retries`` field. - -Alternatively, use ``exit $(MERLIN_RESTART)`` to run the optional ``.run.restart`` section. - -To delay a retry or restart directive, add the ``retry_delay`` field to the step. -Note: ``retry_delay`` only works in server mode (ie not ``--local`` mode). - -To restart failed steps after a workflow is done running, see :ref:`restart`. - - -How do I put a time delay in before a restart or retry? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Add the ``retry_delay`` field to the step. This specifies how many seconds before the task -gets run after the restart. Set this value to large enough for your problem to finish. - -See the ``merlin example restart_delay`` example for syntax. - -Note: ``retry_delay`` only works in server mode (ie not ``--local`` mode). - -I have a long running batch task that needs to restart, what should I do? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Before your allocation ends, use ``$(MERLIN_RESTART)`` or ``$(MERLIN_RETRY)`` but -with a ``retry_delay`` on your step for longer that your allocation has left. -The server will hold onto the step for that long (in seconds) before releasing it, -allowing your batch allocation to end without the worker grabbing the step right away. - -For instance, your step could look something like this - -.. 
code:: yaml - - name: batch_task - description: A long running task that needs to restart - run: - cmd: | - # Run my code, but end 60 seconds before my allocation - my_code --end_early 60s - if [ -e restart_needed_flag ]; then - exit $(MERLIN_RESTART) - fi - retry_delay: 120 # wait at least 2 minutes before restarting - -How do I mark a step failure? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Each step is ultimately designated as: -* a success ``$(MERLIN_SUCCESS)`` -- writes a ``MERLIN_FINISHED`` file to the step's workspace directory -* a soft failure ``$(MERLIN_SOFT_FAIL)`` -- allows the workflow to continue -* a hard failure ``$(MERLIN_HARD_FAIL)`` -- stops the whole workflow by shutting down all workers on that step - -Normally this happens behinds the scenes, so you don't need to worry about it. -To hard-code this into your step logic, use a shell command such as ``exit $(MERLIN_HARD_FAIL)``. - -.. note:: - The ``$(MERLIN_HARD_FAIL)`` exit code will shutdown all workers connected to the queue associated - with the failed step. To shutdown *all* workers use the ``$(MERLIN_STOP_WORKERS)`` exit code - -To rerun all failed steps in a workflow, see :ref:`restart`. -If you really want a previously successful step to be re-run, you can first manually remove the ``MERLIN_FINISHED`` file. - - -What fields can be added to steps? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Steps have a ``name``, ``description``, and ``run`` field, as shown below. - -.. code:: yaml - - name: - description: - run: - cmd: - -Also under ``run``, the following fields are optional: - -.. code:: yaml - - run: - depends: - task_queue: - shell: - max_retries: - retry_delay: - nodes: - procs: - -How do I specify the language used in a step? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -You can add the field ``shell`` under the ``run`` portion of your step -to change the language you write your step in. The default is ``/bin/bash``, -but you can do things like ``/usr/bin/env python`` as well. -Use ``merlin example feature_demo`` to see an example of this. - -Running Workflows ------------------ - -.. code:: bash - - $ merlin run - -For more details, see :doc:`Merlin commands<./merlin_commands>`. - -How do I set up a workspace without executing step scripts? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code:: bash - - $ merlin run --dry - -How do I start workers? -~~~~~~~~~~~~~~~~~~~~~~~ - -.. code:: bash - - $ merlin run-workers - -How do I see what workers are connected? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code:: bash - - $ merlin query-workers - -This command gives you fine control over which workers you're looking for via -a regex on their name, the queue names associated with workers, or even by providing -the name of a spec file where workers are defined. - -For more info, see :ref:`query-workers`. - -How do I stop workers? -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Interactively outside of a workflow (e.g. at the command line), you can do this with - -.. code:: bash - - $ merlin stop-workers - -This gives you fine control over which kinds of workers to stop, for instance via -a regex on their name, or the queue names you'd like to stop. - -From within a step, you can exit with the ``$(MERLIN_STOP_WORKERS)`` code, which will -issue a time-delayed call to stop all of the workers, or with the ``$(MERLIN_HARD_FAIL)`` -directive, which will stop all workers connected to the current step. 
This helps prevent -the *suicide race condition* where a worker could kill itself before removing the step -from the workflow, causing the command to be left there for the next worker and creating -a really bad loop. - -You can of course call ``merlin stop-workers`` from within a step, but be careful to make -sure the worker executing it won't be stopped too. - -For more tricks, see :ref:`stop-workers`. - -.. _restart: - -How do I re-run failed steps in a workflow? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code:: bash - - $ merlin restart - -What tasks are in my queue? -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -How do I purge tasks? -~~~~~~~~~~~~~~~~~~~~~ - -.. code:: bash - - $ merlin purge - -Why is stuff still running after I purge? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -You probably have workers executing tasks. Purging -removes them from the server queue, but any currently -running or reserved tasks are being held by the workers. -You need to shut down these workers first: - -.. code:: bash - - $ merlin stop-workers - $ merlin purge - -Why am I running old tasks? -~~~~~~~~~~~~~~~~~~~~~~~~~~~ -You might have old tasks in your queues. Try ``merlin purge ``. -You might also have rogue workers. To find out, try ``merlin query-workers``. - -Where do tasks get run? -~~~~~~~~~~~~~~~~~~~~~~~ - -Can I run different steps from my workflow on different machines? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Yes. Under the ``merlin`` block you can specify which machines your workers are allowed on. -In order for this to work, you must then use ``merlin run-workers`` separately on each of -the specified machines. - -.. code:: yaml - - merlin: - resources: - workers: - worker_name: - machines: [hostA, hostB, hostC] - -.. _slurm: - -What is Slurm? -~~~~~~~~~~~~~~ -A job scheduler. See `Slurm documentation -`_ -. - -.. _lsf: - -What is LSF? -~~~~~~~~~~~~ -Another job scheduler. See `IBM's LSF documentation -`_ -. - -.. _flux: - -What is flux? -~~~~~~~~~~~~~ -Flux is a hierarchical scheduler and launcher for parallel simulations. It allows the user -to specify the same launch command that will work on different HPC clusters with different -default schedulers such as SLURM or LSF. Merlin versions earlier than 1.9.2 used the non-flux native -scheduler to launch a flux instance. Subsequent merlin versions can launch the merlin workers -using a native flux scheduler. -More information can be found at the `Flux web page `_. - - -Older versions of flux may need the ``--mpi=none`` argument if flux is -launched on a system using the SLURM scheduler. This argument can be added -in the ``launch_args`` variable in the batch section. - -.. code:: yaml - - batch: - type: flux - launch_args: --mpi=none - -.. _pbs: - -What is PBS? -~~~~~~~~~~~~ -Another job scheduler. See `Portable Batch System -`_ -. -This functionality is only available to launch a flux scheduler. - -How do I use flux on LC? -~~~~~~~~~~~~~~~~~~~~~~~~ -The ``--mpibind=off`` option is currently required when using flux with a slurm launcher -on LC toss3 systems. Set this in the batch section as shown in the example below. - -.. code:: yaml - - batch: - type: flux - launch_args: --mpibind=off - -What is ``LAUNCHER``? -~~~~~~~~~~~~~~~~~~~~~ -``$LAUNCHER`` is a reserved word that may be used in a step command. It serves as an abstraction to launch a job with parallel schedulers like :ref:`slurm`, :ref:`lsf`, and :ref:`flux`. - -How do I use ``LAUNCHER``? -~~~~~~~~~~~~~~~~~~~~~~~~~~ -Instead of this: - -.. 
code:: yaml - - run: - cmd: srun -N 1 -n 3 python script.py - -Do something like this: - -.. code:: yaml - - batch: - type: slurm - - run: - cmd: $(LAUNCHER) python script.py - nodes: 1 - procs: 3 - -:underline:`The arguments the LAUNCHER syntax will use`: - -``procs``: The total number of MPI tasks - -``nodes``: The total number of MPI nodes - -``walltime``: The total walltime of the run (hh:mm:ss or mm:ss or ss) (not available in lsf) - -``cores per task``: The number of hardware threads per MPI task - -``gpus per task``: The number of GPUs per MPI task - -:underline:`SLURM specific run flags`: - -``slurm``: Verbatim flags only for the srun parallel launch (srun -n -n ) - -:underline:`FLUX specific run flags`: - -``flux``: Verbatim flags for the flux parallel launch (flux mini run ) - -:underline:`LSF specific run flags`: - -``bind``: Flag for MPI binding of tasks on a node (default: -b rs) - -``num resource set``: Number of resource sets - -``launch_distribution``: The distribution of resources (default: plane:{procs/nodes}) - -``lsf``: Verbatim flags only for the lsf parallel launch (jsrun ... ) - -What is level_max_dirs? -~~~~~~~~~~~~~~~~~~~~~~~ -``level_max_dirs`` is an optional field that goes under the ``merlin.samples`` section -of a yaml spec. It caps the number of sample directories that can be generated -at a single level of a study's sample hierarchy. This is useful for getting around -filesystem constraints when working with massive amounts of data. - -Defaults to 25. - -What is pgen? -~~~~~~~~~~~~~ -``pgen`` stands for "parameter generator". It's a way to override the parameters in the -``global.parameters`` spec section, instead generating them programatically with a python script. -Merlin offers the same pgen functionality as Maestro. - -See `this guide `_ for details on using ``pgen``. -It's a Maestro doc, but the exact same flags can be used in conjunction with ``merlin run``. - -Where can I learn more about merlin? -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Check out `our paper `_ on arXiv. diff --git a/docs/source/getting_started.rst b/docs/source/getting_started.rst deleted file mode 100644 index 4b1a4c1a3..000000000 --- a/docs/source/getting_started.rst +++ /dev/null @@ -1,156 +0,0 @@ -Getting Started -================ - -Quick Start -++++++++++++++ -:: - - pip3 install merlin - -All set up? See the :doc:`Merlin Commands <./merlin_commands>` section for using merlin. - -Check out the :doc:`Tutorial<./tutorial>`! - -Developer Setup -++++++++++++++++++ -The developer setup can be done via pip or via make. This section will cover how to do both. - -Additionally, there is an alternative method to setup merlin on supercomputers. See the :doc:`Spack <./spack>` section for more details. - -Pip Setup -****************** - -To install with the additional developer dependencies, use:: - - pip3 install "merlin[dev]" - -or:: - - pip3 install -e "git+https://github.com/LLNL/merlin.git@develop#egg=merlin[dev]" - -Make Setup -******************* - -Visit the `Merlin repository `_ on github. `Create a fork of the repo `_ and `clone it `_ onto your system. - -Change directories into the merlin repo: - -.. code-block:: bash - - $ cd merlin/ - -Install Merlin with the developer dependencies: - -.. code-block:: bash - - $ make install-dev - -This will create a virtualenv, start it, and install Merlin and it's dependencies for you. - -More documentation about using Virtualenvs with Merlin can be found at -:doc:`Using Virtualenvs with Merlin <./virtualenv>`. 
- -We can make sure it's installed by running: - -.. code-block:: bash - - $ merlin --version - -If you don't see a version number, you may need to restart your virtualenv and try again. - -Configuring Merlin -******************* - -Once Merlin has been installed, the installation needs to be configured. -Documentation for merlin configuration is in the :doc:`Configuring Merlin <./merlin_config>` section. - -That's it. To start running Merlin see the :doc:`Merlin Workflows. <./merlin_workflows>` - -(Optional) Testing Merlin -************************* - -.. warning:: - - With python 3.6 you may see some tests fail and a unicode error presented. To fix this, you need to reset the LC_ALL environment variable to en_US.utf8. - -If you have ``make`` installed and the `Merlin repository `_ cloned, you can run the test suite provided in the Makefile by running: - -.. code-block:: bash - - $ make tests - -This will run both the unit tests suite and the end-to-end tests suite. - -If you'd just like to run the unit tests you can run: - -.. code-block:: bash - - $ make unit-tests - -Similarly, if you'd just like to run the end-to-end tests you can run: - -.. code-block:: bash - - $ make e2e-tests - -Custom Setup -+++++++++++++ - -This section documents how to install Merlin without using the Makefile. This -setup is more complicated; however, allows for more customization of the setup -configurations. - -Clone the `Merlin `_ -repository:: - - git clone https://github.com/LLNL/merlin.git - - -Create a virtualenv -******************* - -Merlin uses `virtualenvs `_ to manage -package dependencies which can be installed via Pip, Python's default -package manager. - -More documentation about using Virtualenvs with Merlin can be found at -:doc:`Using Virtualenvs with Merlin <./virtualenv>`. - -To create a new virtualenv and activate it: - -.. code:: bash - - $ python3 -m venv venv_merlin_$SYS_TYPE_py3_6 - $ source venv_merlin_$SYS_TYPE_py3/bin/activate # Or activate.csh for .cshrc - - -Install Python Package Dependencies -************************************ - -Merlin uses Pip to manage Python dependencies. Merlin dependencies can be -found in the requirements directory in the Merlin repository. - -To install the standard set of dependencies run: - -.. code:: bash - - (merlin3_7) $ pip install -r requirements.txt - -This will install all the required dependencies for Merlin and development -development dependencies. - - -Installing Merlin -******************* - -Merlin can be installed in editable mode. From within the Merlin repository: - -.. code:: bash - - (merlin3_7) $ pip install -e . - -Any changes made to the Merlin source code should automatically reflect in the -virtualenv. - -.. tip:: If changes to Merlin's source code do not reflect when running Merlin - try running `pip install -e .` from within the Merlin repository. diff --git a/docs/source/index.rst b/docs/source/index.rst deleted file mode 100644 index 3776466d3..000000000 --- a/docs/source/index.rst +++ /dev/null @@ -1,83 +0,0 @@ -.. image:: ../images/merlin.png - -Merlin Documentation -==================== -Merlin is a tool for running machine learning based workflows. The goal of -Merlin is to make it easy to build, run, and process the kinds of large -scale HPC workflows needed for cognitive simulation. - -Merlin Overview ---------------- -Merlin is a distributed task queuing system, designed to allow complex -HPC workflows to scale to large numbers of simulations -(we've done 100 Million on the Sierra Supercomputer). 
- -Why would you want to run that many simulations? -To become your own Big Data generator. - -Data sets of this size can be large enough to train deep neural networks -that can mimic your HPC application, to be used for such -things as design optimization, uncertainty quantification and statistical -experimental inference. Merlin's been used to study inertial confinement -fusion, extreme ultraviolet light generation, structural mechanics and -atomic physics, to name a few. - -How does it work? - -In essence, Merlin coordinates complex workflows through a persistent -external queue server that lives outside of your HPC systems, but that -can talk to nodes on your cluster(s). As jobs spin up across your ecosystem, -workers on those allocations pull work from a central server, which -coordinates the task dependencies for your workflow. Since this coordination -is done via direct connections to the workers (i.e. not through a file -system), your workflow can scale to very large numbers of workers, -which means a very large number of simulations with very little overhead. - -Furthermore, since the workers pull their instructions from the central -server, you can do a lot of other neat things, like having multiple -batch allocations contribute to the same work (think surge computing), or -specialize workers to different machines (think CPU workers for your -application and GPU workers that train your neural network). Another -neat feature is that these workers can add more work back to the central -server, which enables a variety of dynamic workflows, such as may be -necessary for the intelligent sampling of design spaces or reinforcement -learning tasks. - -Merlin does all of this by leveraging some key HPC and cloud computing -technologies, building off open source components. It uses -`maestro `_ to -provide an interface for describing workflows, as well as for defining -workflow task dependencies. It translates those dependencies into concrete -tasks via `celery `_, -which can be configured for a variety of backend -technologies (`rabbitmq `_ and -`redis `_ are currently supported). Although not -a hard dependency, we encourage the use of -`flux `_ for interfacing with -HPC batch systems, since it can scale to a very large number of jobs. - -The integrated system looks a little something like this: - -.. image:: ../images/merlin_arch.png - -For more details, check out the rest of the documentation. - -Need help? `merlin@llnl.gov `_ - -.. toctree:: - :maxdepth: 2 - - tutorial - getting_started - faq - merlin_commands - merlin_workflows - merlin_specification - merlin_config - merlin_variables - merlin_server - celery_overview - virtualenv - spack - merlin_developer - docker diff --git a/docs/source/merlin_commands.rst deleted file mode 100644 index 1baa0e7a5..000000000 --- a/docs/source/merlin_commands.rst +++ /dev/null @@ -1,480 +0,0 @@ -Command line -============ - -The merlin executable defines a number of commands to create tasks, -launch workers to run the tasks and remove tasks from the task server. -The tasks are communicated to a task server, or broker, where they are then -requested by workers on an allocation and run. The celery Python module -is used to implement the task and worker functionality. - - -Help (``merlin --help``) ------------------------- - -Descriptions of the Merlin commands are output when the ``-h`` or -``--help`` flags are used. - -..
code:: bash - - $ merlin [] --help - - -Version (``merlin --version``) ------------------------------- - -See the version by using the ``--version`` or ``-v`` flag. - -.. code:: bash - - $ merlin --version - - -Log Level (``merlin -lvl debug``) ---------------------------------- -More information, generally useful for debugging, can be output by increasing the logging level -using the ``-lvl`` or ``--level`` argument. - -Options for the level argument are: debug, info, warning, error. - -.. code:: bash - - $ merlin -lvl debug run - - -Create the Config File (``merlin config``) ------------------------------------------- - -Create a default config file in the ${HOME}/.merlin directory using the ``config`` command. This file -can then be edited for your system configuration. - -.. code:: bash - - $ merlin config [--task_server] [--output_dir ] [--broker ] - -The ``--task_server`` option will select the appropriate configuration for the -given task server. Currently only celery is implemented. - -The ``--output_dir`` or ``-o`` option will write the configuration to the given directory. -This file can then be edited and copied into ${HOME}/.merlin. - -The ``--broker`` option will write the initial ``app.yaml`` config file -for a ``rabbitmq`` or ``redis`` broker. The default is ``rabbitmq``. -The backend will be ``redis`` in -both cases. The redis backend in the ``rabbitmq`` config shows the -use of encryption for the backend. - - -Generate working examples (``merlin example``) ----------------------------------------------- - -If you want to run an example workflow, use Merlin's ``merlin example``: - -.. code:: bash - - $ merlin example list - -This will list the available example workflows and a description for each one. To -select one: - -.. code:: bash - - $ merlin example - -This will copy the example workflow to the current working directory. It is -possible to specify another path to copy to. - -.. code:: bash - - $ merlin example -p path/to/dir - -If the specified directory does not exist, Merlin will automatically create it. - -This will generate the example workflow at the specified location, ready to be run. - - -Information (``merlin info``) ------------------------------ - -Information about your Merlin and Python configuration can be printed out by using the -``info`` command. This is helpful for debugging. Included in this command -is a server check which will check for server connections. The connection -check will time out after 60 seconds. - -.. code:: bash - - $ merlin info - - -Monitor (``merlin monitor``) ----------------------------- -Batch submission scripts may not keep the batch allocation alive -if there is not a blocking process in the submission script. The -``merlin monitor`` command addresses this by providing a blocking process that -checks for tasks in the queues every (sleep) seconds. When the queues are empty, the -monitor will query celery to see if any workers are still processing tasks from the -queues. If no workers are processing any tasks from the queues and the queues are empty, -the blocking process will exit and allow the allocation to end. - -The ``monitor`` function will check for celery workers for up to -10*(sleep) seconds before monitoring begins. The loop happens when the -queue(s) in the spec contain tasks, but no running workers are detected. -This is to protect against a failed worker launch. - -..
code:: bash - - $ merlin monitor [--steps ] [--vars >] [--sleep ] [--task_server celery] - -Use the ``--steps`` option to identify specific steps in the specification that you want to query. - -The ``--vars`` option will specify desired Merlin variable values to override -those found in the specification. The list is space-delimited and should be given after -the input yaml file. -``Example: --vars LEARN=path/to/new_learn.py EPOCHS=3`` - -The ``--sleep`` argument is the duration in seconds between checks -for workers. The default is 60 seconds. - -The only currently available option for ``--task_server`` is celery, which is the default when this flag is excluded. - - -Purging Tasks (``merlin purge``) -------------------------------- - -Once the ``merlin run`` command succeeds, the tasks are on the task server -waiting to be run by the workers. If you would like to remove the tasks from -the server, then use the purge command. - -.. attention:: - - Any tasks reserved by workers will not be purged from the queues. All - workers must first be stopped so the tasks can be returned to the task - server; then they can be purged. - - You probably want to use ``merlin stop-workers`` first. - -To purge all tasks in all queues defined by the workflow yaml file from the -task server, run: - -.. code:: bash - - $ merlin purge [-f] [--steps ] [--vars >] - -This will ask you if you would like to remove the tasks; you can use the -``-f`` option to skip this prompt. - -If you have different queues in your workflow yaml file, you can -choose which queues are purged by using the ``--steps`` argument and -giving a space-delimited list of steps. - -.. code:: bash - - $ merlin purge --steps step1 step2 - -The ``--vars`` option will specify desired Merlin variable values to override -those found in the specification. The list is space-delimited and should be given after -the input yaml file. -``Example: --vars QUEUE_NAME=new_queue EPOCHS=3`` - - -.. _query-workers: - -Searching for any workers (``merlin query-workers``) ----------------------------------------------------- - -If you want to see all workers that are currently connected to -the task server you can use: - -.. code:: bash - - $ merlin query-workers - -This will broadcast a command to all connected workers and print -the names of any that respond and the queues they're attached to. -This is useful for interacting with workers, such as via -``merlin stop-workers --workers``. - -The ``--queues`` option will look for workers associated with the -names of the queues you provide here. For example, if you want to -see the names of all workers attached to the queues named ``demo`` -and ``merlin`` you would use: - -.. code-block:: - - merlin query-workers --queues demo merlin - -The ``--spec`` option will query for workers defined in the spec -file you provide. For example, if ``simworker`` and ``nonsimworker`` -are defined in a spec file called ``example_spec.yaml`` then to query -for these workers you would use: - -.. code-block:: - - merlin query-workers --spec example_spec.yaml - -The ``--workers`` option will query for workers based on the worker -names you provide here. For example, if you wanted to query a worker -named ``step_1_worker`` you would use: - -.. code-block:: - - merlin query-workers --workers step_1_worker - -This flag can also take regular expressions as input. For instance, -if you had several workers running but only wanted to find the workers -whose names started with ``step`` you would use: - -..
code-block:: - - merlin query-workers --workers ^step - - -Restart the workflow (``merlin restart``) ----------------------------------------- - -To restart a previously started merlin workflow, use the ``restart`` command -and the path to the root of the merlin workspace that was generated during the -previously run workflow. This will define the tasks and queue -them on the task server, also called the broker. - -.. code:: bash - - $ merlin restart [--local] - -Merlin currently writes a file called ``MERLIN_FINISHED`` to the directory of each -step that was finished successfully. It uses this to determine which steps to -skip during execution of a workflow. - -The ``--local`` option will run tasks sequentially in your current shell. - - -Run the workflow (``merlin run``) ---------------------------------- - -To run the merlin workflow use the ``run`` command and the path to the -input yaml file ````. This will define the tasks and queue -them on the task server, also called the broker. - -.. code:: bash - - $ merlin run [--local] [--vars >] [--samplesfile ] [--dry] - -The ``--local`` option will run tasks sequentially in your current shell. - -The ``--vars`` option will specify desired Merlin variable values to override -those found in the specification. The list is space-delimited and should be given after -the input yaml file. -``Example: --vars LEARN=path/to/new_learn.py EPOCHS=3`` - -The ``--samplesfile`` option allows the user to specify a file containing samples. Valid choices: .npy, -.csv, .tab. Should be given after the input yaml file. - -The ``--no-errors`` option is used for testing; it silences the errors thrown -when flux is not present. - -Dry Run -^^^^^^^ - -'Dry run' means telling workers to create a study's workspace and all of its necessary -subdirectories and scripts (with variables expanded) without actually executing -the scripts. - -To dry-run a workflow, use ``--dry``: - -.. code:: bash - - $ merlin run --local --dry - -In a distributed fashion: - -.. code:: bash - - $ merlin run --dry ; merlin run-workers - -You can also specify dry runs from the workflow specification file: - -.. code:: yaml - - batch: - dry_run: True - -If you wish to execute a workflow after dry-running it, simply use ``restart``. - - -Run the Workers (``merlin run-workers``) ----------------------------------------- - -The tasks queued on the broker are run by a collection of workers. These -workers can be run locally in the current shell or in parallel on a batch -allocation. -The workers are launched using the -``run-workers`` command which reads the configuration for the worker launch -from the ```` file. -The batch and merlin resources sections are both used to configure the -worker launch. -The top level batch section can be overridden in the merlin -workers resource section. -Parallel workers should be scheduled using the system's batch scheduler. -Once the workers are running, tasks from the broker will be processed. - -To launch workers for your workflow: - -.. code:: bash - - $ merlin run-workers [--echo] [--worker-args ] [--steps ] [--vars >] - -The ``--echo`` option will echo the celery workers run command to stdout and not run any workers. - -The ``--worker-args`` option will pass the values, in quotes, to the celery workers. Should be given -after the input yaml file. - -The ``--steps`` option specifies the steps in the input yaml file whose workers you want to launch. -The default is 'all' steps. Should be given after the input yaml file.
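 - -Putting these options together, a sketch of a typical launch (the spec file name and worker arguments here are hypothetical placeholders): - -.. code:: bash - - # Echo the celery command first to sanity-check it, then launch workers for two steps - $ merlin run-workers my_spec.yaml --echo - $ merlin run-workers my_spec.yaml --worker-args "--concurrency 4" --steps step1 step2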
- -The ``--vars`` option will specify desired Merlin variable values to override -those found in the specification. The list is space-delimited and should be given after -the input yaml file. -``Example: --vars LEARN=path/to/new_learn.py EPOCHS=3`` - -An example of launching a simple celery worker using srun: - -.. code:: bash - - $ srun -n 1 celery -A merlin worker -l INFO - -A parallel batch allocation launch is configured to run a single worker -process per node. This worker process will then launch a number of worker -threads to process the tasks. The number of threads can be configured by -the user and will be the number of parallel jobs that can be run at once -on the allocation plus threads for any non-parallel tasks. -If there are 36 cores on a -node and all the tasks are single core, the user may want to start 36 -threads per node. If the parallel jobs use 8 tasks each, then the user should run -4 or 5 threads. For the celery workers the number of threads is set using -the ``--concurrency`` argument; see the :ref:`celery-config` section. - -A full SLURM batch submission script to run the workflow on 4 nodes is -shown below. - -.. code:: bash - - #!/bin/bash - #SBATCH -N 4 - #SBATCH -J Merlin - #SBATCH -t 30:00 - #SBATCH -p pdebug - #SBATCH --mail-type=ALL - #SBATCH -o merlin_workers_%j.out - - # Assumes you run this in the same dir as the yaml file. - YAML_FILE=input.yaml - - # Source the merlin virtualenv - source /bin/activate - - # Remove all tasks from the queues for this run. - #merlin purge -f ${YAML_FILE} - - # Submit the tasks to the task server - merlin run ${YAML_FILE} - - # Print out the workers command - merlin run-workers ${YAML_FILE} --echo - - # Run the workers on the allocation - merlin run-workers ${YAML_FILE} - - # Delay until the workers cease running - merlin monitor - - -Status (``merlin status``) -------------------------- -.. code:: bash - - $ merlin status [--steps ] [--vars >] [--csv ] [--task_server celery] - -Use the ``--steps`` option to identify specific steps in the specification that you want to query. - -The ``--vars`` option will specify desired Merlin variable values to override -those found in the specification. The list is space-delimited and should be given after -the input yaml file. -``Example: --vars LEARN=path/to/new_learn.py EPOCHS=3`` - -The ``--csv`` option takes in a filename to dump status reports to. - -The only currently available option for ``--task_server`` is celery, which is the default when this flag is excluded. - - -.. _stop-workers: - -Stopping workers (``merlin stop-workers``) ------------------------------------------- - -To send out a stop signal to some or all connected workers, use: - -.. code:: bash - - $ merlin stop-workers [--spec ] [--queues ] [--workers ] [--task_server celery] - - -The default behavior will send a stop to all connected workers across all workflows, -having them shut down softly. - -The ``--spec`` option targets only workers named in the ``merlin`` block of the spec file. - -The ``--queues`` option allows you to pass in the names of specific queues to stop. For example: - -.. code:: bash - - # Stop all workers on these queues, no matter their name - $ merlin stop-workers --queues queue1 queue2 - -The ``--workers`` option allows you to pass in regular expressions of names of workers to stop: - -..
code:: bash - - # Stop all workers whose name matches this pattern, no matter the queue - # Note the ".*" convention at the start, per regex - $ merlin stop-workers --workers ".*@my_other_host*" - -The only currently available option for ``--task_server`` is celery, which is the default when this flag is excluded. - -.. attention:: - - If you've named workers identically (you shouldn't) - only one might get the signal. In this case, you can send it - again. - -Hosting Local Server (``merlin server``) ----------------------------------------- - -Use this command to create a local server for merlin to connect to. Merlin server creates and configures a server in the current directory. -This allows multiple instances of merlin server to exist for different studies or uses. - -The ``init`` subcommand initializes a new instance of merlin server. - -The ``status`` subcommand checks the status of the merlin server. - -The ``start`` subcommand starts the merlin server. - -The ``stop`` subcommand stops the merlin server. - -The ``restart`` subcommand performs a stop command followed by a start command on the merlin server. - -The ``config`` subcommand edits configurations for the merlin server. There are multiple flags to allow for different configurations. - -- The ``-ip IPADDRESS, --ipaddress IPADDRESS`` option sets the bound IP address for merlin server. -- The ``-p PORT, --port PORT`` option sets the bound port for merlin server. -- The ``-pwd PASSWORD, --password PASSWORD`` option sets the password file for merlin server. -- The ``--add-user USER PASSWORD`` option adds a new user for merlin server. -- The ``--remove-user REMOVE_USER`` option removes an existing user from merlin server. -- The ``-d DIRECTORY, --directory DIRECTORY`` option sets the working directory for merlin server. -- The ``-ss SNAPSHOT_SECONDS, --snapshot-seconds SNAPSHOT_SECONDS`` option sets the number of seconds before each snapshot. -- The ``-sc SNAPSHOT_CHANGES, --snapshot-changes SNAPSHOT_CHANGES`` option sets the number of database changes before each snapshot. -- The ``-sf SNAPSHOT_FILE, --snapshot-file SNAPSHOT_FILE`` option sets the name of snapshots. -- The ``-am APPEND_MODE, --append-mode APPEND_MODE`` option sets the appendonly mode. Options are always, everysec, no. -- The ``-af APPEND_FILE, --append-file APPEND_FILE`` option sets the filename for the server append/change file. - -More information can be found in :doc:`Merlin Server <./merlin_server>` - - diff --git a/docs/source/merlin_config.rst deleted file mode 100644 index 599a50413..000000000 --- a/docs/source/merlin_config.rst +++ /dev/null @@ -1,303 +0,0 @@ -Configuration -============= - -This section provides documentation for configuring Merlin's connections with task servers and results backends. - - -Merlin server configuration --------------------------- - -Merlin works best when celery is configured to run with a RabbitMQ_ broker and a redis_ -backend. Merlin uses celery chords, which require a results backend to be configured. The -AMQP (RPC RabbitMQ) backend does not support chords, but the Redis, database, and Memcached -backends, among others, do. - -.. _redis: https://redis.io/ -.. _RabbitMQ: https://www.rabbitmq.com/ - -Merlin's configuration is controlled by an app.yaml file, such as the one below: - -.. literalinclude:: app_config/app_amqp.yaml - -The default location for the app.yaml is in the merlin repo under the -merlin/config directory. This default can be overridden by files in one of two -other locations.
The current working directory is first checked for the -app.yaml file, then the user's ``~/.merlin`` directory is checked. - -``broker/name``: can be ``rabbitmq``, ``redis``, or ``redis+sock``. As their names imply, -``rabbitmq`` will use RabbitMQ_ as a task broker (preferred for multi-user configurations), -``redis`` will use redis_ as a task broker, and ``redis+sock`` will connect to a redis_ task broker -using a socket. - - -Broker: ``rabbitmq``, ``amqps``, ``amqp`` ----------------------------------------- -Merlin constructs the following connection string from the relevant options in the -``broker`` section of the app.yaml file. If the ``port`` argument is not defined, -the default rabbitmq TLS port, 5671, will be used. See the :ref:`broker_rabbitmq_ssl` -section for more info about security with this broker. When the ``broker`` -is ``amqp``, the default port will be 5672. - - -| The prototype url for this configuration is: -| ``{conn}://{username}:{password}@{server}:{port}/{vhost}`` - -Here ``conn`` is ``amqps`` (with ssl) when ``name`` is ``rabbitmq`` or ``amqps`` and -``amqp`` (without ssl) when name is ``amqp``. - -:: - - broker: - name: rabbitmq - #username: # defaults to your username unless changed here - password: ~/.merlin/rabbit-password - # server URL - server: server.domain.com - #vhost: # defaults to your username unless changed here - -Broker: ``redis`` ----------------- -Merlin constructs the following connection string from the relevant options in the `broker` section of the app.yaml file. - -| The prototype url for this configuration is: -| ``redis://:{password}@{server}:{port}/{db_num}`` - -:: - - broker: - name: redis - server: localhost - port: 6379 - -Broker: ``rediss`` ------------------ -Newer versions of Redis (version 6 or greater) can be configured with ssl. The -``rediss`` name is used to enable this support. See the :ref:`broker_redis_ssl` -section for more info. - -| The prototype url for this configuration is: -| ``rediss://:{password}@{server}:{port}/{db_num}`` - -:: - - broker: - name: rediss - server: localhost - port: 6379 - -Broker: ``redis+socket`` ------------------------ -Merlin constructs the following connection string from the relevant options in the -``broker`` section of the app.yaml file. - -| The prototype url for this configuration is: -| ``redis+socket://{path}?virtual_host={db_num}`` - -:: - - broker: - name: redis+socket - path: /tmp/username/redis.sock - db_num: 0 - -Broker: ``url`` --------------- - -A ``url`` option is available to specify the broker connection url; in this -case the server name is ignored. The url must include the entire connection -url except the ssl options when the broker name is recognized -by the ssl processing system. -Currently the ssl system will only configure the Rabbitmq and Redis -servers. - -| The prototype url for this configuration is: -| ``{url}`` - -:: - - broker: - url: redis://localhost:6379/0 - -Broker: Security ---------------- - -.. _broker_rabbitmq_ssl: - -Security with RabbitMQ_ -_______________________ - -Merlin can only be configured to communicate with RabbitMQ_ over an SSL connection and -does not permit use of a RabbitMQ_ server configured_ without SSL. The default value -of the broker_use_ssl keyword is True. The keys can be given in the broker config as -shown below. - - -..
_configured : https://www.rabbitmq.com/ssl.html - -:: - - broker: - # can be redis, redis+sock, or rabbitmq - name: rabbitmq - #username: # defaults to your username unless changed here - password: ~/.merlin/rabbit-password - # server URL - server: server.domain.com - - ### for rabbitmq, redis+sock connections ### - #vhost: # defaults to your username unless changed here - - # ssl security - keyfile: /var/ssl/private/client-key.pem - certfile: /var/ssl/amqp-server-cert.pem - ca_certs: /var/ssl/myca.pem - # This is optional and can be required, optional or none - # (required is the default) - cert_reqs: required - - - -This results in a value for broker_use_ssl given below: - -:: - - broker_use_ssl = { - 'keyfile': '/var/ssl/private/client-key.pem', - 'certfile': '/var/ssl/amqp-server-cert.pem', - 'ca_certs': '/var/ssl/myca.pem', - 'cert_reqs': ssl.CERT_REQUIRED - } - - -.. _broker_redis_ssl: - -Security with redis_ -____________________ - - -The same ssl config and resulting ``broker_use_ssl`` can be used with the ``rediss://`` -url when using a redis server version 6 or greater with ssl_. - -.. _ssl : https://redis.io/topics/encryption - -:: - - broker: - name: rediss - #username: # defaults to your username unless changed here - # server URL - server: server.domain.com - port: 6379 - db_num: 0 - - - # ssl security - keyfile: /var/ssl/private/client-key.pem - certfile: /var/ssl/amqp-server-cert.pem - ca_certs: /var/ssl/myca.pem - # This is optional and can be required, optional or none - # (required is the default) - cert_reqs: required - - -The resulting ``broker_use_ssl`` configuration for a ``rediss`` server is given below. - -:: - - broker_use_ssl = { - 'ssl_keyfile': '/var/ssl/private/client-key.pem', - 'ssl_certfile': '/var/ssl/amqp-server-cert.pem', - 'ssl_ca_certs': '/var/ssl/myca.pem', - 'ssl_cert_reqs': ssl.CERT_REQUIRED - } - - - -Results backend: ``redis`` -------------------------- -Merlin constructs the following connection string from relevant options in the -`results_backend` section of the app.yaml file. - -| The prototype url for this configuration is: -| ``redis://:{password}@{server}:{port}/{db_num}`` - -:: - - results_backend: - name: redis - server: localhost - port: 6379 - -Results backend: ``rediss`` --------------------------- -Newer versions of Redis (version 6 or greater) can be configured with ssl. The -``rediss`` name is used to enable this support. See the :ref:`results_redis_ssl` -section for more info. - -| The prototype url for this configuration is: -| ``rediss://:{password}@{server}:{port}/{db_num}`` - -:: - - results_backend: - name: rediss - server: localhost - port: 6379 - -Results backend: ``url`` ------------------------ - -A ``url`` option is available to specify the results connection url; in this -case the server name is ignored. The url must include the entire connection url -including the ssl configuration. - -| The prototype url for this configuration is: -| ``{url}`` - -:: - - results_backend: - url: redis://localhost:6379/0 - -The ``url`` option can also be used to define a server that is not explicitly -handled by the merlin configuration system. - -:: - - results_backend: - url: db+postgresql://scott:tiger@localhost/mydatabase - -Resolving password fields -_________________________ - -The ``results_backend/password`` is interpreted in the following way. First, it is treated as an absolute path to a file containing your backend -password.
If that path doesn't exist, it then looks for a file of that name under the directory defined under ``celery/certs``. If that file doesn't -exist, it then treats ``results_backend/password`` as the password itself. - -The ``broker/password`` is simply the full path to a file containing your password for the user defined by ``broker/username``. - -Results backend: Security ------------------------- - -.. _results_redis_ssl: - -Security with redis_ ____________________ - -Redis versions less than 6 do not natively support multiple users or SSL. We address security concerns here by redefining the core celery routine that communicates with -redis to encrypt all data it sends to redis and then decrypt anything it receives. Each user should have their own encryption key as defined by -``results_backend/encryption_key`` in the app.yaml file. Merlin will generate a key if that key does not yet exist. - -Redis versions 6 or greater can use the ssl keys as in the broker section. The ssl -config with redis (``rediss``) in the results backend is placed in the -``redis_backend_use_ssl`` celery argument. -The values in this argument are the same as for the broker. - -:: - - redis_backend_use_ssl = { - 'ssl_keyfile': '/var/ssl/private/client-key.pem', - 'ssl_certfile': '/var/ssl/amqp-server-cert.pem', - 'ssl_ca_certs': '/var/ssl/myca.pem', - 'ssl_cert_reqs': ssl.CERT_REQUIRED - } diff --git a/docs/source/merlin_developer.rst deleted file mode 100644 index e88352f3e..000000000 --- a/docs/source/merlin_developer.rst +++ /dev/null @@ -1,175 +0,0 @@ -Contributing -============ - -Welcome to the Merlin developer documentation! This section provides -instructions for contributing to Merlin. - -Getting Started -++++++++++++++++ - - -Follow the :doc:`Getting Started <./getting_started>` documentation to set up -your Merlin development environment. - -Once your development environment is set up, create a branch: - -.. code-block:: bash - - $ git checkout -b feature//description - -.. note:: - - Other common types of branches besides feature are: ``bugfix``, - ``hotfix``, or ``refactor``. - - Select the branch type that best represents the development. - -Merlin follows a `gitflow workflow `_. -Updates to the develop branch are made via pull requests. - - -Developer Guide -+++++++++++++++ - -This section provides Merlin's guide for contributing features/bugfixes to -Merlin. - -Pull Request Checklist -++++++++++++++++++++++ - -.. warning:: All pull requests must pass ``make tests`` prior to consideration! - -To expedite review, please ensure that pull requests - -- Are from a meaningful branch name (e.g. ``feature/my_name/cool_thing``) - -- Are being merged into the `appropriate branch `_ - -- Include testing for any new features - - - unit tests in ``tests/unit`` - - integration tests in ``tests/integration`` - -- Include descriptions of the changes - - - a summary in the pull request - - details in ``CHANGELOG.md`` - -- Have run ``make fix-style`` to adhere to style guidelines - -- Pass ``make tests``; output included in pull request - -- Increment version number `appropriately `_ - - - in ``CHANGELOG.md`` - - in ``merlin.__init__.py`` - -- Have `squashed `_ commits - -Testing -+++++++ - -All pull requests must pass unit and integration tests. To ensure that they do, run: - -.. code-block:: bash - - $ make tests - -Python Code Style Guide -++++++++++++++++++++++++ - -This section documents Merlin's style guide.
Unless otherwise specified, -`PEP-8 `_ -is the preferred coding style and `PEP-0257 `_ -is preferred for docstrings. - -.. note:: ``make fix-style`` will automatically fix any style issues. - -Merlin has style checkers configured. They can be run from the Makefile: - -.. code-block:: bash - - $ make check-style - -Adding New Features to YAML Spec File -+++++++++++++++++++++++++++++++++++++ - -In order to conform to Maestro's verification format introduced in Maestro v1.1.7, -we now use `json schema `_ validation to verify our spec -file. - -If you are adding a new feature to Merlin that requires a new block within the yaml spec -file or a new property within a block, then you are going to need to update the -merlinspec.json file located in the merlin/spec/ directory. You also may want to add -additional verifications within the specification.py file located in the same directory. - -.. note:: - If you add custom verifications beyond the pattern checking that the json schema - checks for, then you should also add tests for this verification in the test_specification.py - file located in the merlin/tests/unit/spec/ directory. Follow the steps for adding new - tests in the docstring of the TestCustomVerification class. - -Adding a New Property -********************* - -To add a new property to a block in the yaml file, you need to create a -template for that property and place it in the correct block in merlinspec.json. For -example, say I wanted to add a new property called ``example`` that's an integer within -the ``description`` block. I would modify the ``description`` block in the merlinspec.json file to look -like this: - -.. code-block:: json - - "DESCRIPTION": { - "type": "object", - "properties": { - "name": {"type": "string", "minLength": 1}, - "description": {"type": "string", "minLength": 1}, - "example": {"type": "integer", "minimum": 1} - }, - "required": ["name", "description"] - } - -If you need help with json schema formatting, check out the `step-by-step getting -started guide `_. - -That's all that's required for adding a new property. If you want to add your own custom -verifications make sure to create unit tests for them (see the note above for more info). - -Adding a New Block -****************** - -Adding a new block is slightly more complicated than adding a new property. You will not -only have to update the merlinspec.json schema file but also add calls to verify that -block within specification.py. - -To add a block to the json schema, you will need to define the template for that entire -block. For example, if I wanted to create a block called ``country`` with two -properties labeled ``name`` and ``population`` that are both required, it would look like so: - -.. code-block:: json - - "COUNTRY": { - "type": "object", - "properties": { - "name": {"type": "string", "minLength": 1}, - "population": { - "anyOf": [ - {"type": "string", "minLength": 1}, - {"type": "integer", "minimum": 1} - ] - } - }, - "required": ["name", "population"] - } - -Here, ``name`` can only be a string but ``population`` can be either a string or an integer. -For help with json schema formatting, check out the `step-by-step getting started guide`_. - -The next step is to enable this block in the schema validation process. To do this we need to: - -#. Create a new method called verify_() within the MerlinSpec class -#. Call the YAMLSpecification.validate_schema() method provided to us via Maestro in your new method -#.
Add a call to verify_() inside the verify() method - -If you add your own custom verifications on top of this, please add unit tests for them. diff --git a/docs/source/merlin_server.rst deleted file mode 100644 index 23f6d4a1d..000000000 --- a/docs/source/merlin_server.rst +++ /dev/null @@ -1,72 +0,0 @@ -Merlin Server -============= -The merlin server command allows users easy access to containerized broker -and results servers for merlin workflows. This allows users to run merlin without -a dedicated external server. - -The main configuration will be stored in the subdirectory called "server/" by -default in the main merlin configuration "~/.merlin". However, different server -images can be created for different use cases or studies simply by creating -a new directory to store local configuration files for merlin server instances. - -Below is an example of how merlin server can be utilized. - -First create and navigate into a directory to store your local merlin -configuration for a specific use case or study. - -.. code-block:: bash - - mkdir study1/ - cd study1/ - -Afterwards you can instantiate merlin server in this directory by running: - -.. code-block:: bash - - merlin server init - -A main server configuration will be created in ~/.merlin/server and a local -configuration will be created in a subdirectory called "merlin_server/". - -We should expect the following files in each directory: - -.. code-block:: bash - - ~/study1$ ls ~/.merlin/server/ - docker.yaml merlin_server.yaml podman.yaml singularity.yaml - - ~/study1$ ls - merlin_server - - ~/study1$ ls merlin_server/ - redis.conf redis_latest.sif - -The main configuration in "~/.merlin/server" deals with defaults and -technical commands that might be used for setting up the merlin server -local configuration and its containers. Each container has its own -configuration file so that users can switch freely between different -containerized services. - -The local configuration "merlin_server" folder contains configuration files -specific to a certain use case or run. In the case above you can see that we have a -redis singularity container called "redis_latest.sif" with the redis configuration -file called "redis.conf". This redis configuration will allow the user to -configure redis to their needs without having to manage or edit -the redis container. When the server is run, this configuration will be dynamically -read, so settings can be changed between runs if needed. - -Once the merlin server has been initialized in the local directory the user will be allowed -to run other merlin server commands such as "run, status, stop" to interact with the -merlin server. A detailed list of commands can be found in the `Merlin Server Commands <./server/commands.html>`_ page. - -Note: Running "merlin server init" again will NOT overwrite any existing configuration -that the users might have set or edited. By running this command again any missing files -will be created for the users with existing defaults. HOWEVER, it is highly advised that -users back up their configuration in case an error occurs where configuration files are overwritten. - -..
toctree:: - :maxdepth: 1 - :caption: Merlin Server Settings: - - server/configuration - server/commands diff --git a/docs/source/merlin_specification.rst b/docs/source/merlin_specification.rst deleted file mode 100644 index 71e041b33..000000000 --- a/docs/source/merlin_specification.rst +++ /dev/null @@ -1,358 +0,0 @@ -Workflow Specification -====================== - -The merlin input file or spec file is separated into several sections. An -annotated version is given below. - -.. note:: The Merlin input file is a yaml file and must adhere to yaml - syntax. The yaml spec relies on the indentation in the file. - -The input file can take a number of variables, beyond the examples shown here. -For a complete list and descriptions of the variables, -see :doc:`./merlin_variables`. - -.. code-block:: yaml - - #################################### - # Description Block (Required) - #################################### - # The description block is where the description of the study is placed. This - # section is meant primarily for documentation purposes so that when a - # specification is passed to other users they can glean a general understanding - # of what this study is meant to achieve. - #------------------------------- - # Required keys: - # name - Name of the study - # description - Description of what this study does. - #------------------------------- - # NOTE: You can add other keys to this block for custom documentation. Merlin - # currently only looks for the required set. - #################################### - description: - description: Run a scan through Merlin - name: MERLIN - - #################################### - # Batch Block (Required) - #################################### - # The batch system to use for each allocation - #------------------------------- - # Required keys: - # type - The scheduler type to use (local|slurm|flux|lsf) - # bank - The allocation bank - # queue - The batch queue - #################################### - batch: - type: flux - bank: testbank - queue: pbatch - flux_path: - flux_start_opts: - flux_exec: - flux_exec_workers: - launch_pre: - launch_args: - worker_launch: - shell: - # e.g. /bin/bash, /bin/tcsh, python, /usr/bin/env perl, etc. - nodes: # The number of nodes to use for all workers - This can be overridden in the workers config. - If this is unset the number of nodes will be - queried from the environment, failing that, the - number of nodes will be set to 1. - walltime: The total walltime of the batch allocation (hh:mm:ss or mm:ss or ss) - - - ##################################### - # Environment Block - #################################### - # The environment block is where items describing the study's environment are - # defined. This includes static information that the study needs to know about - # and dependencies that the workflow requires for execution. - #------------------------------- - # NOTE: This block isn't strictly required as a study may not depend on anything. - ######################################################################## - env: - #------------------------------- - # Variables - #------------------------------- - # Values that the workflow substitutes into steps and are similar in - # concept to Unix environment variables. These variables are not dependent - # on values in the environment and so are more portable. - # - # Note that variables defined here can alter the runtime shell - # variable definitions. - # Do not define a variable named "shell" here. 
- #------------------------------- - variables: - # Set a custom output path for the study workspace. This path is where - # Merlin will place all temporary files, state files, and any output. - # The resulting path is usually a timestamped folder within OUTPUT_PATH - # and in this case would be - # './sample_output/merlin/merlin_sample1_'. - # NOTE: If not specified, - # OUTPUT_PATH is assumed to be the path where Merlin was launched from. - OUTPUT_PATH: ./sample_output/merlin # OUTPUT_PATH is a keyword - # variable that Merlin looks for - # to replace with the study - # directory created for the - # ensemble - - #################################### - # Study Block (Required) - #################################### - # The study block is where the steps in the workflow are defined. This section - # of the specification represents the unexpanded set of tasks that the study - # is composed of. - # - # - # A description of what gets turned into tasks and what type of task - # would be a good addition - # - # study lists the various steps, each of which has these fields - # name: step name - # description: what the step does - # run: - # cmd: the command to run for multilines use cmd: | - # The $(LAUNCHER) macro can be used to substitute a parallel launcher - # based on the batch:type:. - # It will use the nodes and procs values for the task. - # restart: The (optional) restart command to run when $(MERLIN_RESTART) - # is the exit code. The command in cmd will be run if the exit code - # is $(MERLIN_RETRY). - # task_queue: the queue to assign the step to (optional. default: merlin) - # shell: the shell to use for the command (eg /bin/bash /usr/bin/env python) - # (optional. default: /bin/bash) - # depends: a list of steps this step depends upon (ie parents) - # procs: The total number of MPI tasks - # nodes: The total number of MPI nodes - # walltime: The total walltime of the run (hh:mm:ss, mm:ss or ss) (not available in lsf) - # cores per task: The number of hardware threads per MPI task - # gpus per task: The number of GPUs per MPI task - # SLURM specific run flags: - # slurm: Verbatim flags only for the srun parallel launch (srun -n -n ) - # FLUX specific run flags: - # flux: Verbatim flags for the flux parallel launch (flux mini run ) - # LSF specific run flags: - # bind: Flag for MPI binding of tasks on a node - # num resource set: Number of resource sets - # launch_distribution : The distribution of resources (default: plane:{procs/nodes}) - # exit_on_error: Flag to exit on error (default: 1) - # lsf: Verbatim flags only for the lsf parallel launch (jsrun ... - ####################################################################### - study: - - name: runs1 - description: Run on alloc1 - run: - cmd: $(LAUNCHER) echo "$(VAR1) $(VAR2)" > simrun.out - nodes: 1 - procs: 1 - task_queue: queue1 - shell: /bin/bash - - - name: post-process - description: Post-Process runs on alloc1 - run: - cmd: | - cd $(runs1.workspace)/$(MERLIN_SAMPLE_PATH) - - # exit $(MERLIN_RESTART) # syntax to send a restart error code - # This will rerun the cmd command. Users can also use $(MERLIN_RETRY). - nodes: 1 - procs: 1 - depends: [runs1] - task_queue: queue1 - - - name: runs2 - description: Run on alloc2 - run: - cmd: | - touch learnrun.out - $(LAUNCHER) echo "$(VAR1) $(VAR2)" >> learnrun.out - exit $(MERLIN_RESTART) # syntax to send a restart error code - # exit $(MERLIN_RETRY) # syntax to send a retry error code to - # run the cmd command again instead of the restart command. 
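- # The restart block below runs in place of cmd when the $(MERLIN_RESTART) exit code is used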
- restart: | - # Command to run if the $(MERLIN_RESTART) exit code is used - touch learnrunrs.out - $(LAUNCHER) echo "$(VAR1) $(VAR2)" >> learnrunrs.out - exit $(MERLIN_SUCCESS) # syntax to send a success code - nodes: 1 - procs: 1 - task_queue: lqueue - max_retries: 3 # maximum number of retries - retry_delay: 10 # delay retry for N seconds (default 1) - batch: - type: - - - name: monitor - description: Monitor on alloc1 - run: - cmd: date > monitor.out - nodes: 1 - procs: 1 - task_queue: mqueue - - #################################### - # Parameter Block (Required) - #################################### - # The parameter block contains all the things we'd like to vary in the study. - # Currently, there are two modes of operating in the specification: - # 1. If a parameter block is specified, the study is expanded and considered a - # parameterized study. - # 2. If a parameter block is not specified, the study is treated as linear and - # the resulting study is not expanded. - # - # There are three keys per parameter: - # 1. A list of values that the parameter takes. - # 2. A label that represents a "pretty printed" version of the parameter. The - # parameter values is specified by the '%%' moniker (for example, for SIZE -- - # when SIZE is equal to 10, the label will be 'SIZE.10'). To access the label - # for SIZE, for example, the token '$(SIZE.label)' is used. - # Labels can take one of two forms: A single string with the '%%' marker or - # a list of per value labels (must be the same length as the list of values). - # - # NOTE: A specified parameter does not necessarily have to be used in every step - # or at all. If a parameter is specified and not used, it simply will not be - # factored into expansion or the naming of expanded steps or their workspaces. - # NOTE: You can also specify custom generation of parameters using a Python - # file containing the definition of a function as follows: - # - # 'def get_custom_generator():' - # - # The 'get_custom_generator' function is required to return a ParameterGenerator - # instance populated with custom filled values. In order to use the file, simply - # call Merlin using 'merlin run '. - ######################################################################## - global.parameters: - STUDY: - label: STUDY.%% - values: [MERLIN1, MERLIN2] - SIZE: - values : [10, 20] - label : SIZE.%% - ITERATIONS: - values : [10, 20] - label : ITER.%% - - #################################### - # Merlin Block (Required) - #################################### - # The merlin specific block will add any configuration to - # the DAG created by the study description. - # including task server config, data management and sample definitions. - # - # merlin will replace all SPECROOT instances with the directory where - # the input yaml was run. - ####################################################################### - merlin: - - #################################### - # Resource definitions - # - # Define the task server configuration and workers to run the tasks. - # - #################################### - resources: - task_server: celery - - # Flag to determine if multiple workers can pull tasks - # from overlapping queues. (default = False) - overlap: False - - # Customize workers. Workers can have any user-defined name (e.g., simworkers, learnworkers). - workers: - simworkers: - args: - steps: [runs1, post-process, monitor] # [all] when steps is omitted - nodes: - # A list of machines to run the given steps can be specified - # in the machines keyword. 
- # A full OUTPUT_PATH and the steps argument are required - # when using this option. Currently all machines in the - # list must have access to the OUTPUT_PATH. - machines: [host1, host2] - - learnworkers: - args: - steps: [runs2] - nodes: - # An optional batch section in the worker can override the - # main batch config. This is useful if other workers are running - # flux, but some component of the workflow requires the native - # scheduler or cannot run under flux. Another possibility is to - # have the default type as local and workers needed for flux or - # slurm steps. - batch: - type: local - machines: [host3] - - ################################################### - # Sample definitions - # - # samples file can be one of - # .npy (numpy binary) - # .csv (comma delimited: '#' = comment line) - # .tab (tab/space delimited: '#' = comment line) - ################################################### - samples: - column_labels: [VAR1, VAR2] - file: $(SPECROOT)/samples.npy - generate: - cmd: | - python $(SPECROOT)/make_samples.py -dims 2 -n 10 -outfile=$(INPUT_PATH)/samples.npy "[(1.3, 1.3, 'linear'), (3.3, 3.3, 'linear')]" - level_max_dirs: 25 - - #################################### - # User Block (Optional) - #################################### - # The user block allows other variables in the workflow file to be propagated - # through to the workflow (including in variables .partial.yaml and .expanded.yaml). - # User block uses yaml anchors, which defines a chunk of configuration and use - # their alias to refer to that specific chunk of configuration elsewhere. - ####################################################################### - user: - study: - run: - hello: &hello_run - cmd: | - python3 $(HELLO) -outfile hello_world_output_$(MERLIN_SAMPLE_ID).json $(X0) $(X1) $(X2) - max_retries: 1 - collect: &collect_run - cmd: | - echo $(MERLIN_GLOB_PATH) - echo $(hello.workspace) - ls $(hello.workspace)/X2.$(X2)/$(MERLIN_GLOB_PATH)/hello_world_output_*.json > files_to_collect.txt - spellbook collect -outfile results.json -instring "$(cat files_to_collect.txt)" - translate: &translate_run - cmd: spellbook translate -input $(collect.workspace)/results.json -output results.npz -schema $(FEATURES) - learn: &learn_run - cmd: spellbook learn -infile $(translate.workspace)/results.npz - make_samples: &make_samples_run - cmd: spellbook make-samples -n $(N_NEW) -sample_type grid -outfile grid_$(N_NEW).npy - predict: &predict_run - cmd: spellbook predict -infile $(make_new_samples.workspace)/grid_$(N_NEW).npy -outfile prediction_$(N_NEW).npy -reg $(learn.workspace)/random_forest_reg.pkl - verify: &verify_run - cmd: | - if [[ -f $(learn.workspace)/random_forest_reg.pkl && -f $(predict.workspace)/prediction_$(N_NEW).npy ]] - then - touch FINISHED - exit $(MERLIN_SUCCESS) - else - exit $(MERLIN_SOFT_FAIL) - fi - python3: - run: &python3_run - cmd: | - print("OMG is this in python?") - print("Variable X2 is $(X2)") - shell: /usr/bin/env python3 - python2: - run: &python2_run - cmd: | - print "OMG is this in python2? Change is bad." - print "Variable X2 is $(X2)" - shell: /usr/bin/env python2 diff --git a/docs/source/merlin_variables.rst b/docs/source/merlin_variables.rst deleted file mode 100644 index f8ea7fce7..000000000 --- a/docs/source/merlin_variables.rst +++ /dev/null @@ -1,397 +0,0 @@ -Variables -========= - -There are a number of variables which can be placed in a merlin input .yaml -file that can control workflow execution, such as via string expansion and -control flow. - -.. 
note:: Only user variables and ``OUTPUT_PATH`` may be reassigned or overridden from the command line. - -Directory structure context --------------------------- -The directory structure of merlin output looks like this: - -.. code:: - - SPECROOT - - - ... - - OUTPUT_PATH - MERLIN_WORKSPACE - MERLIN_INFO - .orig.yaml - .partial.yaml - .expanded.yaml - .workspace - WORKSPACE - - -Reserved variables ------------------ -.. list-table:: Study variables that Merlin uses. May be referenced within a specification file, but not reassigned or overridden. - :widths: 25 50 25 - :header-rows: 1 - - * - Variable - - Description - - Example Expansion - - * - ``$(SPECROOT)`` - - Directory path of the specification file. - - - :: - - /globalfs/user/merlin_workflows - - * - ``$(OUTPUT_PATH)`` - - Directory path the study output will be written to. If not defined, - it will default to the current working directory. May be reassigned or - overridden. - - - :: - - ./studies - - * - ``$(MERLIN_TIMESTAMP)`` - - The time a study began. May be used as a unique identifier. - - - :: - - "YYYYMMDD-HHMMSS" - - * - ``$(MERLIN_WORKSPACE)`` - - Output directory generated by a study at ``OUTPUT_PATH``. Ends with - ``MERLIN_TIMESTAMP``. - - - :: - - $(OUTPUT_PATH)/ensemble_name_$(MERLIN_TIMESTAMP) - - * - ``$(WORKSPACE)`` - - The workspace directory for a single step. - - - :: - - $(OUTPUT_PATH)/ensemble_name_$(MERLIN_TIMESTAMP)/step_name/ - - * - ``$(MERLIN_INFO)`` - - Directory within ``MERLIN_WORKSPACE`` that holds the provenance specs and sample generation results. - Commonly used to hold ``samples.npy``. - - - :: - - $(MERLIN_WORKSPACE)/merlin_info/ - - * - ``$(MERLIN_SAMPLE_ID)`` - - Sample index in an ensemble - - - :: - - 0 1 2 3 - - * - ``$(MERLIN_SAMPLE_PATH)`` - - Path in the sample directory tree to a sample's directory, i.e. where the - task is actually run. - - - :: - - /0/0/0/ /0/0/1/ /0/0/2/ /0/0/3/ - - * - ``$(MERLIN_GLOB_PATH)`` - - All of the directories in a simulation tree as a glob (*) string - - - :: - - /*/*/*/* - - * - ``$(MERLIN_PATHS_ALL)`` - - A space delimited string of all of the paths; - can be used as is in a bash for loop, for instance with: - - .. code-block:: bash - - for path in $(MERLIN_PATHS_ALL) - do - ls $path - done - - - - :: - - 0/0/0 - 0/0/1 - 0/0/2 - 0/0/3 - - * - ``$(MERLIN_SAMPLE_VECTOR)`` - - Vector of merlin sample values - - - :: - - $(SAMPLE_COLUMN_1) $(SAMPLE_COLUMN_2) ... - - * - ``$(MERLIN_SAMPLE_NAMES)`` - - Names of merlin sample values - - - :: - - SAMPLE_COLUMN_1 SAMPLE_COLUMN_2 ... - - * - ``$(MERLIN_SPEC_ORIGINAL_TEMPLATE)`` - - Copy of original yaml file passed to ``merlin run``. - - - :: - - $(MERLIN_INFO)/*.orig.yaml - - * - ``$(MERLIN_SPEC_EXECUTED_RUN)`` - - Parsed and processed yaml file with command-line variable substitutions included. - - - :: - - $(MERLIN_INFO)/*.partial.yaml - - * - ``$(MERLIN_SPEC_ARCHIVED_COPY)`` - - Archive version of ``MERLIN_SPEC_EXECUTED_RUN`` with all variables and paths fully resolved. - - - :: - - $(MERLIN_INFO)/*.expanded.yaml - - -The ``LAUNCHER`` and ``VLAUNCHER`` Variables -+++++++++++++++++++++++++++++++++++++++++++++++ - -``$(LAUNCHER)`` is a special case of a reserved variable since its value *can* be changed. -It serves as an abstraction to launch a job with parallel schedulers like :ref:`slurm`, -:ref:`lsf`, and :ref:`flux` and it can be used within a step command. For example, -say we start with this run cmd inside our step: - -..
code:: yaml - - run: - cmd: srun -N 1 -n 3 python script.py - -We can modify this to use the ``$(LAUNCHER)`` variable like so: - -.. code:: yaml - - batch: - type: slurm - - run: - cmd: $(LAUNCHER) python script.py - nodes: 1 - procs: 3 - -In other words, the ``$(LAUNCHER)`` variable would become ``srun -N 1 -n 3``. - -The ``$(VLAUNCHER)`` variable behaves similarly to the ``$(LAUNCHER)`` variable. -The key distinction lies in its source of information. Instead of drawing certain configuration -options from the ``run`` section of a step, it retrieves specific shell variables. These shell -variables are automatically generated by Merlin when you include the ``$(VLAUNCHER)`` variable -in a step command, but they can also be customized by the user. Currently, the following shell -variables are supported: - -.. list-table:: VLAUNCHER Variables - :widths: 25 50 25 - :header-rows: 1 - - * - Variable - - Description - - Default - - * - ``${MERLIN_NODES}`` - - The number of nodes - - 1 - - * - ``${MERLIN_PROCS}`` - - The number of tasks/procs - - 1 - - * - ``${MERLIN_CORES}`` - - The number of cores per task/proc - - 1 - - * - ``${MERLIN_GPUS}`` - - The number of GPUs per task/proc - - 0 - -Let's say we have the following defined in our yaml file: - -.. code:: yaml - - batch: - type: flux - - run: - cmd: | - MERLIN_NODES=4 - MERLIN_PROCS=2 - MERLIN_CORES=8 - MERLIN_GPUS=2 - $(VLAUNCHER) python script.py - -The ``$(VLAUNCHER)`` variable would be substituted with ``flux run -N 4 -n 2 -c 8 -g 2``. - -User variables ------------------- -Variables defined by a specification file in the ``env`` section, as in this example: - -.. code-block:: yaml - - env: - variables: - ID: 42 - EXAMPLE_VAR: hello - -As long as they're defined in order, you can nest user variables like this: - -.. code-block:: yaml - - env: - variables: - EXAMPLE_VAR: hello - WORKER_NAME: $(EXAMPLE_VAR)_worker - -Like all other Merlin variables, user variables may be used anywhere (as a yaml key or value) within a specification as below: - -.. code-block:: yaml - - cmd: echo "$(EXAMPLE_VAR), world!" - ... - $(WORKER_NAME): - args: ... - -If you want to programmatically define the study name, you can include variables -in the ``description.name`` field as long as it makes a valid filename: - -.. code-block:: yaml - - description: - name: my_$(EXAMPLE_VAR)_study_$(ID) - description: example of programmatic study name - -The above would produce a study called ``my_hello_study_42``. - -Environment variables --------------------- -Merlin expands Unix environment variables for you. The values of the user variables below would be expanded: - -.. code-block:: yaml - - env: - variables: - MY_HOME: ~/ - MY_PATH: $PATH - USERNAME: ${USER} - -However, Merlin leaves environment variables found in shell scripts (think ``cmd`` and ``restart``) alone. -So this step: - -.. code-block:: yaml - - - name: step1 - description: an example - run: - cmd: echo $PATH ; echo $(MY_PATH) - -...would be expanded as: - -.. code-block:: yaml - - - name: step1 - description: an example - run: - cmd: echo $PATH ; echo /an/example/:/path/string/ - -Step return variables ----------------------------------- -.. list-table:: Special return code variables for task steps. - :widths: 25 50 25 - :header-rows: 1 - - * - Variable - - Description - - Example Usage - * - ``$(MERLIN_SUCCESS)`` - - This step was successful. Keep going to the next task. Default step - behavior if no exit code given. - - - :: - - echo "hello, world!"
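- # exiting with $(MERLIN_SUCCESS) marks this step complete so the workflow moves on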
- exit $(MERLIN_SUCCESS) - - * - ``$(MERLIN_RESTART)`` - - Run this step's ``restart`` command, or re-run ``cmd`` if ``restart`` - is absent. The default maximum number of retries+restarts for any given step - is 30. You can override this by adding a ``max_retries`` field under the run - field in the specification. Issues a warning. By default, the retry happens - after a 1 second delay; to override the delay time, specify ``retry_delay``. - - - :: - - run: - cmd: | - touch my_file.txt - echo "hi mom!" >> my_file.txt - exit $(MERLIN_RESTART) - restart: | - echo "bye, mom!" >> my_file.txt - max_retries: 23 - retry_delay: 10 - - * - ``$(MERLIN_RETRY)`` - - Retry this step's ``cmd`` command. The default maximum number of retries for any given step - is 30. You can override this by adding a ``max_retries`` field under the run - field in the specification. Issues a warning. By default, the retry happens after a - 1 second delay; to override the delay time, specify ``retry_delay``. - - :: - - run: - cmd: | - touch my_file.txt - echo "hi mom!" >> my_file.txt - exit $(MERLIN_RETRY) - max_retries: 23 - retry_delay: 10 - - * - ``$(MERLIN_SOFT_FAIL)`` - - Mark this step as a failure and note it in the warning log, but keep going. - Unknown return codes get translated to soft fails, so that they can - be logged. - - - :: - - echo "Uh-oh, this sample didn't work" - exit $(MERLIN_SOFT_FAIL) - - * - ``$(MERLIN_HARD_FAIL)`` - - Something went terribly wrong and I need to stop the whole workflow. - Raises a ``HardFailException`` and stops all workers connected to that - step. Workers will stop after a 60 second delay to allow the step to - be acknowledged by the server. - - .. note:: - Workers in isolated parts of the - workflow not consuming from the bad step will continue. You can stop - all workers with ``$(MERLIN_STOP_WORKERS)``. - - - - :: - - echo "Oh no, we've created skynet! Abort!" - exit $(MERLIN_HARD_FAIL) - - * - ``$(MERLIN_STOP_WORKERS)`` - - Launch a task to stop all active workers. This happens after a 60 second - delay, to allow the current task to finish and acknowledge the results - to the server. - - - :: - - # send a signal to all workers to stop - exit $(MERLIN_STOP_WORKERS) diff --git a/docs/source/merlin_workflows.rst b/docs/source/merlin_workflows.rst deleted file mode 100644 index 42cb7a39a..000000000 --- a/docs/source/merlin_workflows.rst +++ /dev/null @@ -1,49 +0,0 @@ -Workflows -========= - -The Merlin package provides a few example workflows. These may be useful in -seeing how the software works, and in designing your own workflow. This section -provides documentation on running these Merlin workflow examples. - -Overview --------- - -List the built-in Merlin workflow examples with ``merlin example list``. - -The Merlin team is working on adding a more diverse array of example workflows -like these. - -In particular, look at the ``.yaml`` files within these directories. These -are known as Merlin specifications, and are foundational to defining a workflow. - - -Get started with the demo ensemble ----------------------------------- - -Merlin provides a demo workflow that highlights some features of the software. - -.. tip:: - - Have at least two terminals open; one to monitor workers, and the other to - provide them tasks. - -Create your workflow example: - -.. code:: bash - - $ merlin example feature_demo - -To run the distributed version of ``feature_demo``, run the following: - -.. code:: bash - - $ merlin run feature_demo/feature_demo.yaml - -This will queue the tasks to the configured broker. 
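Before launching any workers, you can optionally confirm that the tasks reached the broker. A quick sketch (the exact queue names and counts reported will depend on your spec and configuration) uses the ``merlin status`` command:

.. code-block:: bash

    # query the broker for the queues and task counts belonging to this spec
    $ merlin status feature_demo/feature_demo.yaml
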
To process the queued -tasks, use the ``run-workers`` Merlin CLI command. Adding this command -to a parallel batch submission script will launch the workers in parallel. - -.. code:: bash - - $ merlin run-workers feature_demo/feature_demo.yaml - diff --git a/docs/source/modules/advanced_topics/advanced_requirements.txt b/docs/source/modules/advanced_topics/advanced_requirements.txt deleted file mode 100644 index b6894d754..000000000 --- a/docs/source/modules/advanced_topics/advanced_requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -merlin -pandas -fakers diff --git a/docs/source/modules/advanced_topics/advanced_topics.rst b/docs/source/modules/advanced_topics/advanced_topics.rst deleted file mode 100644 index 65f3c41bf..000000000 --- a/docs/source/modules/advanced_topics/advanced_topics.rst +++ /dev/null @@ -1,415 +0,0 @@ -Advanced Topics -=============== -.. admonition:: Prerequisites - - * :doc:`Module 2: Installation<../installation/installation>` - * :doc:`Module 3: Hello World<../hello_world/hello_world>` - * :doc:`Module 4: Running a Real Simulation<../run_simulation/run_simulation>` - * Python virtual environment containing the following packages - - * merlin - * pandas - * faker - * maestrowf - -.. admonition:: Estimated time - - * 15 minutes - -.. admonition:: You will learn - - * Run workflows using HPC batch schedulers - * Distribute workflows across multiple batch allocations and machines - * Set up iterative workflow specs suited for optimization and dynamic sampling applications - -.. contents:: Table of Contents: - :local: - -Setup -+++++ - -The code for the following examples can be obtained from the command line by invoking: - -.. code-block:: bash - - merlin example hpc_demo - -This will copy the three merlin workflow specifications from this section and the supporting -python scripts. Each specification may need some modification to adapt it to the batch -scheduler you will be using. In addition, the dynamic sampling workflow will need a -further modification to set the path of the virtual environment, which is set as a variable -in the ``env`` block. - -Interfacing with HPC systems -++++++++++++++++++++++++++++ - -Another block is added to the merlin workflow specification when running on HPC systems: -the ``batch`` block. This block contains information about the batch scheduler system such -as batch type, batch queue to use, and banks to charge to. There are additional optional -arguments for addressing any special configurations or launch command arguments, varying -based on batch type. In addition, the shell type used by each step's ``cmd`` scripts can -be specified here. The number of nodes in a batch allocation can be defined here, but it -will be overridden in the worker config. - -.. code-block:: yaml - - batch: - # Required keys: - - type: flux - bank: testbank - queue: pbatch - - # Optional keys: - flux_path: - flux_start_opts: - flux_exec_workers: - - launch_pre: - launch_args: - worker_launch: - - shell: - # e.g. /bin/bash, /bin/tcsh, python, /usr/bin/env perl, etc. - nodes: # The number of nodes to use for all workers - This can be overridden in the workers config. - If this is unset the number of nodes will be - queried from the environment, failing that, the - number of nodes will be set to 1. - - -.. NOTE FOR CODE MEETING: what is this data management comment in the merlin block header - in the commented specification file? - -.. 
NOTE FOR CODE MEETING: why not use task queue name in worker resources blocks instead - of step names -> nominally seem to be different terms for the intended functionality - -.. NOTE FOR ME: test out monitor step type from example spec -> anything interesting - to do here? can this be started first on the command line to enable an actual monitor - process? - -Inside the study step specifications are a few additional keys that become more useful -on HPC systems: ``nodes``, ``procs``, and ``task_queue``. Adding the actual study steps to the -above batch block specifies the resources each step's processes will take. - -.. code-block:: yaml - - study: - - name: sim-runs - description: Run simulations - run: - cmd: $(LAUNCHER) echo "$(VAR1) $(VAR2)" > simrun.out - nodes: 4 - procs: 144 - task_queue: sim_queue - - - name: post-process - description: Post-Process simulations on second allocation - run: - cmd: | - cd $(runs1.workspace)/$(MERLIN_SAMPLE_PATH) - $(LAUNCHER) - nodes: 1 - procs: 36 - depends: [sim-runs] - task_queue: post_proc_queue - -.. NOTE FOR ME TO TRY: run various post proc scripts, both with concurrent futures - and mpi4py executors to demo the different calls -> $(LAUNCHER) likely not appropriate here - -In addition to the ``batch`` block is the ``resources`` section inside the ``merlin`` block. -This can be used to put together custom celery workers. Here you can override batch -types and node counts on a per-worker basis to accommodate steps with different -resource requirements. In addition, this is where the ``task_queue`` becomes useful, as -it groups the different allocation types, which can be assigned to each worker here -by specifying step names. - -.. Q: why not specify queue instead of step names here? -.. A: idk, that's how it's always been. Maybe as a way of hiding queues under the abstraction of steps? - -.. code-block:: yaml - - merlin: - - resources: - task_server: celery - - # Flag to determine if multiple workers can pull tasks - # from overlapping queues. (default = False) - overlap: False - - # Customize workers. Workers can have any user-defined name - # (e.g., simworkers, learnworkers, ...) - workers: - simworkers: - args: # - steps: [sim-runs] # [all] if none specified - nodes: 4 # optional - machines: [host1] # - -Arguments to celery itself can also be defined here with the ``args`` key. Of particular -interest will be: - -=========================== =============== -``--concurrency`` - -``--prefetch-multiplier`` - -``-O fair`` -=========================== =============== - -Concurrency can be used to run multiple workers in an allocation; thus it is recommended to -set it to the number of simulations or step work items that fit into the nodes of the -batch allocation in which these workers are spawned. Note that some schedulers, such as -``flux``, can support more jobs than the node has resources for. This may not impact the -throughput, but it can prevent over-subscription errors that might otherwise stop the workflow. - -The prefetch multiplier is more related to packing tasks into the time of the allocation. -For long-running tasks it is recommended to set this to 1. For short-running tasks, this -can reduce overhead from talking to the rabbit servers by requesting `` x `` -tasks at a time from the server. - -The ``-O fair`` option enables workers running tasks from different queues to run on the same -allocation. - -.. NOTE: while there is a warning/info message about this, this workflow seems to work - just fine without this option. 
Is this only needed for simultaneous running, as opposed - to dependent steps in different queues? - -The example block below extends the previous with workers configured for long-running -simulation jobs as well as shorter-running post-processing tasks that can cohabit an allocation: - - -.. code-block:: yaml - - merlin: - - resources: - task_server: celery - - overlap: False - - # Customize workers - workers: - simworkers: - args: --concurrency 1 - steps: [sim-runs] - nodes: 4 - machines: [host1] - - postworkers: - args: --concurrency 4 --prefetch-multiplier 2 - steps: [post-proc-runs] - nodes: 1 - machines: [host1] - - -.. - NOTE FOR CODE MEETING/ME TO TRY: nodes, either in batch or workers, behaves differently from - maestro, meaning it's meant to be nodes per step instantiation, not batch allocation size.. - - NOTE FOR CODE MEETING: clarify what overlap key does if turned on. Just multiple named workers - pulling from same queues? is this a requirement for making it work cross machine? - Also: what about procs per worker instead of just nodes? - -Putting it all together with the parameter blocks, we have an HPC-batch-enabled study specification. -In this demo workflow, ``sample_names`` generates many single-core jobs, with concurrency -set to 36 for this particular machine that has 36 cores per node. The ``collect`` step, on the -other hand, consists of a single job that uses all cores on the node, and is assigned to a queue -that has a concurrency of 1. - -.. code-block:: yaml - - description: - name: hpc_demo - description: Demo running a workflow on HPC machines - - env: - variables: - OUTPUT_PATH: ./name_studies - - # Collect individual sample files into one for further processing - COLLECT: $(SPECROOT)/sample_collector.py - - # Process single iteration's results - POST_PROC: $(SPECROOT)/sample_processor.py - - # Process all iterations - CUM_POST_PROC: $(SPECROOT)/cumulative_sample_processor.py - - # Number of threads for post proc scripts - POST_NPROCS: 36 - PYTHON: - - batch: - type: flux - bank: testbank - queue: pdebug - shell: /bin/bash - nodes: 1 - - ######################################## - # Study definition - ######################################## - study: - - name: sample_names - description: Record samples from the random name generator - run: - cmd: | - $(LAUNCHER) echo "$(NAME)" - $(LAUNCHER) echo "$(NAME)" > name_sample.out - nodes: 1 - procs: 1 - task_queue: name_queue - - - name: collect - description: Collect all samples generated - run: - cmd: | - echo $(MERLIN_GLOB_PATH) - echo $(sample_names.workspace) - - ls $(sample_names.workspace)/$(MERLIN_GLOB_PATH)/name_sample.out | xargs $(PYTHON) $(COLLECT) -out collected_samples.txt --np $(POST_NPROCS) - - nodes: 1 - procs: 1 - depends: [sample_names_*] - task_queue: post_proc_queue - - - name: post-process - description: Post-Process collection of samples, counting occurrences of unique names - run: - cmd: | - $(PYTHON) $(POST_PROC) $(collect.workspace)/collected_samples.txt --results iter_$(ITER)_results.json - - nodes: 1 - procs: 1 - depends: [collect] - task_queue: post_proc_queue - - ######################################## - # Worker and sample configuration - ######################################## - merlin: - - resources: - task_server: celery - - overlap: False - - workers: - nameworkers: - args: --concurrency 36 --prefetch-multiplier 3 - steps: [sample_names] - nodes: 1 - machines: [borax, quartz] - - postworkers: - args: --concurrency 1 --prefetch-multiplier 1 - steps: [post-process] - nodes: 1 - 
machines: [borax, quartz] - - ################################################### - samples: - column_labels: [NAME] - file: $(MERLIN_INFO)/samples.csv - generate: - cmd: | - $(PYTHON) $(SPECROOT)/faker_sample.py -n 200 -outfile=$(MERLIN_INFO)/samples.csv - - -.. - NOTES: encode virtual envs in the spec/workflow: only the first call to merlin run will - get the host venv, subsequent ones - - RECURSIVE WORKFLOWS: if exit condition isn't working, terminating workers can be difficult - - have another shell open at least to purge the queues and stop the workers - -Multi-machine workflows -+++++++++++++++++++++++ - -Spreading this workflow across multiple machines is a simple modification of the above workflow: -add additional host names to the ``machines`` list in the worker config. A caveat for this -feature is that all host systems will need to have access to the same workspace/filesystem. -The following resource block demonstrates -usage of one host for larger simulation steps, and a second host for the smaller post-processing -steps. In this case you need an allocation on each host and can execute ``run-workers`` -on each, with ``run`` needed only once up front to send the tasks to the queue server. - -.. TODO: do all host systems need to use the same scheduler type? - -.. code-block:: yaml - - ######################################## - # Worker and sample configuration - ######################################## - merlin: - - resources: - task_server: celery - - overlap: False - - # Customize workers - workers: - simworkers: - args: --concurrency 1 - steps: [sim-runs] - nodes: 4 - machines: [host1] - - postworkers: - args: --concurrency 4 --prefetch-multiplier 2 - steps: [post-proc-runs] - nodes: 1 - machines: [host2] - - -Dynamic task queuing and sampling -++++++++++++++++++++++++++++++++++ - -Iterative workflows, such as optimization or machine learning, can be implemented -in merlin via recursive workflow specifications that use dynamic task queuing. -The example spec below is a simple implementation of this using an iteration counter, -``$(ITER)``, and a predetermined limit, ``$(MAX_ITER)``, to limit the number of times -to generate new samples and spawn a new instantiation of the workflow. The iteration -counter takes advantage of the ability to override workflow variables on the command line. - -.. literalinclude:: ./faker_demo.yaml - :language: yaml - -.. TODO: add venv building to the spec? -> can this be dependency - add file system cleanup after each iteration - parallelize the iteration accumulation in cumulative_sample_proc - - -This workflow specification is intended to be invoked within an allocation of nodes on your -HPC cluster, e.g. within an ``sxterm``. The last step queues up new samples for the next iteration -with ``merlin run faker_demo.yaml ...``; it doesn't need to also call ``run-workers``, since -the workers from the first instantiation are still alive. Thus the new samples will -immediately start processing on the existing allocation. - -Another change in this workflow relative to the single-stage version is managing the workspaces -and outputs. The strategy used here is to create a new directory for collecting each iteration's -final outputs, ``$(ITER_OUTPUT)``, facilitating collective post-processing at the end without -having to worry about traversing into each iteration's local workspaces. 
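As a concrete sketch of the iteration counter in action (assuming the spec file lives in the current directory): the first invocation picks up the default ``ITER: 1`` from the ``env`` block, and the workflow's final step queues the next iteration by overriding ``ITER`` on the command line, exactly as you could do by hand:

.. code-block:: bash

    # iteration 1: queue the tasks and launch persistent workers once
    merlin run faker_demo.yaml
    merlin run-workers faker_demo.yaml

    # subsequent iterations are queued by the last step itself, equivalent to:
    merlin run faker_demo.yaml --vars ITER=2
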
- -The workflow itself isn't doing anything practical; it's simply repeatedly sampling from -a fake name generator in an attempt to count the number of unique names that are possible. -The figure below shows results from running 20 iterations, with the number of unique names -faker can generate appearing to be slightly more than 300. - -.. image:: ./cumulative_results.png - -.. Bootstrapping distributed workflows - +++++++++++++++++++++++++++++++++++ - There is an alternative to the manual in-allocation workflow instantiation used in the previous - examples: encode the run-workers calls into batch scripts and submit those, or use a tool such - as Maestro to write those batch scripts and manage the allocations and worker startup. This can - be particularly useful for large studies that can't fit into single allocations, or even to split them - up into smaller allocations to get through the batch queues more quickly. Here's an example of - using maestro to spin up a multi-allocation instantiation of the dynamic demo: - .. literalinclude:: ./advanced_topics/maestro_distributed.yaml - :language: yaml diff --git a/docs/source/modules/advanced_topics/faker_demo.yaml b/docs/source/modules/advanced_topics/faker_demo.yaml deleted file mode 100644 index 251f909e4..000000000 --- a/docs/source/modules/advanced_topics/faker_demo.yaml +++ /dev/null @@ -1,116 +0,0 @@ - description: - name: dynamic_sampling_demo - description: Demo dynamic sampling workflow - - env: - variables: - OUTPUT_PATH: ./name_studies - ITER_OUTPUT: $(SPECROOT)/$(OUTPUT_PATH)/iter_outputs # Iteration and cumulative results - COLLECT: $(SPECROOT)/sample_collector.py - POST_PROC: $(SPECROOT)/sample_processor.py # Process single iteration's results - CUM_POST_PROC: $(SPECROOT)/cumulative_sample_processor.py # Process all iterations - POST_NPROCS: 36 # Number of threads for post proc scripts - PYTHON: /usr/WS2/white242/merlin_dev_2/venv_merlin_py3_7/bin/python - ITER: 1 - MAX_ITER: 10 - - batch: - type: flux - bank: testbank - queue: pdebug - shell: /bin/bash - nodes: 1 - - ######################################## - # Study definition - ######################################## - study: - - name: sample_names - description: Record samples from the random name generator - run: - cmd: | - $(LAUNCHER) echo "$(NAME)" - $(LAUNCHER) echo "$(NAME)" > name_sample.out - nodes: 1 - procs: 1 - task_queue: name_queue - - - name: collect - description: Collect all samples generated - run: - cmd: | - echo $(MERLIN_GLOB_PATH) - echo $(sample_names.workspace) - - ls $(sample_names.workspace)/$(MERLIN_GLOB_PATH)/name_sample.out | xargs $(PYTHON) $(COLLECT) -out collected_samples.txt --np $(POST_NPROCS) - - nodes: 1 - procs: 1 - depends: [sample_names_*] - task_queue: post_proc_queue - - - name: post-process - description: Post-Process collection of samples, counting occurrences of unique names - run: - cmd: | - $(PYTHON) $(POST_PROC) $(collect.workspace)/collected_samples.txt --results $(ITER_OUTPUT)/iter_$(ITER)_results.json - - nodes: 1 - procs: 1 - depends: [collect] - task_queue: post_proc_queue - - - name: run-more-samples - description: Generate new set of samples and rerun, or generate some descriptive plots/statistics - run: - cmd: | - if [ $(ITER) -ge $(MAX_ITER) ] ; then - echo "done" - $(PYTHON) $(CUM_POST_PROC) $(ITER_OUTPUT)/iter_*_results.json --np $(POST_NPROCS) --hardcopy $(ITER_OUTPUT)/cumulative_results.png - else - next_iter=$(ITER) - ((next_iter=next_iter+1)) - echo "Starting iteration " $next_iter - cd $(SPECROOT) - merlin run 
$(SPECROOT)/faker_demo.yaml --vars ITER=$next_iter - fi - nodes: 1 - procs: 1 - depends: [post-process] - task_queue: post_proc_queue - - - ######################################## - # Worker and sample configuration - ######################################## - merlin: - - resources: - task_server: celery - - overlap: False - - # Customize workers NOTE: abuse this for scaling study: prefetch mult increase - # - celery->rabbit query overhead for fast jobs - workers: - nameworkers: - args: --concurrency 36 --prefetch-multiplier 3 - steps: [sample_names] - nodes: 1 - machines: [borax, quartz] - - # NOTE: specifying wrong step leaves orphaned queue -> purge first! - # also, invalid host name appears to fail silently - postworkers: - args: --concurrency 1 --prefetch-multiplier 1 - steps: [post-process] - nodes: 1 - machines: [borax, quartz] - - ################################################### - samples: - column_labels: [NAME] - file: $(MERLIN_INFO)/samples.csv - generate: - cmd: | - $(PYTHON) $(SPECROOT)/faker_sample.py -n 200 -outfile=$(MERLIN_INFO)/samples.csv diff --git a/docs/source/modules/before.rst b/docs/source/modules/before.rst deleted file mode 100644 index dab1c8e2c..000000000 --- a/docs/source/modules/before.rst +++ /dev/null @@ -1,47 +0,0 @@ -0. Before you start -=================== - -It will be helpful to have these steps already completed before you -start the tutorial modules: - -* Make sure you have `python 3.6`__ or newer. - -__ https://www.python.org/downloads/release/python-360/ - -* Make sure you have `pip`__ version 22.3 or newer. - -__ https://www.pypi.org/project/pip/ - - * You can upgrade pip to the latest version with: - - .. code-block:: bash - - pip install --upgrade pip - - * OR you can upgrade to a specific version with: - - .. code-block:: bash - - pip install --upgrade pip==x.y.z - - -* Make sure you have `GNU make tools`__ and `compilers`__. - -__ https://www.gnu.org/software/make/ -__ https://gcc.gnu.org/ - -* (OPTIONAL) Install `docker`__. - -__ https://docs.docker.com/install/ - - * Download OpenFOAM image with: - - .. code-block:: bash - - docker pull cfdengine/openfoam - - * Download redis image with: - - .. code-block:: bash - - docker pull redis diff --git a/docs/source/modules/contribute.rst b/docs/source/modules/contribute.rst deleted file mode 100644 index acf35d323..000000000 --- a/docs/source/modules/contribute.rst +++ /dev/null @@ -1,48 +0,0 @@ -Contribute to Merlin -==================== -.. admonition:: Estimated time - - * 10 minutes - -.. admonition:: You will learn - - * How to post issues to the merlin repo. - * Guidelines for contributing to merlin. - -.. contents:: Table of Contents: - :local: - -Issues -++++++ -Found a bug? Have an idea? Or just want to ask a question? -`Create a new issue `_ on GitHub. - -Bug Reports ------------ -To report a bug, simply navigate to `Issues `_, click "New Issue", then click "Bug report". Then simply fill out a few fields such as "Describe the bug" and "Expected behavior". Try to fill out every field as it will help us figure out your bug sooner. - -Feature Requests ----------------- -We are still adding new features to merlin. To suggest one, simply navigate to `Issues `_, click "New Issue", then click "Feature request". Then fill out a few fields such as "What problem is this feature looking to solve?" - -Questions ---------- -.. note:: - - Who knows? Your question may already be answered in the :doc:`FAQ<../faq>`. 
- -We encourage questions to be asked in a collaborative setting: on GitHub, direct any questions to `General Questions `_ in Issues. - -Any questions can also be sent to merlin@llnl.gov. - -Contributing -++++++++++++ -Merlin is an open source project, so contributions are welcome. Contributions can be anything from bugfixes, documentation, or even new core features. - -Contributing to Merlin is easy! Just `send us a pull request `_ from your fork. Before you send it, summarize your change in the ``[Unreleased]`` section of ``CHANGELOG.md`` and make sure develop is the destination branch. We also appreciate `squash commits `_ before pull requests are merged. - -Merlin uses a rough approximation of the Git Flow branching model. The develop branch contains the latest contributions, and main is always tagged and points to the latest stable release. - -If you're a contributor, try to test and run on develop. That's where all the magic is happening (and where we hope bugs stop). - -More detailed information on contributing can be found on the :doc:`Contributing page<../merlin_developer>`. diff --git a/docs/source/modules/hello_world/.gitignore b/docs/source/modules/hello_world/.gitignore deleted file mode 100644 index cadc713ee..000000000 --- a/docs/source/modules/hello_world/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*.graphml diff --git a/docs/source/modules/hello_world/celery.txt b/docs/source/modules/hello_world/celery.txt deleted file mode 100644 index 595e4fec7..000000000 --- a/docs/source/modules/hello_world/celery.txt +++ /dev/null @@ -1,27 +0,0 @@ - -------------- celery@worker_name.%machine770 v4.4.0 (cliffs) ---- ***** ----- --- ******* ---- Linux-3.10.0-1062.9.1.1chaos.ch6.x86_64-x86_64-with-redhat-7.7-Maipo 2020-02-12 09:53:10 -- *** --- * --- -- ** ---------- [config] -- ** ---------- .> app: merlin:0x2aaab20619e8 -- ** ---------- .> transport: amqps://user:**@server:5671//user -- ** ---------- .> results: redis://user:**@server:6379/0 -- *** --- * --- .> concurrency: 36 (prefork) --- ******* ---- .> task events: OFF (enable -E to monitor tasks in this worker) ---- ***** ----- - -------------- [queues] - .> merlin exchange=merlin(direct) key=merlin - - -[tasks] - . merlin.common.tasks.add_merlin_expanded_chain_to_chord - . merlin.common.tasks.expand_tasks_with_samples - . merlin.common.tasks.merlin_step - . merlin:chordfinisher - . merlin:queue_merlin_study - -[2020-02-12 09:53:11,549: INFO] Connected to amqps://user:**@server:5671//user -[2020-02-12 09:53:11,599: INFO] mingle: searching for neighbors -[2020-02-12 09:53:12,807: INFO] mingle: sync with 2 nodes -[2020-02-12 09:53:12,807: INFO] mingle: sync complete -[2020-02-12 09:53:12,835: INFO] celery@worker_name.%machine770 ready. diff --git a/docs/source/modules/hello_world/hello.yaml b/docs/source/modules/hello_world/hello.yaml deleted file mode 100644 index b4107d595..000000000 --- a/docs/source/modules/hello_world/hello.yaml +++ /dev/null @@ -1,24 +0,0 @@ -description: - name: hello - description: a very simple merlin workflow - -global.parameters: - GREET: - values : ["hello","hola"] - label : GREET.%% - WORLD: - values : ["world","mundo"] - label : WORLD.%% - -study: - - name: step_1 - description: say hello - run: - cmd: echo "$(GREET), $(WORLD)!" 
- - - name: step_2 - description: print a success message - run: - cmd: print("Hurrah, we did it!") - depends: [step_1_*] - shell: /usr/bin/env python3 diff --git a/docs/source/modules/hello_world/hello_world.rst b/docs/source/modules/hello_world/hello_world.rst deleted file mode 100644 index 2cec6f05c..000000000 --- a/docs/source/modules/hello_world/hello_world.rst +++ /dev/null @@ -1,353 +0,0 @@ -Hello, World! -============= -This hands-on module walks through the steps of building and running a simple merlin workflow. - -.. admonition:: Prerequisites - - * :doc:`Module 2: Installation<../installation/installation>` - -.. admonition:: Estimated time - - * 30 minutes - -.. admonition:: You will learn - - * The components of a merlin workflow specification. - * How to run a simple merlin workflow. - * How to interpret the results of your workflow. - - -.. contents:: Table of Contents: - :local: - -Get Example Files -+++++++++++++++++ -``merlin example`` is a command line tool that makes it easy to get a basic workflow up and running. To see a list of all the examples provided with merlin, you can run: - -.. code-block:: bash - - $ merlin example list - -For this tutorial we will be using the ``hello`` example. Run the following commands: - -.. code-block:: bash - - $ merlin example hello - -.. code-block:: bash - - $ cd hello/ - -These commands create and move you into a directory called ``hello``, which contains these files: - -* ``my_hello.yaml`` -- this spec file is partially blank. You will fill in the gaps as you follow this module's steps. - -* ``hello.yaml`` -- this is a complete spec without samples. You can always reference it as an example. - -* ``hello_samples.yaml`` -- same as before, but with samples added. - -* ``make_samples.py`` -- this is a small python script that generates samples. - -* ``requirements.txt`` -- this is a text file listing this workflow's python dependencies. - -Specification File -++++++++++++++++++ - -Central to Merlin is something called a specification file, or a "spec" for short. -The spec defines all aspects of your workflow. -The spec is formatted in yaml. -If you're unfamiliar with yaml, it's worth `reading up on`__ for a few minutes. - -__ https://www.tutorialspoint.com/yaml/yaml_quick_guide.htm - -.. warning:: - - Stray whitespace can break yaml; make sure your indentation is consistent. - -Let's build our spec piece by piece. For each spec section listed below, fill in the blank yaml entries of ``my_hello.yaml`` with the given material. - -Section: ``description`` -~~~~~~~~~~~~~~~~~~~~~~~~ -Just what it sounds like. Name and briefly summarize your workflow. - -.. code-block:: yaml - - description: - name: hello world workflow - description: say hello in 2 languages - -Section: ``global.parameters`` -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. better explanation?? - -Global parameters are values that stay constant within a single run but that you want to vary across simulations. -Steps that contain a global parameter or depend on other steps that contain a global parameter are run for each index over the parameter values. -The label is the pattern for a filename that will be created for each value. - -.. code-block:: yaml - - global.parameters: - GREET: - values : ["hello","hola"] - label : GREET.%% - WORLD: - values : ["world","mundo"] - label : WORLD.%% - -.. note:: - - ``%%`` is a special token that defines where the value in the label is placed. In this case the parameter labels will be GREET.hello, GREET.hola, etc. 
The label can take a custom text format, so long as the ``%%`` token is included to be able to substitute the parameter’s value in the appropriate place. - -So this will give us 1) an English result, and 2) a Spanish one (you could add as many more languages as you want, as long as both parameters hold the same number of values). - -Section: ``study`` -~~~~~~~~~~~~~~~~~~ -This is where you define workflow steps. -While the convention is to list steps as sequentially as possible, the only factor in determining step order is the dependency directed acyclic graph (DAG) created by the ``depends`` field. - -.. code-block:: yaml - - study: - - name: step_1 - description: say hello - run: - cmd: echo "$(GREET), $(WORLD)!" - - - name: step_2 - description: print a success message - run: - cmd: print("Hurrah, we did it!") - depends: [step_1] - shell: /usr/bin/env python3 - -.. note:: - - The ``-`` denotes a list item in YAML. To add elements, simply add new elements prefixed with a hyphen - -``$(GREET)`` and ``$(WORLD)`` expand the global parameters separately into their two values. -.. ``$(step_1.workspace)`` gets the path to ``step_1``. -The default value for ``shell`` is ``/bin/bash``. In ``step_2`` we override this to use python instead. -Steps must be defined as nodes in a DAG, so no cyclical dependencies are allowed. -Our step DAG currently looks like this: - -.. image:: dag1.png - :width: 100 - :align: center - -Since our global parameters have 2 values, this is actually what the DAG looks like: - -.. image:: dag2.png - :width: 300 - :align: center - -It looks like running ``step_2`` twice is redundant. Instead of doing that, we can collapse it back into a single step, by having it wait for both parameterized versions of ``step_1`` to finish. Add ``_*`` to the end of the step name in ``step_1``'s depend entry. Go from this: - -.. code-block:: yaml - - depends: [step_1] - -...to this: - -.. code-block:: yaml - - depends: [step_1_*] - -Now the DAG looks like this: - -.. image:: dag3.png - :width: 300 - :align: center - -Your full hello world spec ``my_hello.yaml`` should now look like this (an exact match of ``hello.yaml``): - -.. literalinclude:: ../../../../merlin/examples/workflows/hello/hello.yaml - :language: yaml - -The order of the spec sections doesn't matter. - -.. note:: - - At this point, ``my_hello.yaml`` is still maestro-compatible. The primary difference is that maestro won't understand anything in the ``merlin`` block, which we will still add later. If you want to try it, run: ``$ maestro run my_hello.yaml`` - -Try It! -+++++++ - -First, we'll run merlin locally. On the command line, run: - -.. code-block:: bash - - $ merlin run --local my_hello.yaml - -If your spec is bugless, you should see a few messages proclaiming successful step completion, like this (for now we'll ignore the warning): - -.. literalinclude :: local_out.txt - :language: text - -Great! But what happened? We can inspect the output directory to find out. - -Look for a directory named ``hello_``. That's your output directory. -Within, there should be a directory for each step of the workflow, plus one called ``merlin_info``. -The whole file tree looks like this: - -.. image:: merlin_output.png - :align: center - -A lot of stuff, right? Here's what it means: - -* The 3 yaml files inside ``merlin_info/`` are called the provenance specs. They are copies of the original spec that was run, some showing under-the-hood variable expansions. - -* ``MERLIN_FINISHED`` files indicate that the step ran successfully. 
- -* ``.sh`` files contain the command for the step. - -* ``.out`` files contain the step's stdout. Look at one of these, and it should contain your "hello" message. - -* ``.err`` files contain the step's stderr. Hopefully empty, and useful for debugging. - -.. Assuming config is ready - -Run Distributed! -++++++++++++++++ - -.. important:: - - Before trying this, make sure you've properly set up your merlin config file ``app.yaml``. Run ``$ merlin info`` for information on your merlin configuration. - -Now we will run the same workflow, but in parallel on our task server: - -.. code-block:: bash - - $ merlin run my_hello.yaml - -If your merlin configuration is set up correctly, you should see something like this: - -.. literalinclude :: run_out.txt - :language: text - -That means we have launched our tasks! Now we need to launch the workers that will complete those tasks. Run this: - -.. code-block:: bash - - $ merlin run-workers my_hello.yaml - -Here's the expected merlin output message for running workers: - -.. literalinclude :: run_workers_out.txt - :language: text - -Immediately after that, this will pop up: - -.. literalinclude :: celery.txt - :language: text - -You may not see all of the info logs listed after the Celery C is displayed. If you'd like to see them you can change the merlin workers' log levels with the ``--worker-args`` tag: - -.. code-block:: bash - - $ merlin run-workers --worker-args "-l INFO" my_hello.yaml - -The terminal you ran workers in is now being taken over by Celery, the powerful task queue library that merlin uses internally. The workers will continue to report their task status here until their tasks are complete. - -Workers are persistent, even after work is done. Send a stop signal to all your workers with this command: - -.. code-block:: bash - - $ merlin stop-workers - -...and a successful worker stop will look like this, with the name of specific worker(s) reported: - -.. literalinclude :: stop_workers.txt - :language: text - -.. _Using Samples: - -Using Samples -+++++++++++++ -It's a little boring to say "hello world" in just two different ways. Let's instead say hello to many people! - -To do this, we'll need samples. Specifically, we'll change ``WORLD`` from a global parameter to a sample. While parameters are static, samples are generated dynamically, and can be more complex data types. In this case, ``WORLD`` will go from being "world" or "mundo" to being a randomly-generated name. - -First, we remove the global parameter ``WORLD`` so it does not conflict with our new sample. Parameters now look like this: - -.. code-block:: yaml - - global.parameters: - GREET: - values : ["hello", "hola"] - label : GREET.%% - -Now add these yaml sections to your spec: - -.. code-block:: yaml - - env: - variables: - N_SAMPLES: 3 - -This makes ``N_SAMPLES`` into a user-defined variable that you can use elsewhere in your spec. - -.. code-block:: yaml - - merlin: - samples: - generate: - cmd: python3 $(SPECROOT)/make_samples.py --filepath=$(MERLIN_INFO)/samples.csv --number=$(N_SAMPLES) - file: $(MERLIN_INFO)/samples.csv - column_labels: [WORLD] - -This is the merlin block, an exclusively merlin feature. It provides a way to generate samples for your workflow. In this case, a sample is the name of a person. - -For simplicity we give ``column_labels`` the name ``WORLD``, just like before. - -It's also important to note that ``$(SPECROOT)`` and ``$(MERLIN_INFO)`` are reserved variables. 
The ``$(SPECROOT)`` variable is a shorthand for the directory path of the spec file and the ``$(MERLIN_INFO)`` variable is a shorthand for the directory holding the provenance specs and sample generation results. More information on Merlin variables can be found on the :doc:`variables page<../../merlin_variables>`. - -It's good practice to shift larger chunks of code to external scripts. At the same location of your spec, make a new file called ``make_samples.py``: - -.. literalinclude :: ../../../../merlin/examples/workflows/hello/make_samples.py - :language: text - -Since our environment variable ``N_SAMPLES`` is set to 3, this sample-generating command should churn out 3 different names. - -Before we can run this, we must install the script's external python library dependencies (``names``: a simple package that generates random names, and ``numpy``: a scientific computing package): - -.. code-block:: bash - - $ pip3 install -r requirements.txt - -Here's our DAG with samples: - -.. image:: dag4.png - :width: 400 - :align: center - -Here's your new and improved ``my_hello.yaml``, which now should match ``hello_samples.yaml``: - -.. literalinclude:: ../../../../merlin/examples/workflows/hello/hello_samples.yaml - :language: yaml - -Run the workflow again! - -Once finished, this is what the insides of ``step_1`` look like: - -.. image:: merlin_output2.png - :align: center - -* Numerically-named directories like ``00``, ``01``, and ``02`` are sample directories. Instead of storing sample output in a single flattened location, merlin stores them in a tree-like sample index, which helps get around file system constraints when working with massive amounts of data. - -Lastly, let's flex merlin's muscle a bit and scale up our workflow to 1000 samples. To do this, you could internally change the value in the spec from 3 to 1000. OR you could just run this: - -.. code-block:: bash - - $ merlin run my_hello.yaml --vars N_SAMPLES=1000 - -.. code-block:: bash - - $ merlin run-workers my_hello.yaml - -Once again, to send a warm stop signal to your workers, run: - -.. code-block:: bash - - $ merlin stop-workers - -Congratulations! You concurrently greeted 1000 friends in English and Spanish! diff --git a/docs/source/modules/hello_world/local_out.txt b/docs/source/modules/hello_world/local_out.txt deleted file mode 100644 index 1184531c6..000000000 --- a/docs/source/modules/hello_world/local_out.txt +++ /dev/null @@ -1,31 +0,0 @@ - - - * - *~~~~~ - *~~*~~~* __ __ _ _ - / ~~~~~ | \/ | | (_) - ~~~~~ | \ / | ___ _ __| |_ _ __ - ~~~~~* | |\/| |/ _ \ '__| | | '_ \ - *~~~~~~~ | | | | __/ | | | | | | | - ~~~~~~~~~~ |_| |_|\___|_| |_|_|_| |_| - *~~~~~~~~~~~ - ~~~*~~~* Machine Learning for HPC Workflows - - - -[2020-02-07 09:35:49: WARNING] Workflow specification missing - encouraged 'merlin' section! Run 'merlin example' for examples. -Using default configuration with no sampling. -[2020-02-07 09:35:49: INFO] Study workspace is 'hello_20200207-093549'. -[2020-02-07 09:35:49: INFO] Reading app config from file ~/.merlin/app.yaml -[2020-02-07 09:35:49: INFO] Calculating task groupings from DAG. -[2020-02-07 09:35:49: INFO] Converting graph to celery tasks. -[2020-02-07 09:35:49: INFO] Launching tasks. -[2020-02-07 09:35:49: INFO] Executing step 'step1_HELLO.hello' in 'hello_20200207-093549/step1/HELLO.hello'... -[2020-02-07 09:35:54: INFO] Step 'step1_HELLO.hello' in 'hello_20200207-093549/step1/HELLO.hello' finished successfully. 
-[2020-02-07 09:35:54: INFO] Executing step 'step2_HELLO.hello' in 'hello_20200207-093549/step2/HELLO.hello'... -[2020-02-07 09:35:59: INFO] Step 'step2_HELLO.hello' in 'hello_20200207-093549/step2/HELLO.hello' finished successfully. -[2020-02-07 09:35:59: INFO] Executing step 'step1_HELLO.hola' in 'hello_20200207-093549/step1/HELLO.hola'... -[2020-02-07 09:36:04: INFO] Step 'step1_HELLO.hola' in 'hello_20200207-093549/step1/HELLO.hola' finished successfully. -[2020-02-07 09:36:04: INFO] Executing step 'step2_HELLO.hola' in 'hello_20200207-093549/step2/HELLO.hola'... -[2020-02-07 09:36:09: INFO] Step 'step2_HELLO.hola' in 'hello_20200207-093549/step2/HELLO.hola' finished successfully. diff --git a/docs/source/modules/hello_world/run_out.txt b/docs/source/modules/hello_world/run_out.txt deleted file mode 100644 index 3325d5cd4..000000000 --- a/docs/source/modules/hello_world/run_out.txt +++ /dev/null @@ -1,25 +0,0 @@ - - - * - *~~~~~ - *~~*~~~* __ __ _ _ - / ~~~~~ | \/ | | (_) - ~~~~~ | \ / | ___ _ __| |_ _ __ - ~~~~~* | |\/| |/ _ \ '__| | | '_ \ - *~~~~~~~ | | | | __/ | | | | | | | - ~~~~~~~~~~ |_| |_|\___|_| |_|_|_| |_| - *~~~~~~~~~~~ - ~~~*~~~* Machine Learning for HPC Workflows - - - -[2020-02-07 13:06:23: WARNING] Workflow specification missing - encouraged 'merlin' section! Run 'merlin example' for examples. -Using default configuration with no sampling. -[2020-02-07 13:06:23: INFO] Study workspace is 'studies/simple_chain_20200207-130623'. -[2020-02-07 13:06:24: INFO] Reading app config from file ~/.merlin/app.yaml -[2020-02-07 13:06:25: INFO] broker: amqps://user:******@broker:5671//user -[2020-02-07 13:06:25: INFO] backend: redis://user:******@backend:6379/0 -[2020-02-07 13:06:25: INFO] Calculating task groupings from DAG. -[2020-02-07 13:06:25: INFO] Converting graph to celery tasks. -[2020-02-07 13:06:25: INFO] Launching tasks. diff --git a/docs/source/modules/hello_world/run_workers_out.txt b/docs/source/modules/hello_world/run_workers_out.txt deleted file mode 100644 index 431dc769c..000000000 --- a/docs/source/modules/hello_world/run_workers_out.txt +++ /dev/null @@ -1,21 +0,0 @@ - - - * - *~~~~~ - *~~*~~~* __ __ _ _ - / ~~~~~ | \/ | | (_) - ~~~~~ | \ / | ___ _ __| |_ _ __ - ~~~~~* | |\/| |/ _ \ '__| | | '_ \ - *~~~~~~~ | | | | __/ | | | | | | | - ~~~~~~~~~~ |_| |_|\___|_| |_|_|_| |_| - *~~~~~~~~~~~ - ~~~*~~~* Machine Learning for HPC Workflows - - - -[2020-02-07 13:14:38: INFO] Launching workers from 'hello.yaml' -[2020-02-07 13:14:38: WARNING] Workflow specification missing - encouraged 'merlin' section! Run 'merlin example' for examples. -Using default configuration with no sampling. -[2020-02-07 13:14:38: INFO] Starting celery workers -[2020-02-07 13:14:38: INFO] ['celery worker -A merlin -n default_worker.%%h -l INFO -Q merlin'] diff --git a/docs/source/modules/hello_world/stop_workers.txt b/docs/source/modules/hello_world/stop_workers.txt deleted file mode 100644 index e1a63b17c..000000000 --- a/docs/source/modules/hello_world/stop_workers.txt +++ /dev/null @@ -1,24 +0,0 @@ -$ merlin stop-workers - - - * - *~~~~~ - *~~*~~~* __ __ _ _ - / ~~~~~ | \/ | | (_) - ~~~~~ | \ / | ___ _ __| |_ _ __ - ~~~~~* | |\/| |/ _ \ '__| | | '_ \ - *~~~~~~~ | | | | __/ | | | | | | | - ~~~~~~~~~~ |_| |_|\___|_| |_|_|_| |_| - *~~~~~~~~~~~ - ~~~*~~~* Machine Learning for HPC Workflows - - - -[2020-03-06 09:20:08: INFO] Stopping workers... 
-[2020-03-06 09:20:08: INFO] Reading app config from file .merlin/app.yaml -[2020-03-06 09:20:08: INFO] broker: amqps://user:******@server:5671//user -[2020-03-06 09:20:08: INFO] backend: redis://mlsi:******@server:6379/0 -all_workers: ['celery@default_worker.%machine'] -spec_worker_names: [] -workers_to_stop: ['celery@default_worker.%machine'] -[2020-03-06 09:20:10: INFO] Sending stop to these workers: ['celery@default_worker.%machine'] diff --git a/docs/source/modules/installation/app_docker_rabbit.yaml b/docs/source/modules/installation/app_docker_rabbit.yaml deleted file mode 100644 index d9ca5360a..000000000 --- a/docs/source/modules/installation/app_docker_rabbit.yaml +++ /dev/null @@ -1,10 +0,0 @@ -broker: - name: rabbitmq - server: my-rabbit - password: ~/.merlin/rabbit.pass - -results_backend: - name: redis - server: my-redis - port: 6379 - db_num: 0 diff --git a/docs/source/modules/installation/app_docker_redis.yaml b/docs/source/modules/installation/app_docker_redis.yaml deleted file mode 100644 index a4a9d76e4..000000000 --- a/docs/source/modules/installation/app_docker_redis.yaml +++ /dev/null @@ -1,11 +0,0 @@ -broker: - name: redis - server: my-redis - port: 6379 - db_num: 0 - -results_backend: - name: redis - server: my-redis - port: 6379 - db_num: 0 diff --git a/docs/source/modules/installation/app_local_redis.yaml b/docs/source/modules/installation/app_local_redis.yaml deleted file mode 100644 index 88332e28d..000000000 --- a/docs/source/modules/installation/app_local_redis.yaml +++ /dev/null @@ -1,11 +0,0 @@ -broker: - name: redis - server: localhost - port: 6379 - db_num: 0 - -results_backend: - name: redis - server: localhost - port: 6379 - db_num: 0 diff --git a/docs/source/modules/installation/docker-compose.yml b/docs/source/modules/installation/docker-compose.yml deleted file mode 100644 index 1ebb9d8fc..000000000 --- a/docs/source/modules/installation/docker-compose.yml +++ /dev/null @@ -1,23 +0,0 @@ -version: '3' - -networks: - mernet: - driver: bridge - -services: - redis: - image: 'redis:latest' - container_name: my-redis - ports: - - "6379:6379" - networks: - - mernet - - merlin: - image: 'llnl/merlin' - container_name: my-merlin - tty: true - volumes: - - ~/merlinu/:/home/merlinu - networks: - - mernet diff --git a/docs/source/modules/installation/docker-compose_rabbit.yml b/docs/source/modules/installation/docker-compose_rabbit.yml deleted file mode 100644 index f8c34a99f..000000000 --- a/docs/source/modules/installation/docker-compose_rabbit.yml +++ /dev/null @@ -1,46 +0,0 @@ -version: '3' - -networks: - mernet: - driver: bridge - -services: - redis: - image: 'redis:latest' - container_name: my-redis - ports: - - "6379:6379" - networks: - - mernet - - rabbitmq: - image: rabbitmq:3-management - container_name: my-rabbit - tty: true - ports: - - "15672:15672" - - "15671:15671" - - "5672:5672" - - "5671:5671" - environment: - - RABBITMQ_SSL_CACERTFILE=/cert_rabbitmq/ca_certificate.pem - - RABBITMQ_SSL_KEYFILE=/cert_rabbitmq/server_key.pem - - RABBITMQ_SSL_CERTFILE=/cert_rabbitmq/server_certificate.pem - - RABBITMQ_SSL_VERIFY=verify_none - - RABBITMQ_SSL_FAIL_IF_NO_PEER_CERT=false - - RABBITMQ_DEFAULT_USER=merlinu - - RABBITMQ_DEFAULT_VHOST=/merlinu - - RABBITMQ_DEFAULT_PASS=guest - volumes: - - ~/merlinu/cert_rabbitmq:/cert_rabbitmq - networks: - - mernet - - merlin: - image: 'llnl/merlin' - container_name: my-merlin - tty: true - volumes: - - ~/merlinu/:/home/merlinu - networks: - - mernet diff --git 
a/docs/source/modules/installation/docker-compose_rabbit_redis_tls.yml b/docs/source/modules/installation/docker-compose_rabbit_redis_tls.yml deleted file mode 100644 index b80b71dc7..000000000 --- a/docs/source/modules/installation/docker-compose_rabbit_redis_tls.yml +++ /dev/null @@ -1,38 +0,0 @@ -version: '3' - -networks: - mernet: - driver: bridge - -services: - redis: - image: 'redis' - container_name: my-redis - command: - - --port 0 - - --tls-port 6379 - - --tls-ca-cert-file /cert_redis/ca_certificate.pem - - --tls-key-file /cert_redis/server_key.pem - - --tls-cert-file /cert_redis/server_certificate.pem - - --tls-auth-clients no - ports: - - "6379:6379" - volumes: - - "~/merlinu/cert_redis:/cert_redis" - networks: - - mernet - - rabbitmq: - image: rabbitmq:3-management - container_name: my-rabbit - tty: true - ports: - - "15672:15672" - - "15671:15671" - - "5672:5672" - - "5671:5671" - volumes: - - "~/merlinu/rabbbitmq.conf:/etc/rabbitmq/rabbitmq.conf" - - "~/merlinu/cert_rabbitmq:/cert_rambbitmq" - networks: - - mernet diff --git a/docs/source/modules/installation/installation.rst b/docs/source/modules/installation/installation.rst deleted file mode 100644 index 96195ff3d..000000000 --- a/docs/source/modules/installation/installation.rst +++ /dev/null @@ -1,340 +0,0 @@ -Installation -============ -.. admonition:: Prerequisites - - * shell (bash, csh, etc, if running on Windows, use a linux container) - * python3 >= python3.6 - * pip3 - * wget - * build tools (make, C/C++ compiler) - * (OPTIONAL) docker (required for :doc:`Module 4: Run a Real Simulation<../run_simulation/run_simulation>`) - * (OPTIONAL) file editor for docker config file editing - -.. admonition:: Estimated time - - * 20 minutes - -.. admonition:: You will learn - - * How to install merlin in a virtual environment using pip. - * How to install a container platform eg. singularity, docker, or podman. - * How to configure merlin. - * How to test/verify the installation. - -.. contents:: Table of Contents: - :local: - -This section details the steps necessary to install merlin and its dependencies. -Merlin will then be configured for the local machine and the configuration -will be checked to ensure a proper installation. - - -Installing Merlin ------------------ - -A merlin installation is required for the subsequent modules of this tutorial. - -Once merlin is installed, it requires servers to operate. While you are able to host your own servers, -we will use merlin's containerized servers in this tutorial. However, if you prefer to host your own servers -you can host a redis server that is accessible to your current machine. -Your computer/organization may already have a redis server available you can use, please check -with your local system administrator. - -Create a virtualenv using python3 to install merlin. - -.. code-block:: bash - - python3 -m venv --prompt merlin merlin_venv - -Activate the virtualenv. - -.. code-block:: bash - - source merlin_venv/bin/activate - or - source merlin_venv/bin/activate.csh - - -The ``(merlin) `` will appear after activating. - -You should upgrade pip and setuptools before proceeding. - -.. code-block:: bash - - pip3 install setuptools pip -U - -Install merlin through pip. - -.. code-block:: bash - - pip3 install merlin - -Check to make sure merlin installed correctly. - -.. code-block:: bash - - which merlin - -You should see that it was installed in your virtualenv, like so: - -.. 
code-block:: bash - - ~//merlin_venv/bin/merlin - -If this is not the output you see, you may need to restart your virtualenv and try again. - -When you are done with the virtualenv you can deactivate it using ``deactivate``, -but leave the virtualenv activated for the subsequent steps. - -.. code-block:: bash - - deactivate - - -Redis Server -++++++++++++ - -A redis server is required for the celery results backend; this same server -can also be used for the celery broker. We will be using merlin's containerized server; -however, we will need to download one of the supported container platforms available. For -the purpose of this tutorial we will be using singularity. - -.. code-block:: bash - - # Update and install singularity dependencies - apt-get update && apt-get install -y \ - build-essential \ - libssl-dev \ - uuid-dev \ - libgpgme11-dev \ - squashfs-tools \ - libseccomp-dev \ - pkg-config - - # Download dependency go - wget https://go.dev/dl/go1.18.1.linux-amd64.tar.gz - - # Extract go into local - tar -C /usr/local -xzf go1.18.1.linux-amd64.tar.gz - - # Remove go tar file - rm go1.18.1.linux-amd64.tar.gz - - # Update PATH to include go - export PATH=$PATH:/usr/local/go/bin - - # Download singularity - wget https://github.com/sylabs/singularity/releases/download/v3.9.9/singularity-ce-3.9.9.tar.gz - - # Extract singularity - tar -xzf singularity-ce-3.9.9.tar.gz - - # Configure and install singularity - cd singularity-ce-3.9.9 - ./mconfig && \ - make -C ./builddir && \ - sudo make -C ./builddir install - -Configuring Merlin ------------------- -Merlin requires a configuration file for the celery interface. -Run this configuration command to create the ``app.yaml`` -configuration file. - -.. code-block:: bash - - merlin config --broker redis - -The ``merlin config`` command above will create a file called ``app.yaml`` -in the ``~/.merlin`` directory. -If you are running a redis server locally then you are all set. Look in the ``~/.merlin/app.yaml`` file -to see the configuration; it should look like the one below. - -.. literalinclude:: ./app_local_redis.yaml - :language: yaml - -More detailed information on configuring Merlin can be found in the :doc:`configuration section<../../merlin_config>`. - -.. _Verifying installation: - -Checking/Verifying Installation ------------------------------- - -First launch the merlin server containers by using the ``merlin server`` commands: - -.. code-block:: bash - - merlin server init - merlin server start - -A subdirectory called ``merlin_server/`` will have been created in the current run directory. -This contains all of the proper configuration for the server containers merlin creates. -Configuration can be done through the ``merlin server config`` command; however, users have -the flexibility to edit the files directly in the directory. Additionally, a preconfigured ``app.yaml`` -file has been created in the ``merlin_server/`` subdirectory to utilize the merlin server -containers. To use it locally, simply copy it to the run directory with a ``cp`` command. - -.. code-block:: bash - - cp ./merlin_server/app.yaml . - -You can also make this server container your main server configuration by replacing the one located in your home -directory. Make sure you make back-ups of your current ``app.yaml`` file in case you want to use your previous -configurations. 
Note: since merlin servers are created locally in your run directory, you can create -multiple instances of merlin server, each with its own configuration, for different studies. Simply create different -directories for each study and run ``merlin server init`` in each directory to create an instance for each. - -.. code-block:: bash - - mv ~/.merlin/app.yaml ~/.merlin/app.yaml.bak - cp ./merlin_server/app.yaml ~/.merlin/ - -The ``merlin info`` command will check that the configuration file is -installed correctly, display the server configuration strings, and check server -access. - -.. code-block:: bash - - merlin info - -If everything is set up correctly, you should see: - -.. code-block:: bash - - . - . - . - - Merlin Configuration - ------------------------- - - config_file | /.merlin/app.yaml - is_debug | False - merlin_home | /.merlin - merlin_home_exists | True - broker server | redis://localhost:6379/0 - results server | redis://localhost:6379/0 - - - Checking server connections: - ---------------------------- - broker server connection: OK - results server connection: OK - - Python Configuration - ------------------------- - . - . - . - - -(OPTIONAL) Docker Advanced Installation --------------------------------------- - -RabbitMQ Server -+++++++++++++++ - -This optional section details the setup of a rabbitmq server for merlin. -A rabbitmq server can be started to provide the broker; the redis -server will still be required for the backend. Merlin is configured -to use ssl encryption for all communication with the rabbitmq server. -An ssl server requires ssl certificates to encrypt the communication through -the python `ssl module <https://docs.python.org/3/library/ssl.html>`_. -This tutorial can use self-signed certificates created by the user for use -in the rabbitmq server. -The rabbitmq server uses Transport Layer Security (TLS) -(often still referred to as "Secure Sockets Layer"). -Information on rabbitmq -with TLS can be found here: `rabbit TLS <https://www.rabbitmq.com/ssl.html>`_ - -A set of self-signed keys is created through the ``tls-gen`` package. -These keys are then copied to a common directory for use in the rabbitmq -server and python. - -.. code-block:: bash - - git clone https://github.com/michaelklishin/tls-gen.git - cd tls-gen/basic - make CN=my-rabbit CLIENT_ALT_NAME=my-rabbit SERVER_ALT_NAME=my-rabbit - make verify - mkdir -p ${HOME}/merlinu/cert_rabbitmq - cp result/* ${HOME}/merlinu/cert_rabbitmq - - -The rabbitmq docker service can be added to the previous -``docker-compose.yml`` file. - -.. literalinclude:: ./docker-compose_rabbit.yml - :language: yaml - - -When running the rabbitmq broker server, the config can be created with -the default ``merlin config`` command. -If you have already run the previous command then remove the -``~/.merlin/app.yaml`` or -``~/merlinu/.merlin/app.yaml`` file, and run the ``merlin config`` -command again. - -.. code-block:: bash - - merlin config - -The ``app.yaml`` file will need to be edited to add the rabbitmq settings -to its broker section. The ``server:`` should be changed to ``my-rabbit``. -The rabbitmq server will be accessed on the default TLS port, 5671. - -.. literalinclude:: ./app_docker_rabbit.yaml - :language: yaml - -To complete the config, create a file ``~/merlinu/.merlin/rabbit.pass`` -and add the password ``guest``. - -The aliases defined previously can be used with this set of docker containers. - -Redis TLS Server -++++++++++++++++ - -This optional section details the setup of a redis server with TLS for merlin. 
Redis TLS Server
++++++++++++++++

This optional section details the setup of a redis server with TLS for merlin.
The redis TLS configuration can be found in the :ref:`broker_redis_ssl` section.
A newer redis (version 6 or greater) must be used to enable TLS.

A set of self-signed keys is created through the ``tls-gen`` package.
These keys are then copied to a common directory for use in the redis
server and python.

.. code-block:: bash

   git clone https://github.com/michaelklishin/tls-gen.git
   cd tls-gen/basic
   make CN=my-redis CLIENT_ALT_NAME=my-redis SERVER_ALT_NAME=my-redis
   make verify
   mkdir -p ${HOME}/merlinu/cert_redis
   cp result/* ${HOME}/merlinu/cert_redis


The configuration below does not use client
verification (``--tls-auth-clients no``), so the ssl files do not need to
be defined as shown in the :ref:`broker_redis_ssl` section.

.. literalinclude:: ./docker-compose_rabbit_redis_tls.yml
   :language: yaml

The ``rabbitmq.conf`` file contains the configuration, including ssl, for
the rabbitmq server.

.. code-block:: bash

   default_vhost = /merlinu
   default_user = merlinu
   default_pass = guest
   listeners.ssl.default = 5671
   ssl_options.cacertfile = /cert_rabbitmq/ca_certificate.pem
   ssl_options.certfile = /cert_rabbitmq/server_certificate.pem
   ssl_options.keyfile = /cert_rabbitmq/server_key.pem
   ssl_options.verify = verify_none
   ssl_options.fail_if_no_peer_cert = false

Once this docker-compose file is run, the merlin ``app.yaml`` file is changed
to use the redis TLS scheme ``rediss`` instead of ``redis``.
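A minimal sketch of that change, assuming a local server (the exact fields depend on your configuration; ``rediss`` here is assumed to be the TLS variant of the broker name):

.. code-block:: yaml

   broker:
       # TLS scheme: rediss://localhost:6379/0 instead of redis://localhost:6379/0
       name: rediss
       server: localhost
       port: 6379
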
diff --git a/docs/source/modules/introduction.rst b/docs/source/modules/introduction.rst
deleted file mode 100644
index 5b224c605..000000000
--- a/docs/source/modules/introduction.rst
+++ /dev/null
@@ -1,476 +0,0 @@
Introduction
============
This module introduces you to Merlin, some of the technology behind it,
and how it works.

.. admonition:: Prerequisites

   * Curiosity

.. admonition:: Estimated time

   * 20 minutes

.. admonition:: You will learn

   * What Merlin is and why you might consider it
   * Why it was built and what are some target use cases
   * How it is designed and what the underlying tech is

.. contents:: Table of Contents:
   :local:

What is Merlin?
+++++++++++++++

.. admonition:: Summary

   Merlin is a toolkit designed to enable HPC-focused simulation workflows
   with distributed cloud compute technologies. This helps simulation workflows
   push to immense scale. (Like `100 million`__.)

__ https://arxiv.org/abs/1912.02892

At its core, Merlin translates a text-based, command-line focused workflow
description into a set of discrete tasks. These tasks live on a centralized
broker (e.g. a separate server) that persists outside of your HPC
batch allocation. Autonomous workers in different allocations (even
on different machines) can then connect
to this server, pull off and execute these tasks asynchronously.

Why Merlin? What's the need?
++++++++++++++++++++++++++++

That sounds complicated. Why would you care to do this?

The short answer: machine learning

The longer answer: machine learning and data science are becoming
an integral part of scientific inquiry. The problem is that machine learning
models are data hungry: it takes lots and lots of simulations to train machine
learning models on their outputs. Unfortunately, HPC systems were designed to execute
a few large hero simulations, not many smaller simulations. Naively pushing
standard HPC workflow tools to hundreds of thousands and millions of simulations
can lead to some serious problems.

Workflows, applications and machines are becoming more complex, but
subject matter experts need to devote time and attention to their applications
and often require fine command-line level control. Furthermore,
they rarely have the time to devote to learning workflow systems.

With the expansion of data-driven computing, the HPC scientist needs to be able
to run more simulations through complex multi-component workflows.

**Merlin targets HPC workflows that require many simulations**. These include:

.. list-table:: Merlin Targeted Use Cases
   :widths: 25 75

   * - Emulator building
     - Running enough simulations to build an emulator (or "surrogate model")
       of an expensive computer code, such as needed for uncertainty quantification
   * - Iterative sampling
     - Executing some simulations and then choosing new ones to run
       based on the results obtained thus far
   * - Active learning
     - Iterative sampling coupled with emulator building to efficiently train
       a machine learning model
   * - Design optimization
     - Using a computer code to optimize a model design, perhaps robustly or under
       uncertainty
   * - Reinforcement learning
     - Building a machine learning model by successively exposing it to lots of
       trials, giving it a reward/penalty for the outcomes of those trials
   * - Hierarchical simulation
     - Running low-fidelity simulations to inform which higher fidelity simulations
       to execute
   * - Heterogeneous workflows
     - Workflows that require different steps to execute on different hardware and/or
       systems

Many scientific and engineering problems require running lots of simulations.
But accomplishing these tasks
effectively in an unstable bleeding edge HPC environment can be dicey. The tricks
that work for 100 simulations won't work for
`10 thousand <https://doi.org/10.1063/1.4977912>`_, let alone
`100 million <https://arxiv.org/abs/1912.02892>`_.

We made Merlin to make high-frequency extreme scale computing easy.


How can Merlin run so many simulations?
+++++++++++++++++++++++++++++++++++++++

The good news is that distributed cloud compute technology has really pushed the
frontier of scalability. Merlin helps bring this tech to traditional scientific HPC.

Traditionally, HPC workflow systems tie workflow steps to HPC resources and
coordinate the execution of tasks and management of resources one of two ways:

.. |ext-img| image:: ../../images/external_coordination.png

.. |int-img| image:: ../../images/internal_coordination.png

.. table:: Traditional HPC Workflow Philosophies

   +------------------------------+-------------------------------------------------------+
   | External Coordination        + - Separate batch jobs for each task                   |
   | |ext-img|                    + - External daemon tracks dependencies and jobs        |
   |                              + - Progress monitored with periodic polling            |
   |                              +   (of files or batch system)                          |
   +------------------------------+-------------------------------------------------------+
   | Internal Coordination        + - Multiple tasks bundled into larger batch jobs       |
   | |int-img|                    + - Internal daemon tracks dependencies and resources   |
   |                              + - Progress monitored via polling                      |
   |                              +   (of filesystem or message passing)                  |
   +------------------------------+-------------------------------------------------------+

**External coordination** ties together independent batch jobs each executing workflow
sub-tasks with an external monitor. This monitor could be a daemon
or human that monitors either the batch or file system via periodic polling and orchestrates task launch dependencies.
External coordination can tailor the resources to the task, but cannot easily
run lots of concurrent simulations (since batch systems usually limit the number
of jobs a user can queue at once).

**Internal coordination** puts the monitor within a larger batch job that allocates
resources inside that job for the specific tasks at hand.

Internal coordination can run many more
concurrent tasks by bundling smaller jobs into larger jobs, but cannot tailor the
resources to the task at hand. This precludes workflows that, for instance, require
one step on CPU hardware and another on a GPU machine.

Instead of tying resources to tasks, Merlin does this:

.. |cent-img| image:: ../../images/central_coordination.png

.. table:: Merlin's Workflow Philosophy

   +------------------------------+-----------------------------------------------+
   + Centralized Coordination     + - Batch jobs and workers decoupled from tasks +
   + of Producers & Consumers     + - Centralized queues visible to multiple jobs +
   + |cent-img|                   + - Progress and dependencies handled via       +
   +                              +   direct worker connections to central        +
   +                              +   message server and results database         +
   +------------------------------+-----------------------------------------------+

Merlin decouples workflow tasks from workflow resources.

Merlin avoids a command-and-control approach to HPC resource
management for a workflow. Instead of having the workflow coordinator
ask for and manage HPC resources and tasks, the Merlin coordinator just manages
tasks. Task-agnostic resources can then independently connect (and
disconnect) to the coordinator.

In Merlin, this **producer-consumer workflow** happens through two commands:

``merlin run `` (producer)

and

``merlin run-workers `` (consumer).

The ``merlin run`` command populates the central queue(s) with work to do
and the ``merlin run-workers`` command drains the queue(s) by executing the
task instructions. Each new instance of ``merlin run-workers`` creates a new
consumer. These consumers can exist on different machines in different
batch allocations, anywhere that can see the central server. Likewise,
``merlin run`` can populate the queue from any system that can see the
queue server, including other workers. In principle, this means a
researcher can push new work onto an already running batch allocation of workers,
or re-direct running jobs to work on higher-priority work.

.. admonition:: The benefits of producer-consumer workflows

   The increased flexibility that comes from
   decoupling *what* HPC applications you run from *where* you run them
   can be extremely enabling.

   Merlin allows you to

   * Scale to very large numbers of simulations by avoiding common HPC bottlenecks
   * Automatically take advantage of free nodes to process your workflow faster
   * Create iterative workflows, as needed for active machine learning
   * Dynamically add more tasks to already-running jobs
   * Have cross-machine and cross-batch-job workflows, with different steps
     executing on different resources, but still coordinated

The producer-consumer approach to workflows
allows for increased flexibility and scalability. For this
reason it has become a mainstay of cloud-compute microservices, which
allow for extremely distributed asynchronous computing.

Many asynchronous task and workflow systems exist, but the majority are
focused around this microservices model, where a system is set up (and
managed) by experts that build a single workflow.
This static workflow
gets tested and hardened and exists as a service for their users
(e.g. an event on a website triggers a discrete set of tasks).
HPC, and in particular *scientific* HPC,
brings its own set of challenges that make a direct application of microservices
to HPC workflows challenging.


.. list-table:: Challenges for bringing microservices to scientific HPC Workflows
   :widths: 50 50
   :header-rows: 1

   * - Challenge
     - Requirement
   * - Workflows can change from day-to-day as researchers explore new simulations,
       configurations, and questions.
     - *Workflows need to be dynamic, not static.*
   * - Workflow components are usually different executables,
       pre- and post-processing scripts and data aggregation steps
       written in different languages.
     - *Workflows need to intuitively support multiple languages.*
   * - These components often need command-line-level control of task instructions.
     - *Workflows need to support shell syntax and environment variables.*
   * - Components frequently require calls to a batch system scheduler for parallel job
       execution.
     - *Workflows need a natural way to launch parallel jobs that use more resources
       than a single worker.*
   * - Tasks can independently create large quantities of data.
     - *Dataflow models could be bottlenecks. Workflows should take advantage of
       parallel file systems.*
   * - HPC systems (in particular leadership class machines) can experience unforeseen
       outages.
     - *Workflows need to be able to restart, retry and rerun failed steps without
       needing to run the entire workflow.*

Merlin was built specifically to address the challenges of porting microservices
to HPC simulations.

So what exactly does Merlin do?
+++++++++++++++++++++++++++++++

Merlin wraps a heavily tested and well used asynchronous task queuing library in
a skin and syntax that is natural for HPC simulations. In essence, we extend
`maestro `_ by hooking it up to
`celery `_. We leverage
maestro's HPC-friendly workflow description language and translate it to
discrete celery tasks.

Why not just plain celery?

Celery is extremely powerful, but this power can be a barrier for many science
and engineering subject matter experts,
who might not be python coders. While this may not be
an issue for web developers, it presents a serious challenge to many scientists
who are used to running their code from a shell command line. By wrapping celery
commands in maestro steps, we not only create a familiar environment for users
(since maestro steps look like shell commands), but we also create structure
around celery dependencies. Maestro also has interfaces to common batch schedulers
(e.g. `slurm `_
and `flux `_) [*]_ for parallel job control.

So why Merlin and not just plain maestro?

The main reason: to run lots of simulations for machine learning
applications. Basically, **Merlin scales maestro.**

Maestro follows an external coordinator model. Maestro workflow DAGs
(directed acyclic graphs) need to be unrolled (concretized)
ahead of time, so that batch dependencies can be calculated and managed.
This graph problem becomes very expensive as the number of tasks approaches
a few hundred. (Not to mention most batch systems will prevent a user
from queuing more than a few hundred concurrent batch jobs.) In other words,
using maestro alone to run thousands of simulations is not practical.

But with celery, we can *dynamically* create additional
tasks.
This means that the DAG can get unrolled by the very
same workers that will execute the tasks, offering a natural parallelism
(i.e. much less waiting before starting the work).

What does this mean in practice?

*Merlin can quickly queue a lot of simulations.*

How quickly? The figure below shows task queuing rates when pushing
:doc:`a simple workflow<./hello_world/hello_world>` on the
`Quartz Supercomputer `_
to 40 million samples. This measures how quickly simulation ensembles of various
sample sizes can get enqueued.

.. image:: ../../images/task_creation_rate.png

As you can see, by exploiting celery's dynamic task queuing (tasks that create
tasks), Merlin can enqueue hundreds of thousands of
simulations per second. These jobs can then be consumed in parallel,
at a rate that depends on the number of workers you have.

Furthermore, this ability to dynamically add tasks to the queue means
that workflows can become more flexible and responsive. A worker executing
a step can launch additional workflows without having to stand up resources
to execute and monitor the execution of those additional steps.

The only downside to being able to enqueue work this quickly is the inability
of batch schedulers to keep up. This is why we recommend pairing Merlin with
`flux `_, which results in a scalable but easy-to-use
workflow system:

- Maestro describes the workflow tasks
- Merlin orchestrates the task executions
- Flux schedules the HPC resources

Here's an example of how Merlin, maestro and flux can all work together
to launch a workflow on multiple machines.

.. image:: ../../images/merlin_arch.png

The scientist describes her workflow with a maestro-like ````. Her workflow
consists of two steps:

1. Run many parallel CPU-only jobs, varying her simulation parameters of interest
2. Use a GPU to train a deep learning model on the results of those simulations

She then types ``merlin run ``, which translates that maestro file
into celery commands and
sends those tasks to two separate queues on a centralized server (one for CPU work and
one for GPU work).

She then launches a batch allocation on the CPU machine, which contains the command
``merlin run-workers --steps 1``.
Workers start up under flux, pull work from the server's CPU queue and call flux to
launch the parallel simulations asynchronously.

She also launches a separate batch request on the GPU machine with
``merlin run-workers --steps 2``. These workers connect to the central
queue associated with the GPU step.

When the simulations in step 1 finish, step 2 will automatically start. In this fashion,
Merlin allows the scientist to coordinate a highly scalable asynchronous multi-machine
heterogeneous workflow.

This is of course a simple example, but it does show how the producer-consumer
philosophy in HPC workflows can be quite enabling. Merlin's goal is to make it easy
for HPC-focused subject matter experts to take advantage of the advances in cloud
computing.


How is it designed?
+++++++++++++++++++

Merlin leverages a number of open source technologies, developed and battle-hardened
in the world of distributed computing. We decided to do this instead of
building, testing and maintaining
stand-alone customized (probably buggy) versions of software that would likely not
be as fully featured.

There are differing philosophies on how much third-party software to rely upon.
On the one hand, building our system off ubiquitous open source message passing libraries
increases the confidence in our
software stack's performance, especially at scale (for instance,
celery is robust enough to `keep Instagram running `_).
However, doing so means that when something breaks deep down, it can
be difficult to fix (if at all). Indeed, if there's an underlying "feature" that we'd
like to work around, we could be stuck. Furthermore, the complexity of the software
stack can be quite large, such that our team couldn't possibly keep track of it all.
These are valid concerns; however, we've found it much easier to quickly develop a
portable system with a small team by treating (appropriately chosen) third party
libraries as underlying infrastructure. (Sure you *could* build and use your own
compiler, but *should* you?)

Merlin manages the increased risk that comes with relying on software that is out of
our control by:

1. Building modular software that can easily be reconfigured / swapped for other tech
2. Participating as developers for those third-party packages upon which we rely
   (for instance, we often kick enhancements and bug fixes to maestro)
3. Using continuous integration and testing to catch more errors as they occur

This section talks about some of those underlying technologies, what they are, and
why they were chosen.

*A brief technical dive into some underlying tech*

Merlin extends `maestro `_ with
`celery `_, which in turn can
be configured to interface with a variety of `message queue brokers `_ and `results backends `_. In practice, we like to use
`RabbitMQ `_ and `Redis `_ for our broker
and backend respectively, because of their features and reliability, especially at scale.

.. list-table:: Key Merlin Tech Components
   :widths: 25 75
   :header-rows: 1

   * - Component
     - Reasoning
   * - `maestro `_
     - shell-like workflow descriptions, batch system interfaces
   * - `celery `_
     - highly scalable, supports multiple brokers and backends
   * - `RabbitMQ `_
     - resilience, support for multiple users and queues
   * - `Redis `_
     - database speed, scalability
   * - `cryptography `_
     - secure Redis results
   * - `flux `_ (optional)
     - portability and scalability of HPC resource allocation

The different components interact to populate and drain the message queue broker of
workflow tasks.

.. image:: ../../images/merlin_run.png
   :align: center

When a call is made to ``merlin run``, maestro turns the workflow description (composed of "steps" with "parameters" and "samples") into a task
dependency graph. Merlin translates this graph into discrete celery task commands. [*]_

Calls to ``merlin run-workers`` cause celery workers to connect to both the message broker
and results database. The workers pull tasks from the broker and begin to execute
the instructions therein.
When finished, a worker posts the results (task status
metadata, such as "SUCCESS" or "FAIL") to the results database and
automatically grabs another task from the queue.
When additional workers come along (through other explicit calls to ``merlin run-workers``),
they connect to the broker and help out with the workflow.

*Multiple vs. Single Queues*

RabbitMQ brokers can have multiple distinct queues. To take advantage of this feature,
Merlin lets you assign workflow steps and workers to different queues. (Steps must be assigned to a single queue, but workers
can connect to multiple queues at once.) The advantage of a single queue is simplicity,
both in workflow design and scalability. However, having multiple queues allows for
prioritization of work (the express checkout lane at the grocery store) and customization
of workers (specialized assembly line workers tailored for a specific task).
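As a sketch, assigning a step and a dedicated worker to a named queue looks like this in a spec file (names hypothetical; the syntax mirrors the openfoam example later in this tutorial):

.. code-block:: yaml

   study:
     - name: sim_runs
       run:
         cmd: ./run_sim.sh
         task_queue: simqueue   # steps are pinned to exactly one queue

   merlin:
     resources:
       workers:
         simworkers:
           args: -l INFO --concurrency 10
           steps: [sim_runs]    # workers may serve one or more steps/queues
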
What is in this Tutorial?
+++++++++++++++++++++++++

This tutorial will show you how to:

* :doc:`Install Merlin<./installation/installation>`
  and test that it works correctly
* :doc:`Build a basic workflow<./hello_world/hello_world>`
  and scale it up, introducing you to
  Merlin's syntax and how it differs from maestro.
* :doc:`Run a "real" physics simulation<./run_simulation/run_simulation>`
  based workflow, with post-processing of results, visualization
  and machine learning.
* :doc:`Use Merlin's advanced features<./advanced_topics/advanced_topics>`
  to do things like interface with batch systems, distribute a workflow across
  machines and dynamically add new samples to a running workflow.
* :doc:`Contribute to Merlin<./contribute>`,
  through code enhancements and bug reports.
* :doc:`Port your own application<./port_your_application>`,
  with tips and tricks for building and scaling up workflows.


.. rubric:: Footnotes

.. [*] The flux and slurm interfaces used by Merlin differ
   from the versions bundled with maestro to decouple job launching from
   batch submission.
.. [*] Technically, Merlin creates celery tasks that will break up the graph into
   subsequent tasks (tasks to create tasks). This improves scalability with parallel
   task creation.
diff --git a/docs/source/modules/port_your_application.rst b/docs/source/modules/port_your_application.rst
deleted file mode 100644
index 0dd501c2e..000000000
--- a/docs/source/modules/port_your_application.rst
+++ /dev/null
@@ -1,70 +0,0 @@
Port Your Own Application
=========================
.. admonition:: Prerequisites

   * :doc:`Module 2: Installation`
   * :doc:`Module 3: Hello World`
   * :doc:`Module 4: Running a Real Simulation`

.. admonition:: Estimated time

   * 15 minutes

.. admonition:: You will learn

   * Tips for building workflows
   * Tips for scaling
   * Debugging

.. contents:: Table of Contents:
   :local:


Tips for porting your app, building workflows
+++++++++++++++++++++++++++++++++++++++++++++

The first step of building a new workflow, or porting an existing app to a workflow, is to describe it as a set of discrete, and ideally focused, steps. Decoupling the steps and making them generic when possible will facilitate more rapid composition of future workflows. This will also require mapping out the dependencies and parameters that get passed between/shared across these steps.

Setting up a template using tools such as `cookiecutter `_ can be useful for more production-style workflows that will be frequently reused. Additionally, make use of the built-in examples accessible from the merlin command line with ``merlin example``.

.. (machine learning applications on different data sets?)

Use dry runs (``merlin run --dry --local``) to prototype without actually populating the task broker's queues, as shown below. Similarly, once the dry run prototype looks good, try it on a small number of parameters before throwing millions at it.

Merlin inherits much of the input language and workflow specification philosophy from `Maestro `_. Thus a good first step is to learn to use that tool. As seen in :doc:`Module 5: Advanced Topics`, there are also use cases that combine Merlin and Maestro.
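A minimal prototyping sequence for the dry-run tip above (spec name and variable hypothetical):

.. code-block:: bash

   # Prototype the workflow locally without populating the broker's queues
   merlin run my_study.yaml --dry --local

   # Then try a small sample count before scaling up
   merlin run my_study.yaml --vars N_SAMPLES=10
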
.. send signal to workers alloc ends -> what was this referring to?

Make use of exit keys such as ``MERLIN_RESTART`` or ``MERLIN_RETRY`` in your step logic.

Tips for debugging your workflows
+++++++++++++++++++++++++++++++++

The scripts defined in the workflow steps are also written to the output directories; this is a useful debugging tool as it can both catch parameter and variable replacement errors, as well as provide a quick way to reproduce, edit, and retry the step offline before fixing the step in the workflow specification. The ``.out`` and ``.err`` files log all of the output to catch any runtime errors. Additionally, you may need to grep for ``'WARNING'`` and ``'ERROR'`` in the worker logs.

.. where are the worker logs, and what might show up there that .out and .err won't see? -> these more developer focused output?

When a bug crops up in a running study with many parameters, there are a few other commands to make use of. Rather than trying to spam ``Ctrl-c`` to kill all the workers, you will want to instead use ``merlin stop-workers --spec .yaml`` to stop the workers for that workflow. This should then be followed up with ``merlin purge .yaml`` to clear out the task queue and prevent the same
buggy tasks from continuing to run the next time ``run-workers`` is invoked.

.. last item from board: use merlin status to see if have workers ... is that 'dangling tasks' in the image?

Tips for scaling workflows
++++++++++++++++++++++++++

Most of the worst bottlenecks you'll encounter when scaling up your workflow are caused by the file system. This can be caused by using too much space or too many files, even in a single workflow, if you're not careful. A certain number of inodes is created just based upon the sample counts, even without accounting for the steps being executed. This can be mitigated by avoiding reading/writing to the file system when possible. If file creation is unavoidable, you may need to consider adding cleanup steps to your workflow: dynamically pack up the previous step in a tarball, transfer it to another file system or archival system, or even just delete files.

.. Making a temporary directory to run the main app in can be helpful for containing voluminous outputs and cleaning it up without risking any of the

Misc tips
+++++++++

Avoid reliance upon storage at the ``$(SPECROOT)`` level. This is particularly dangerous if using symlinks, as it can violate the provenance of what was run, possibly ruining the utility of the dataset that was generated. It is preferred to make local copies of any input decks, supporting scripts and data sets inside the workflows' workspace. This of course has limits, regarding shared/system libraries that any programs running in the steps may need; alternate means of recording this information in a log file or something similar may be needed in this case.


.. some other lines on the board that are hard to read..
   run your sim as ...
   (mu !) p... -> need some other eyes on what that's supposed to be in image of notes

.. standard data format discussion? hdf5? - this something we should be in the business of recommending? a lot will be dictated by what the 'big app' is doing anyway...
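Returning to the exit-key tip above, a minimal sketch of a step that asks merlin to re-queue itself on failure (step and script names hypothetical):

.. code-block:: yaml

   - name: flaky_sim
     run:
       cmd: |
         ./run_sim.sh
         if [ $? -ne 0 ]; then
             # Tell merlin to retry this step instead of marking it failed
             exit $(MERLIN_RETRY)
         fi
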
diff --git a/docs/source/modules/run_simulation/run_simulation.rst b/docs/source/modules/run_simulation/run_simulation.rst
deleted file mode 100644
index f48d7dc97..000000000
--- a/docs/source/modules/run_simulation/run_simulation.rst
+++ /dev/null
@@ -1,429 +0,0 @@
Run a Real Simulation
=====================

.. admonition:: Summary

   This module aims to do a parameter study on a well-known benchmark problem for
   viscous incompressible fluid flow.

.. admonition:: Prerequisites

   * :doc:`Module 0: Before you start<../before>`
   * :doc:`Module 2: Installation<../installation/installation>`
   * :doc:`Module 3: Hello World<../hello_world/hello_world>`

.. admonition:: Estimated time

   * 60 minutes

.. admonition:: You will learn

   * How to run the simulation OpenFOAM, using merlin.
   * How to use machine learning on OpenFOAM results, using merlin.

.. contents:: Table of Contents:
   :local:


Introduction
++++++++++++

We aim to do a parameter study on the lid-driven cavity problem.

.. list-table::

   * - .. figure:: setup.png

          Fig 1. Lid-driven cavity problem setup

     - .. figure:: lid-driven-stable.png

          Fig 2. Example of a flow in steady state

In this problem, we have a viscous fluid within a square cavity that has three non-slip
walls and one moving wall (moving lid).
We are interested in learning how varying the viscosity and lid speed affects
the average enstrophy and kinetic energy of the fluid after it reaches steady state.
We will be using the velocity squared as a proxy for kinetic energy.

This module will be going over:

* Setting up our inputs using the merlin block
* Running multiple simulations in parallel
* Combining the outputs of these simulations into an array
* Predictive modeling and visualization

.. _Before Moving On:

Before Moving On
~~~~~~~~~~~~~~~~~

Check that the virtual environment with merlin installed is activated
and that the redis server is set up, using this command:

.. code:: bash

   $ merlin info

This is covered more in depth here: :ref:`Verifying installation`.

There are two ways to do this example: with docker and without docker. To go through the version with docker, get the necessary files for this module by running:

.. code-block:: bash

   $ merlin example openfoam_wf

   $ cd openfoam_wf/

For the version without docker you should run:

.. code-block:: bash

   $ merlin example openfoam_wf_no_docker

   $ cd openfoam_wf_no_docker/

.. note::

   From here on, this tutorial will focus solely on the docker version of running openfoam.
   The two versions are almost identical, so if you're using the no-docker version you can
   still follow along; just check the openfoam_no_docker_template.yaml file in each step to
   see what differs.

In the ``openfoam_wf`` directory you should see the following:

.. figure:: openfoam_wf_output.png

   Fig 3. openfoam_wf directory structure

* ``openfoam_wf.yaml`` -- this spec file is partially blank. You will fill in the gaps as you follow this module's steps.

* ``openfoam_wf_template.yaml`` -- this is a complete spec file. You can always reference it as an example.

* ``scripts`` -- this directory contains all the necessary scripts for this module.

  * We'll be exploring these scripts as we go with the tutorial.

* ``requirements.txt`` -- this is a text file listing this workflow's python dependencies.
**To start, open** ``openfoam_wf.yaml`` **using your favorite text editor.**

It should look something like this:

.. literalinclude:: ../../../../merlin/examples/workflows/openfoam_wf/openfoam_wf.yaml
   :language: yaml
   :caption: openfoam_wf.yaml

Specification file
++++++++++++++++++

We are going to build a spec file that produces this DAG:

.. figure:: openfoam_dag.png
   :align: center

   Fig 4. Module 4 DAG


Variables
~~~~~~~~~
First we specify some variables to make our life easier. Locate the ``env`` block
in our yaml spec:

.. code-block:: yaml

   env:
       variables:
           OUTPUT_PATH: ./openfoam_wf_output

           SCRIPTS:
           N_SAMPLES:

The ``OUTPUT_PATH`` variable is set to tell merlin where you want your output directory to be.
The default is ``_``, which in our case would simply be ``openfoam_wf_``.

We'll fill out the next two variables as we go.

Samples and scripts
~~~~~~~~~~~~~~~~~~~
One merlin best practice is to copy any scripts your workflow may use from your ``SPECROOT`` directory into the ``MERLIN_INFO``
directory. This is done to preserve the original scripts in case they are modified while merlin is running.
We will do that first.
We will put this in the merlin sample generation section, since it runs before anything else.

Edit the merlin block to look like the following:

.. code-block:: yaml

   merlin:
       samples:
           generate:
               cmd: |
                   cp -r $(SPECROOT)/scripts $(MERLIN_INFO)/

                   # Generates the samples
                   python $(MERLIN_INFO)/scripts/make_samples.py -n 10 -outfile=$(MERLIN_INFO)/samples
           file: $(MERLIN_INFO)/samples.npy
           column_labels: [LID_SPEED, VISCOSITY]

We will be using the scripts directory a lot, so we'll set the variable ``SCRIPTS``
to ``$(MERLIN_INFO)/scripts`` for convenience. We would also like to have more central control over
the number of samples generated, so we'll create an ``N_SAMPLES`` variable:

.. code-block:: yaml

   env:
       variables:
           OUTPUT_PATH: ./openfoam_wf_output
           SCRIPTS: $(MERLIN_INFO)/scripts
           N_SAMPLES: 10

and update the merlin block to be:

.. code-block:: yaml

   merlin:
       samples:
           generate:
               cmd: |
                   cp -r $(SPECROOT)/scripts $(MERLIN_INFO)/

                   # Generates the samples
                   python $(SCRIPTS)/make_samples.py -n $(N_SAMPLES) -outfile=$(MERLIN_INFO)/samples
           file: $(MERLIN_INFO)/samples.npy
           column_labels: [LID_SPEED, VISCOSITY]

Just like in the :ref:`Using Samples` step of the hello world module, we
generate samples using the merlin block. We are only concerned with how the
variation of two initial conditions, lidspeed and viscosity, affects the outputs of the system.
These are the ``column_labels``.
The ``make_samples.py`` script is designed to make log uniform random samples.
Now, we can move on to the steps of our study block.

Setting up
~~~~~~~~~~
The first step in our study block is concerned with making sure we have all the
required python packages for this workflow. The specific packages are found in
the ``requirements.txt`` file.

We will also need to copy the lid driven cavity deck from the OpenFOAM docker
container and adjust the write controls. This last part is scripted already for
convenience.

Locate the ``setup`` step in the study block and edit it to look like the following:
.. code-block:: yaml

   study:
       - name: setup
         description: |
             Installs necessary python packages and imports the cavity directory
             from the docker container
         run:
             cmd: |
                 pip install -r $(SPECROOT)/requirements.txt

                 # Set up the cavity directory in the MERLIN_INFO directory
                 source $(SCRIPTS)/cavity_setup.sh $(MERLIN_INFO)

This step does not need to be parallelized, so we will assign it a lower
concurrency (a setting that controls how many worker processes can run tasks at the same time).

Locate the ``resources`` section in the ``merlin`` block, edit the concurrency, and add the setup step:

.. code-block:: yaml

   resources:
       workers:
           nonsimworkers:
               args: -l INFO --concurrency 1
               steps: [setup]

Running the simulation
~~~~~~~~~~~~~~~~~~~~~~

Moving on to the ``sim_runs`` step, we want to:

1. Copy the cavity deck from the ``MERLIN_INFO`` directory into each of the current step's subdirectories
2. Edit the default input values (lidspeed and viscosity) in these cavity decks using the ``sed`` command
3. Run the simulation using the ``run_openfoam`` executable through the OpenFOAM docker container
4. Post-process the results (also using the ``run_openfoam`` executable)

This part should look like:

.. code-block:: yaml

   - name: sim_runs
     description: |
         Edits the lidspeed and viscosity, then runs the OpenFOAM simulation
         using the icoFoam solver
     run:
         cmd: |
             cp -r $(MERLIN_INFO)/cavity cavity/
             cd cavity

             ## Edits default values for viscosity and lidspeed with
             #  values specified by samples section of the merlin block
             sed -i '' "18s/.*/nu [0 2 -1 0 0 0 0] $(VISCOSITY);/" constant/transportProperties
             sed -i '' "26s/.*/ value uniform ($(LID_SPEED) 0 0);/" 0/U

             cd ..
             cp $(SCRIPTS)/run_openfoam .

             # Creating a unique OpenFOAM docker container for each sample and using it to run the simulation
             CONTAINER_NAME='OPENFOAM_ICO_$(MERLIN_SAMPLE_ID)'
             docker container run -ti --rm -v $(pwd):/cavity -w /cavity --name=${CONTAINER_NAME} cfdengine/openfoam ./run_openfoam $(LID_SPEED)
             docker wait ${CONTAINER_NAME}
         depends: [setup]
         task_queue: simqueue

This step runs many simulations in parallel, so it will run faster if we assign it
a worker with a higher concurrency. Navigate back to the ``resources`` section in the ``merlin`` block:

.. code-block:: yaml

   resources:
       workers:
           nonsimworkers:
               args: -l INFO --concurrency 1
               steps: [setup]
           simworkers:
               args: -l INFO --concurrency 10 --prefetch-multiplier 1 -Ofair
               steps: [sim_runs]

The quantities of interest are the average enstrophy and kinetic energy at each cell.
The enstrophy is calculated through an OpenFOAM post-processing function of the flow
fields, while the kinetic energy is approximated using the square of
the velocity vector at each grid point. The velocity field is
output as a result of running the default solver for this
particular problem.

The ``run_openfoam`` executable calculates the appropriate timestep ``deltaT`` so that we
have a Courant number of less than 1. It also uses the ``icoFoam`` solver on the
cavity decks and gives us VTK files that are helpful for visualizing the flow fields
using visualization tools such as VisIt or ParaView.

Combining outputs
~~~~~~~~~~~~~~~~~
Navigate to the next step in our ``study`` block, ``combine_outputs``. The purpose
of this step is to extract the data from each of the simulation runs from
the previous step (``sim_runs``) and combine it for future use.
The ``combine_outputs.py`` script in the ``$(SCRIPTS)`` directory is provided for
convenience. It takes two inputs. The first informs it of the base directory of the
``sim_runs`` directory and the second specifies the subdirectories for each run.
The script then goes into each of the directories and combines the velocity and
enstrophy for each timestep of each run into a .npz file.

.. code-block:: yaml

   - name: combine_outputs
     description: Combines the outputs of the previous step
     run:
         cmd: |
             python $(SCRIPTS)/combine_outputs.py -data $(sim_runs.workspace) -merlin_paths $(MERLIN_PATHS_ALL)
         depends: [sim_runs_*]

This step depends on all of the previous step's simulation runs, which is why we
use the star (``sim_runs_*``). However, it does not need to be parallelized, so we assign it to
the ``nonsimworkers`` in the ``workers`` section of the merlin block.

.. code-block:: yaml

   workers:
       nonsimworkers:
           args: -l INFO --concurrency 1
           steps: [setup, combine_outputs]

Machine Learning and visualization
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In the ``learn`` step, we want to:

1. Post-process the .npz file from the previous step
2. Learn the mapping between our inputs and chosen outputs
3. Graph important features

The provided ``learn.py`` script does all of the above and outputs the trained
sklearn model and a png of the graphs plotted in the current directory.

.. code-block:: yaml

   - name: learn
     description: Learns the output of the openfoam simulations using input parameters
     run:
         cmd: |
             python $(SCRIPTS)/learn.py -workspace $(MERLIN_WORKSPACE)
         depends: [combine_outputs]

This step is also dependent on the previous step for the .npz file and will only need
one worker; therefore we add it to ``nonsimworkers``:

.. code-block:: yaml

   nonsimworkers:
       args: -l INFO --concurrency 1
       steps: [setup, combine_outputs, learn]

Putting it all together
~~~~~~~~~~~~~~~~~~~~~~~
By the end, your ``openfoam_wf.yaml`` should look like the template version in the same directory:

.. literalinclude:: ../../../../merlin/examples/workflows/openfoam_wf/openfoam_wf_template.yaml
   :language: yaml
   :caption: openfoam_wf_template.yaml

Run the workflow
++++++++++++++++
Now that you are done with the specification file, use the following commands from inside
the ``openfoam_wf`` directory to run the workflow on our task server.

.. note::

   Running with fewer samples is one of the best ways to debug.

.. code-block:: bash

   $ merlin run openfoam_wf.yaml

   $ merlin run-workers openfoam_wf.yaml

But wait! We realize that 10 samples is not enough to train a good model. We would like
to restart with 100 samples instead of 10 (this should take about 6 minutes).

After sending the workers to start on their queues, we need to first stop the workers:

.. code-block:: bash

   $ merlin stop-workers --spec openfoam_wf.yaml

.. Why does it continue running even after merlin stop-workers?

.. note::

   * The --spec flag only stops workers from a specific YAML spec

We stopped these tasks from running, but if we were to run the workflow again
(with 100 samples instead of 10), we would continue running the 10 samples first!
This is because the queues are still filled with the previous attempt's tasks.
We need to purge these queues first in order to repopulate them with the appropriate
tasks. This is where we use the ``merlin purge`` command:

.. code-block:: bash

   $ merlin purge openfoam_wf.yaml

Now we are free to repopulate the queues with the 100 samples:
.. code-block:: bash

   $ merlin run openfoam_wf.yaml --vars N_SAMPLES=100

   $ merlin run-workers openfoam_wf.yaml

To see your results, look inside the ``learn`` output directory. You should see a png that looks like this:

.. image:: prediction.png
   :align: center


.. admonition:: Related articles

   * https://cfd.direct/openfoam/user-guide/v6-cavity/
   * https://www.cfdengine.com/blog/how-to-install-openfoam-anywhere-with-docker/
diff --git a/docs/source/server/commands.rst b/docs/source/server/commands.rst
deleted file mode 100644
index fc40bf182..000000000
--- a/docs/source/server/commands.rst
+++ /dev/null
@@ -1,94 +0,0 @@
Merlin Server Commands
======================

Merlin server has a list of commands for interacting with the broker and results server.
These commands allow the user to manage and monitor the existing server and create
new instances of servers if needed.

Initializing Merlin Server (``merlin server init``)
---------------------------------------------------
The merlin server init command creates configurations for merlin server commands.

A main merlin server configuration subdirectory is created in "~/.merlin/server", which contains
the local merlin configuration as well as configurations for the different containerized
services that merlin server supports; currently this is singularity (docker and podman will be implemented
in the future).

A local merlin server configuration subdirectory called "merlin_server/" will also
be created when this command is run. This will contain a container for merlin server and associated
configuration files that might be used to start the server. For example, for a redis server a "redis.conf"
will contain settings which will be dynamically loaded when the redis server is run. This local configuration
will also contain information about currently running containers.

Note: If there is an existing subdirectory containing a merlin server configuration, then only
missing files will be replaced. However, it is recommended that users back up their local configurations.


Checking Merlin Server Status (``merlin server status``)
--------------------------------------------------------

Displays the current status of the merlin server.

Starting up a Merlin Server (``merlin server start``)
-----------------------------------------------------

Starts the container located in the local merlin server configuration.

.. note::
   If this command seems to hang and never release control back to you, follow these steps:

   1. Kill the command with ``Ctrl+C``
   2. Run either ``export LC_ALL="C.UTF-8"`` or ``export LC_ALL="C"``
   3. Re-run the ``merlin server start`` command

Stopping an existing Merlin Server (``merlin server stop``)
-----------------------------------------------------------

Stops any existing container being managed and monitored by merlin server.

Restarting a Merlin Server instance (``merlin server restart``)
---------------------------------------------------------------

Restarts an existing container that is being managed and monitored by merlin server.

Configuring a Merlin Server instance (``merlin server config``)
---------------------------------------------------------------
Placeholder for information regarding the merlin server config command.

Possible Flags

.. code-block:: none

   -ip IPADDRESS, --ipaddress IPADDRESS
                         Set the bound IP address for the merlin server
                         container. (default: None)
   -p PORT, --port PORT  Set the bound port for the merlin server container.
                         (default: None)
   -pwd PASSWORD, --password PASSWORD
                         Set the password file to be used for the merlin server
                         container. (default: None)
   --add-user ADD_USER ADD_USER
                         Create a new user for the merlin server instance. (Provide
                         both username and password) (default: None)
   --remove-user REMOVE_USER
                         Remove an existing user. (default: None)
   -d DIRECTORY, --directory DIRECTORY
                         Set the working directory of the merlin server
                         container. (default: None)
   -ss SNAPSHOT_SECONDS, --snapshot-seconds SNAPSHOT_SECONDS
                         Set the number of seconds merlin server waits before
                         checking if a snapshot is needed. (default: None)
   -sc SNAPSHOT_CHANGES, --snapshot-changes SNAPSHOT_CHANGES
                         Set the number of changes that are required to be made
                         to the merlin server before a snapshot is made.
                         (default: None)
   -sf SNAPSHOT_FILE, --snapshot-file SNAPSHOT_FILE
                         Set the snapshot filename for database dumps.
                         (default: None)
   -am APPEND_MODE, --append-mode APPEND_MODE
                         The appendonly mode to be set. The available options
                         are always, everysec, no. (default: None)
   -af APPEND_FILE, --append-file APPEND_FILE
                         Set the append-only filename for the merlin server
                         container. (default: None)

diff --git a/docs/source/server/configuration.rst b/docs/source/server/configuration.rst
deleted file mode 100644
index 84429c079..000000000
--- a/docs/source/server/configuration.rst
+++ /dev/null
@@ -1,75 +0,0 @@
Merlin Server Configuration
===========================

Below is a sample list of configurations for the merlin server command.

Main Configuration ``~/.merlin/server/``
----------------------------------------

merlin_server.yaml

.. code-block:: yaml

   container:
       # Select the format for the recipe e.g. singularity, docker, podman (currently singularity is the only working option.)
       format: singularity
       # The image name
       image: redis_latest.sif
       # The url to pull the image from
       url: docker://redis
       # The config file
       config: redis.conf
       # Subdirectory name to store configurations Default: merlin_server/
       config_dir: merlin_server/
       # Process file containing information regarding the redis process
       pfile: merlin_server.pf

   process:
       # Command for determining the process of the command
       status: pgrep -P {pid} #ps -e | grep {pid}
       # Command for killing process
       kill: kill {pid}


singularity.yaml

.. code-block:: yaml

   singularity:
       command: singularity
       # init_command: \{command} .. (optional or default)
       run_command: \{command} run {image} {config}
       stop_command: kill # \{command} (optional or kill default)
       pull_command: \{command} pull {image} {url}


Local Configuration ``merlin_server/``
--------------------------------------

redis.conf

.. code-block:: yaml

   bind 127.0.0.1 -::1
   protected-mode yes
   port 6379
   logfile ""
   dir ./
   ...

See the documentation on redis configuration `here `_ for more detail.

merlin_server.pf

.. code-block:: yaml

   bits: '64'
   commit: '00000000'
   hostname: ubuntu
   image_pid: '1111'
   mode: standalone
   modified: '0'
   parent_pid: 1112
   port: '6379'
   version: 6.2.6
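Tying the configuration flags above together, a typical (illustrative) reconfiguration of a local instance might look like:

.. code-block:: bash

   # Bind the containerized server to localhost on a custom port,
   # snapshotting every 300 seconds (flags as listed under `merlin server config`)
   merlin server config -ip 127.0.0.1 -p 6380 -ss 300
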
diff --git a/docs/source/spack.rst b/docs/source/spack.rst
deleted file mode 100644
index 93035a807..000000000
--- a/docs/source/spack.rst
+++ /dev/null
@@ -1,131 +0,0 @@
Spack
=====

The virtualenv method is not the only way to install merlin in a
separate python install. The spack method will build python and
all required modules for a specific set of configuration options.
These options include the compiler version, system type and python version.
Merlin will then be installed in this specific version, allowing for
multiple python versions on a single system without the need for a
virtualenv. The py-merlin package builds with python3.6+.


Checkout spack
**************

Get the latest version of spack from GitHub. This is independent from
merlin, so make sure merlin and spack are in separate directories.

.. code:: bash

   git clone https://github.com/spack/spack.git
   # The merlin spack package is in the develop branch
   git checkout develop


Setup spack
***********

cd to the spack directory.

Source the ``setup-env.sh`` or ``setup-env.csh``. This will put spack in
your path and set up module access for later use. This should be done every
time the modules are used.

.. code:: bash

   source ./share/spack/setup-env.sh

Add compilers if you haven't already:

.. code:: bash

   spack compiler add

To see the compilers:

.. code:: bash

   spack compiler list


Build merlin
************

Build merlin; this will take a *long* time, so be prepared to wait. It will
build python and all the python modules merlin needs, including numpy.

.. code:: bash

   spack install py-merlin


The build will be done with the default compiler; in general this is the
newest gcc compiler. You can choose a different compiler by using the ``%``
syntax; this will create an entirely separate build and module.

.. code:: bash

   spack install py-merlin%gcc@7.1.0


A different python version can be specified as part of the package config.
To build merlin with python-3.6.8 you would type:

.. code:: bash

   spack install py-merlin^python@3.6.8

A tree of all of the packages and their dependencies needed to build the
merlin package can be shown by using the spec keyword.

.. code:: bash

   spack spec py-merlin


Activate merlin
***************

To use merlin, activate the module.

.. code:: bash

   spack activate py-merlin

   or

   spack activate py-merlin%gcc@7.1.0

   or

   spack activate py-merlin^python@3.6.8


Load python
***********

The associated python module can then be loaded into your environment; this
will only work if you have sourced the setup-env.sh or setup-env.csh.

.. code:: bash

   module avail python

   example:
   ------ /spack/share/spack/modules/linux-rhel7-x86_64 -------
   python-3.6.8-gcc-8.1.0-4ilk3kn (L)


This will give you a list; the spack version will have a long hash
associated with the name.

.. code:: bash

   module load python-3.6.8--
   e.g.
   module load python-3.6.8-gcc-8.1.0-4ilk3kn

At this point the module-specific python, merlin, maestro and celery will
all be in your path.
diff --git a/docs/source/virtualenv.rst b/docs/source/virtualenv.rst
deleted file mode 100644
index 2eb33606f..000000000
--- a/docs/source/virtualenv.rst
+++ /dev/null
@@ -1,54 +0,0 @@
Virtual environments
====================

This section provides a quick reference for using
`virtual environments `_ for the Merlin project.


Creating a virtual environment
++++++++++++++++++++++++++++++

To create a new virtual environment:

.. code:: bash

   $ python3 -m venv venv

.. caution:: A virtual environment will need to be created for each system type. It's
   recommended to name the virtual environment `venv_` to make it easier to
   switch between them. This documentation will use `venv` for simplicity to
   reference the virtual environment.
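For example, one illustrative per-system naming scheme:

.. code:: bash

   # One environment per system type, e.g. venv_x86_64
   $ python3 -m venv "venv_$(uname -m)"
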
.. tip:: Virtual environments provide an isolated environment for working on Python
   projects to avoid dependency conflicts.


Activating a Virtualenv
-----------------------

Once the virtual environment is created, it can be activated like so:

.. code:: bash

   $ source venv/bin/activate
   (venv) $

This will set the Python and pip paths to the virtual environment at ``venv/bin/python``
and ``venv/bin/pip`` respectively.

The virtual environment name should now display in the terminal, which means
it is active. Any calls to pip will install to the virtual environment.

.. tip:: To verify that Python and pip are pointing to the virtual environment, run
   ``$ which python`` and ``$ which pip``.


Deactivating a Virtualenv
-------------------------

Virtualenvs can be exited via the following:

.. code:: bash

   (venv) $ deactivate
   $
diff --git a/docs/tutorial/0_prerequisites.md b/docs/tutorial/0_prerequisites.md
new file mode 100644
index 000000000..69dff4509
--- /dev/null
+++ b/docs/tutorial/0_prerequisites.md
@@ -0,0 +1,36 @@
# Before You Start

It will be helpful to have these steps already completed before you
start the tutorial modules:

* Make sure you have [python 3.8](https://www.python.org/downloads/release/python-380/) or newer.

* Make sure you have [pip](https://www.pypi.org/project/pip/) version 22.3 or newer.

    * You can upgrade pip to the latest version with:

        ```bash
        pip install --upgrade pip
        ```

    * OR you can upgrade to a specific version with:

        ```bash
        pip install --upgrade pip==x.y.z
        ```

* Make sure you have [GNU make tools](https://www.gnu.org/software/make/) and [compilers](https://gcc.gnu.org/).

* (OPTIONAL) Install [docker](https://docs.docker.com/install/).

    * Download the OpenFOAM image with:

        ```bash
        docker pull cfdengine/openfoam
        ```

    * Download the redis image with:

        ```bash
        docker pull redis
        ```
diff --git a/docs/tutorial/1_introduction.md b/docs/tutorial/1_introduction.md
new file mode 100644
index 000000000..9422369e9
--- /dev/null
+++ b/docs/tutorial/1_introduction.md
@@ -0,0 +1,242 @@
# Introduction

This module introduces you to Merlin, some of the technology behind it, and how it works.

!!! info "Estimated Time"

    20 minutes

!!! abstract "You Will Learn"

    - What Merlin is and why you might consider it
    - Why it was built and what are some target use cases
    - How it is designed and what the underlying tech is

## What is Merlin?

!!! abstract "Summary"

    Merlin is a toolkit designed to enable HPC-focused simulation workflows with distributed cloud compute technologies. This helps simulation workflows push to immense scale. (Like [100 million](https://arxiv.org/abs/1912.02892).)

At its core, Merlin translates a text-based, command-line focused workflow description into a set of discrete tasks. These tasks live on a centralized broker (e.g. a separate server) that persists outside of your HPC batch allocation. Autonomous workers in different allocations (even on different machines) can then connect to this server, pull off and execute these tasks asynchronously.

## Why Merlin? What's the Need?

That sounds complicated. Why would you care to do this?

The short answer: machine learning

The longer answer: machine learning and data science are becoming an integral part of scientific inquiry.
The problem is that machine learning models are data hungry: it takes lots and lots of simulations to train machine learning models on their outputs. Unfortunately, HPC systems were designed to execute a few large hero simulations, not many smaller simulations. Naively pushing standard HPC workflow tools to hundreds of thousands and millions of simulations can lead to some serious problems.

Workflows, applications and machines are becoming more complex, but subject matter experts need to devote time and attention to their applications and often require fine command-line level control. Furthermore, they rarely have the time to devote to learning workflow systems.

With the expansion of data-driven computing, the HPC scientist needs to be able to run more simulations through complex multi-component workflows.

**Merlin targets HPC workflows that require many simulations**. These include:

| Simulation Type | Description |
| ----------------------- | ----------------------- |
| Emulator Building | Running enough simulations to build an emulator (or "surrogate model") of an expensive computer code, such as needed for uncertainty quantification |
| Iterative Sampling | Executing some simulations and then choosing new ones to run based on the results obtained thus far |
| Active Learning | Iterative sampling coupled with emulator building to efficiently train a machine learning model |
| Design Optimization | Using a computer code to optimize a model design, perhaps robustly or under uncertainty |
| Reinforcement Learning | Building a machine learning model by successively exposing it to lots of trials, giving it a reward/penalty for the outcomes of those trials |
| Hierarchical Simulation | Running low-fidelity simulations to inform which higher fidelity simulations to execute |
| Heterogeneous Workflows | Workflows that require different steps to execute on different hardware and/or systems |

Many scientific and engineering problems require running lots of simulations. But accomplishing these tasks effectively in an unstable bleeding edge HPC environment can be dicey. The tricks that work for 100 simulations won't work for [10 thousand](https://doi.org/10.1063/1.4977912), let alone [100 million](https://arxiv.org/abs/1912.02892).

We made Merlin to make high-frequency extreme scale computing easy.

## How Can Merlin Run so Many Simulations?

The good news is that distributed cloud compute technology has really pushed the frontier of scalability. Merlin helps bring this tech to traditional scientific HPC.

Traditionally, HPC workflow systems tie workflow steps to HPC resources and coordinate the execution of tasks and management of resources in one of two ways:

| Coordination Model | Characteristics |
| ------------------ | --------------- |
| **External Coordination**<br>![External coordination](../assets/images/tutorial/introduction/external_coordination.png) | - Separate batch jobs for each task<br>- External daemon tracks dependencies and jobs<br>- Progress monitored with periodic polling (of files or batch system) |
| **Internal Coordination**<br>![Internal coordination](../assets/images/tutorial/introduction/internal_coordination.png) | - Multiple tasks bundled into larger batch jobs<br>- Internal daemon tracks dependencies and resources<br>- Progress monitored via polling (of filesystem or message passing) |

**External coordination** ties together independent batch jobs, each executing workflow sub-tasks, with an external monitor. This monitor could be a daemon or human that monitors either the batch or file system via periodic polling and orchestrates task launch dependencies.

External coordination can tailor the resources to the task, but cannot easily run lots of concurrent simulations (since batch systems usually limit the number of jobs a user can queue at once).

**Internal coordination** puts the monitor within a larger batch job that allocates resources inside that job for the specific tasks at hand.

Internal coordination can run many more concurrent tasks by bundling smaller jobs into larger jobs, but cannot tailor the resources to the task at hand. This precludes workflows that, for instance, require one step on CPU hardware and another on a GPU machine.

Instead of tying resources to tasks, Merlin does this:

| Coordination Model | Characteristics |
| ------------------ | --------------- |
| **Centralized Coordination of Producers & Consumers**<br>![Central Coordination](../assets/images/tutorial/introduction/central_coordination.png) | - Batch jobs and workers decoupled from tasks<br>- Centralized queues visible to multiple jobs<br>- Progress and dependencies handled via direct worker connections to a central message server and results database |

Merlin decouples workflow tasks from workflow resources.

Merlin avoids a command-and-control approach to HPC resource management for a workflow. Instead of having the workflow coordinator ask for and manage HPC resources and tasks, the Merlin coordinator just manages tasks. Task-agnostic resources can then independently connect (and disconnect) to the coordinator.
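
Before looking at Merlin's concrete commands, it can help to see the producer-consumer pattern in miniature. The sketch below is purely illustrative and is not Merlin code: it runs in a single Python process, with `queue.Queue` standing in for the central broker. The point is that producers and consumers only ever talk to the queue, never to each other:

```python
import queue
import threading

# The central queue stands in for Merlin's broker: producers and
# consumers never talk to each other directly, only to the queue.
task_queue = queue.Queue()

def producer(n_tasks):
    # Analogous to `merlin run`: describe the work and enqueue it.
    for i in range(n_tasks):
        task_queue.put(f"simulation {i}")

def consumer(worker_id):
    # Analogous to `merlin run-workers`: pull tasks until none remain.
    while True:
        try:
            task = task_queue.get(timeout=1)
        except queue.Empty:
            return
        print(f"worker {worker_id} executing {task}")
        task_queue.task_done()

producer(10)
workers = [threading.Thread(target=consumer, args=(i,)) for i in range(3)]
for worker in workers:
    worker.start()
for worker in workers:
    worker.join()
```

In Merlin the queue lives on a central server rather than in-process, which is what lets producers and consumers come and go across machines and batch allocations.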

In Merlin, this **producer-consumer workflow** happens through two commands:

=== "Producer"

    ```bash
    merlin run SPECIFICATION
    ```

=== "Consumer"

    ```bash
    merlin run-workers SPECIFICATION
    ```

The `merlin run` command populates the central queue(s) with work to do and the `merlin run-workers` command drains the queue(s) by executing the task instructions. Each new instance of `merlin run-workers` creates a new consumer. These consumers can exist on different machines in different batch allocations, anywhere that can see the central server. Likewise, `merlin run` can populate the queue from any system that can see the queue server, including other workers. In principle, this means a researcher can push new work onto an already-running batch allocation of workers, or redirect running workers toward higher-priority work.

!!! info "The Benefits of Producer-Consumer Workflows"

    The increased flexibility that comes from decoupling *what* HPC applications you run from *where* you run them can be extremely enabling.

    Merlin allows you to:

    - Scale to very large numbers of simulations by avoiding common HPC bottlenecks
    - Automatically take advantage of free nodes to process your workflow faster
    - Create iterative workflows, such as those needed for active machine learning
    - Dynamically add more tasks to already-running jobs
    - Have cross-machine and cross-batch-job workflows, with different steps executing on different resources, but still coordinated

The producer-consumer approach to workflows allows for increased flexibility and scalability. For this reason it has become a mainstay of cloud-compute microservices, which allow for extremely distributed asynchronous computing.

Many asynchronous task and workflow systems exist, but the majority are focused around this microservices model, where a system is set up (and managed) by experts that build a single workflow. This static workflow gets tested and hardened and exists as a service for their users (e.g. an event on a website triggers a discrete set of tasks). HPC, and in particular *scientific* HPC, brings its own set of challenges that make a direct application of microservices to HPC workflows challenging.

Some challenges for bringing microservices to scientific HPC workflows include:

| Challenge | Requirement |
| --------- | ----------- |
| Workflows can change from day-to-day as researchers explore new simulations, configurations, and questions | *Workflows need to be dynamic, not static* |
| Workflow components are usually different executables, pre- and post-processing scripts and data aggregation steps written in different languages | *Workflows need to intuitively support multiple languages* |
| These components often need command-line-level control of task instructions | *Workflows need to support shell syntax and environment variables* |
| Components frequently require calls to a batch system scheduler for parallel job execution | *Workflows need a natural way to launch parallel jobs that use more resources than a single worker* |
| Tasks can independently create large quantities of data | *Dataflow models could be bottlenecks. Workflows should take advantage of parallel file systems* |
| HPC systems (in particular leadership-class machines) can experience unforeseen outages | *Workflows need to be able to restart, retry and rerun failed steps without needing to run the entire workflow* |

Merlin was built specifically to address the challenges of porting microservices to HPC simulations.

## So What Exactly Does Merlin Do?

Merlin wraps a heavily tested and well-used asynchronous task-queuing library in a skin and syntax that is natural for HPC simulations. In essence, we extend [Maestro](https://maestrowf.readthedocs.io/en/latest/) by hooking it up to [Celery](https://docs.celeryproject.org/en/latest/index.html). We leverage Maestro's HPC-friendly workflow description language and translate it to discrete Celery tasks.

Why not just plain Celery?

Celery is extremely powerful, but this power can be a barrier for many science and engineering subject matter experts, who might not be Python coders. While this may not be an issue for web developers, it presents a serious challenge to many scientists who are used to running their code from a shell command line. By wrapping Celery commands in Maestro steps, we not only create a familiar environment for users (since Maestro steps look like shell commands), but we also create structure around Celery dependencies. Maestro also has interfaces to common batch schedulers (e.g. [Slurm](https://slurm.schedmd.com/documentation.html) and [Flux](http://flux-framework.org))[^1] for parallel job control.

So why Merlin and not just plain Maestro?

The main reason: to run lots of simulations for machine learning applications. Basically, **Merlin scales Maestro.**

Maestro follows an external coordinator model. Maestro workflow DAGs (directed acyclic graphs) need to be unrolled (concretized) ahead of time, so that batch dependencies can be calculated and managed. This graph problem becomes very expensive as the number of tasks approaches a few hundred. (Not to mention most batch systems will prevent a user from queuing more than a few hundred concurrent batch jobs.) In other words, using Maestro alone to run thousands of simulations is not practical.

But with Celery, we can *dynamically* create additional tasks. This means that the DAG can get unrolled by the very same workers that will execute the tasks, offering a natural parallelism (i.e. much less waiting before starting the work).

What does this mean in practice?

*Merlin can quickly queue a lot of simulations.*

How quickly? The figure below shows task queuing rates when pushing [a simple workflow](3_hello_world.md) on the [Quartz Supercomputer](https://hpc.llnl.gov/hardware/platforms/Quartz) to 40 million samples. This measures how quickly simulation ensembles of various sample sizes can get enqueued.

![Task Creation Rate](../assets/images/tutorial/introduction/task_creation_rate.png)

As you can see, by exploiting Celery's dynamic task queuing (tasks that create tasks), Merlin can enqueue hundreds of thousands of simulations per second. These jobs can then be consumed in parallel, at a rate that depends on the number of workers you have.

Furthermore, this ability to dynamically add tasks to the queue means that workflows can become more flexible and responsive. A worker executing a step can launch additional workflows without having to stand up resources to execute and monitor those additional steps.
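
To make "tasks that create tasks" concrete, here is a minimal Celery sketch. This is not Merlin's actual implementation; the task names, broker URL, and chunk size are invented for illustration. The idea is that a single seed task splits a sample range in half and re-enqueues itself, so the unrolling is spread across however many workers are connected:

```python
from celery import Celery

app = Celery("demo", broker="redis://localhost:6379/0")

@app.task
def run_sample(sample_id):
    # Stand-in for one simulation; a Merlin worker would run the
    # step's shell command in this sample's workspace.
    return f"sample {sample_id} finished"

@app.task
def expand(start, stop, chunk=1000):
    # A task that creates tasks: large ranges are split and
    # re-enqueued instead of being expanded by a single producer.
    if stop - start <= chunk:
        for i in range(start, stop):
            run_sample.delay(i)
    else:
        mid = (start + stop) // 2
        expand.delay(start, mid)
        expand.delay(mid, stop)

# One message seeds a million samples; connected workers do the rest:
# expand.delay(0, 1_000_000)
```

This is the same trick footnote 2 at the bottom of this page describes for Merlin's task graph.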

The only downside to being able to enqueue work this quickly is the inability of batch schedulers to keep up. This is why we recommend pairing Merlin with [Flux](http://flux-framework.org), which results in a scalable but easy-to-use workflow system:

- Maestro describes the workflow tasks
- Merlin orchestrates the task executions
- Flux schedules the HPC resources

Here's an example of how Merlin, Maestro, and Flux can all work together to launch a workflow on multiple machines.

![Merlin Architecture](../assets/images/merlin_arch.png)

The scientist describes her workflow with a Maestro-like specification. Her workflow consists of two steps:

1. Run many parallel CPU-only jobs, varying her simulation parameters of interest
2. Use a GPU to train a deep learning model on the results of those simulations

She then types `merlin run SPECIFICATION`, which translates that Maestro spec into Celery commands and sends those tasks to two separate queues on a centralized server (one for CPU work and one for GPU work).

She then launches a batch allocation on the CPU machine, which contains the command `merlin run-workers SPECIFICATION --steps 1`. Workers start up under Flux, pull work from the server's CPU queue, and call Flux to launch the parallel simulations asynchronously.

She also launches a separate batch request on the GPU machine with `merlin run-workers SPECIFICATION --steps 2`. These workers connect to the central queue associated with the GPU step.

When the simulations in step 1 finish, step 2 will automatically start. In this fashion, Merlin allows the scientist to coordinate a highly scalable, asynchronous, multi-machine, heterogeneous workflow.

This is of course a simple example, but it does show how the producer-consumer philosophy in HPC workflows can be quite enabling. Merlin's goal is to make it easy for HPC-focused subject matter experts to take advantage of the advances in cloud computing.


## How is it Designed?

Merlin leverages a number of open source technologies, developed and battle-hardened in the world of distributed computing. We decided to do this instead of building, testing, and maintaining stand-alone, customized (and likely buggy) versions of software that would not be as fully featured.

There are differing philosophies on how much third-party software to rely upon. On the one hand, building our system off ubiquitous open source message passing libraries increases the confidence in our software stack's performance, especially at scale (for instance, Celery is robust enough to [keep Instagram running](https://scaleyourapp.com/instagram-architecture-how-does-it-store-search-billions-of-images/)). However, doing so means that when something breaks deep down, it can be difficult to fix (if at all). Indeed, if there's an underlying "feature" that we'd like to work around, we could be stuck. Furthermore, the complexity of the software stack can be quite large, such that our team couldn't possibly keep track of it all. These are valid concerns; however, we've found it much easier to quickly develop a portable system with a small team by treating (appropriately chosen) third-party libraries as underlying infrastructure. (Sure, you *could* build and use your own compiler, but *should* you?)

Merlin manages the increased risk that comes with relying on software that is out of our control by:

1. Building modular software that can easily be reconfigured or swapped for other tech
2. Participating as developers in those third-party packages upon which we rely (for instance, we often contribute enhancements and bug fixes to Maestro)
3. Using continuous integration and testing to catch more errors as they occur

This section talks about some of those underlying technologies, what they are, and why they were chosen.

*A brief technical dive into some underlying tech*

Merlin extends [Maestro](https://maestrowf.readthedocs.io/en/latest/) with [Celery](https://docs.celeryproject.org/en/latest/index.html), which in turn can be configured to interface with a variety of [message queue brokers and results backends](https://docs.celeryq.dev/en/stable/getting-started/backends-and-brokers/index.html). In practice, we like to use [RabbitMQ](https://www.rabbitmq.com) and [Redis](https://redis.io) for our broker and backend respectively, because of their features and reliability, especially at scale.

| Component | Reasoning |
| --------- | --------- |
| [Maestro](https://maestrowf.readthedocs.io/en/latest/) | Shell-like workflow descriptions, batch system interfaces |
| [Celery](https://docs.celeryproject.org/en/latest/index.html) | Highly scalable, supports multiple brokers and backends |
| [RabbitMQ](https://www.rabbitmq.com) | Resilience, support for multiple users and queues |
| [Redis](https://redis.io) | Database speed, scalability |
| [Cryptography](https://cryptography.io/en/latest/) | Secure Redis results |
| [Flux](http://flux-framework.org) | Portability and scalability of HPC resource allocation |

The different components interact to populate and drain the message queue broker of workflow tasks.

![Merlin Run Flowchart](../assets/images/tutorial/introduction/merlin_run.png)

When a call is made to `merlin run`, Maestro turns the workflow description (composed of "steps" with "parameters" and "samples") into a task dependency graph. Merlin translates this graph into discrete Celery tasks[^2].

Calls to `merlin run-workers` cause Celery workers to connect to both the message broker and results database. The workers pull tasks from the broker and begin to execute the instructions therein. When finished, a worker posts the results (task status metadata, such as "SUCCESS" or "FAIL") to the results database and automatically grabs another task from the queue. When additional workers come along (through other explicit calls to `merlin run-workers`), they connect to the broker and help out with the workflow.

*Multiple vs. Single Queues*

RabbitMQ brokers can have multiple distinct queues. To take advantage of this feature, Merlin lets you assign workflow steps and workers to different queues. (Steps must be assigned to a single queue, but workers can connect to multiple queues at once.) The advantage of a single queue is simplicity, both in workflow design and scalability. However, having multiple queues allows for prioritization of work (like the express checkout lane at the grocery store) and customization of workers (like specialized assembly-line workers tailored for specific tasks).


## What is in This Tutorial?

This tutorial will show you how to:

- [Install Merlin](2_installation.md) and test that it works correctly.
- [Build a basic workflow](3_hello_world.md) and scale it up, introducing you to Merlin's syntax and how it differs from Maestro.
- [Run a "real" physics simulation](4_run_simulation.md)-based workflow, with post-processing of results, visualization, and machine learning.
- [Use Merlin's advanced features](5_advanced_topics.md) to do things like interface with batch systems, distribute a workflow across machines, and dynamically add new samples to a running workflow.
- [Contribute to Merlin](6_contribute.md) through code enhancements and bug reports.
- [Port your own application](7_port_application.md), with tips and tricks for building and scaling up workflows.


[^1]: The Flux and Slurm interfaces used by Merlin differ from the versions bundled with Maestro to decouple job launching from batch submission.
[^2]: Technically, Merlin creates Celery tasks that will break up the graph into subsequent tasks (tasks to create tasks). This improves scalability with parallel task creation.

diff --git a/docs/tutorial/2_installation.md b/docs/tutorial/2_installation.md
new file mode 100644
index 000000000..5883a8946
--- /dev/null
+++ b/docs/tutorial/2_installation.md
@@ -0,0 +1,516 @@

# Installation

!!! info "Prerequisites"

    - shell (bash, csh, etc.; if running on Windows, use a Linux container)
    - Python 3 (>= 3.8)
    - pip3
    - wget
    - build tools (make, C/C++ compiler)
    - (OPTIONAL) Docker (required for [4. Run a Real Simulation](4_run_simulation.md))
    - (OPTIONAL) file editor for Docker config file editing

!!! info "Estimated Time"

    20 minutes

!!! abstract "You Will Learn"

    - How to install Merlin in a virtual environment using pip.
    - How to install a container platform, e.g. [Singularity](https://docs.sylabs.io/guides/latest/user-guide/), [Docker](https://www.docker.com/), or [Podman](https://podman.io/).
    - How to configure Merlin.
    - How to test/verify the installation.

This section details the steps necessary to install Merlin and its dependencies. Merlin will then be configured for the local machine and the configuration will be checked to ensure a proper installation.


## Installing Merlin

A Merlin installation is required for the subsequent modules of this tutorial.

Once Merlin is installed, it requires servers to operate. In this tutorial we will use Merlin's containerized servers; however, if you prefer to host your own, any Redis server that is accessible from your current machine will work. Your computer or organization may already have a Redis server available that you can use; please check with your local system administrator.

Create a virtualenv using Python 3 to install Merlin:

```bash
python3 -m venv --prompt merlin merlin_venv
```

Activate the virtualenv.

=== "bash"

    ```bash
    source merlin_venv/bin/activate
    ```

=== "csh"

    ```bash
    source merlin_venv/bin/activate.csh
    ```

The ``(merlin)`` prompt prefix will appear once the virtualenv is activated.

You should upgrade pip and setuptools before proceeding:

```bash
pip3 install setuptools pip -U
```

Install Merlin through pip.

=== "Latest Version"

    ```bash
    pip3 install merlin
    ```

=== "Specific Version"

    ```bash
    pip3 install merlin==x.y.z
    ```

Check that Merlin installed correctly:

```bash
which merlin
```

You should see that it was installed in your virtualenv, like so:

!!! success

    ```bash
    /merlin_venv/bin/merlin
    ```

If this is not the output you see, you may need to restart your virtualenv and try again.

You'll need the virtualenv activated for the subsequent steps in the tutorial.
Once you've finished, you can deactivate the virtual environment with:

```bash
deactivate
```

## Redis Server

A [Redis](https://redis.io/) server is required for the [Celery](https://docs.celeryq.dev/en/stable/index.html) results backend; this same server can also be used for the Celery broker. We will be using Merlin's containerized server; however, we will need to download one of the supported container platforms available. For the purpose of this tutorial we will be using [Singularity](https://docs.sylabs.io/guides/latest/user-guide/).

### Installing Singularity

Update and install Singularity dependencies:

```bash
apt-get update && apt-get install -y build-essential libssl-dev uuid-dev libgpgme11-dev squashfs-tools libseccomp-dev pkg-config
```

Download dependency [go](https://go.dev/):

```bash
wget https://go.dev/dl/go1.18.1.linux-amd64.tar.gz
```

Extract `go` into `local`:

```bash
tar -C /usr/local -xzf go1.18.1.linux-amd64.tar.gz
```

Remove the `go` tar file:

```bash
rm go1.18.1.linux-amd64.tar.gz
```

Update `PATH` to include `go`:

```bash
export PATH=$PATH:/usr/local/go/bin
```

Download Singularity:

```bash
wget https://github.com/sylabs/singularity/releases/download/v3.9.9/singularity-ce-3.9.9.tar.gz
```

Extract Singularity:

```bash
tar -xzf singularity-ce-3.9.9.tar.gz
```

Configure and install Singularity:

```bash
cd singularity-ce-3.9.9 && ./mconfig && make -C ./builddir && sudo make -C ./builddir install
```

## Configuring Merlin

Merlin requires a configuration file for the Celery interface in order to know which server(s) to connect to. Run this configuration command to create the `app.yaml` configuration file:

```bash
merlin config --broker redis
```

The `merlin config` command above will create a file called `app.yaml` in the `~/.merlin` directory. If you are running a Redis server locally, then you are all set. Look in the `~/.merlin/app.yaml` file to see the configuration; it should look like the configuration below.

???+ abstract "app.yaml"

    ```yaml
    broker:
        name: redis
        server: localhost
        port: 6379
        db_num: 0

    results_backend:
        name: redis
        server: localhost
        port: 6379
        db_num: 0
    ```

More detailed information on configuring Merlin can be found in the [Configuration](../user_guide/configuration/index.md) page.

## Checking/Verifying Installation

First, launch the Merlin server containers using the `merlin server` commands.

Initialize the server files:

```bash
merlin server init
```

This will create a `merlin_server/` folder in the current run directory. The structure of this folder will look like so:

```bash
merlin_server/
|-- redis.conf
|-- redis.pass
|-- redis.users
`-- redis_latest.sif
```

The files in this folder are:

1. `redis.conf`: The Redis configuration file that contains all of the settings to be used for our Redis server
2. `redis.pass`: A password for the Redis server that we'll start up next
3. `redis.users`: A file defining the users that are allowed to access the Redis server and their permissions
4.
`redis_latest.sif`: A singularity file that contains the latest Redis docker image that was pulled behind the scenes by Merlin + +If you'd like to modify the configuration of your server, you can either modify the files directly or use: + +```bash +merlin server config +``` + +Now that we have the necessary server files initialized, start the server: + +```bash +merlin server start +``` + +With this command, the containerized server should now be started. Notice that two new files were added to the `merlin_server` folder: + +1. `merlin_server.pf`: A process file containing information regarding the Redis process +2. `app.yaml`: A new `app.yaml` file configured specifically for the containerized Redis server that we just started + +To have Merlin read this configuration, copy it to your current run directory: + +```bash +cp merlin_server/app.yaml . +``` + +You can also make this server container your main server configuration by replacing the one located in your home directory. Make sure you make back-ups of your current `app.yaml` file in case you want to use your previous configurations. + +```bash +mv ~/.merlin/app.yaml ~/.merlin/app.yaml.bak +``` + +```bash +cp ./merlin_server/app.yaml ~/.merlin/ +``` + +!!! note + + Since Merlin servers are created locally on your run directory you are allowed to create multiple instances of Merlin server with their unique configurations for different studies. Simply create different directories for each study and run the following command in each directory to create an instance for each: + + ```bash + merlin server init + ``` + +The `merlin info` command will check that the configuration file is installed correctly, display the server configuration strings, and check server access. + +```bash +merlin info +``` + +If everything is set up correctly, you should see: + +???+ success "Expected Output for Successful Config" + + ``` + * + *~~~~~ + *~~*~~~* __ __ _ _ + / ~~~~~ | \/ | | (_) + ~~~~~ | \ / | ___ _ __| |_ _ __ + ~~~~~* | |\/| |/ _ \ '__| | | '_ \ + *~~~~~~~ | | | | __/ | | | | | | | + ~~~~~~~~~~ |_| |_|\___|_| |_|_|_| |_| + *~~~~~~~~~~~ + ~~~*~~~* Machine Learning for HPC Workflows + + + + Merlin Configuration + ------------------------- + + config_file | /path/to/app.yaml + is_debug | False + merlin_home | /path/to/.merlin + merlin_home_exists | True + broker server | redis://default:******@127.0.0.1:6379/0 + broker ssl | False + results server | redis://default:******@127.0.0.1:6379/0 + results ssl | False + + Checking server connections: + ---------------------------- + broker server connection: OK + results server connection: OK + + Python Configuration + ------------------------- + + $ which python3 + /path/to/python3 + + $ python3 --version + Python x.y.z + + $ which pip3 + /path/to/pip3 + + $ pip3 --version + pip x.y.x from /path/to/pip (python x.y) + + "echo $PYTHONPATH" + ``` + +## Docker Advanced Installation (Optional) + +This optional section details the setup of a RabbitMQ server and a Redis TLS (Transport Layer Security) server for Merlin. 
For this section, we'll start with the following `docker-compose.yml` file:

???+ abstract "Initial Docker Compose"

    ```yaml title="docker-compose.yml"
    version: '3'

    networks:
      mernet:
        driver: bridge

    services:
      redis:
        image: 'redis:latest'
        container_name: my-redis
        ports:
          - "6379:6379"
        networks:
          - mernet

      merlin:
        image: 'llnl/merlin'
        container_name: my-merlin
        tty: true
        volumes:
          - ~/merlinu/:/home/merlinu
        networks:
          - mernet
    ```

### RabbitMQ Server

A RabbitMQ server can be started to provide the broker; the Redis server will still be required for the backend. Merlin is configured to use SSL encryption for all communication with the RabbitMQ server. An SSL server requires SSL certificates to encrypt the communication, which Python handles through its [ssl](https://docs.python.org/3/library/ssl.html) module. This tutorial can use self-signed certificates created by the user for the RabbitMQ server. The RabbitMQ server uses TLS (the successor to "Secure Sockets Layer", SSL). Information on RabbitMQ with TLS can be found here: [RabbitMQ TLS](https://www.rabbitmq.com/ssl.html).

A set of self-signed keys is created through the `tls-gen` package. These keys are then copied to a common directory for use in the RabbitMQ server and Python.

```bash
git clone https://github.com/michaelklishin/tls-gen.git
cd tls-gen/basic
make CN=my-rabbit CLIENT_ALT_NAME=my-rabbit SERVER_ALT_NAME=my-rabbit
make verify
mkdir -p ${HOME}/merlinu/cert_rabbitmq
cp result/* ${HOME}/merlinu/cert_rabbitmq
```

The RabbitMQ docker service can be added to the previous `docker-compose.yml` file:

??? abstract "RabbitMQ Docker Compose"

    ```yaml title="docker-compose.yml"
    version: '3'

    networks:
      mernet:
        driver: bridge

    services:
      redis:
        image: 'redis:latest'
        container_name: my-redis
        ports:
          - "6379:6379"
        networks:
          - mernet

      rabbitmq:
        image: rabbitmq:3-management
        container_name: my-rabbit
        tty: true
        ports:
          - "15672:15672"
          - "15671:15671"
          - "5672:5672"
          - "5671:5671"
        environment:
          - RABBITMQ_SSL_CACERTFILE=/cert_rabbitmq/ca_certificate.pem
          - RABBITMQ_SSL_KEYFILE=/cert_rabbitmq/server_key.pem
          - RABBITMQ_SSL_CERTFILE=/cert_rabbitmq/server_certificate.pem
          - RABBITMQ_SSL_VERIFY=verify_none
          - RABBITMQ_SSL_FAIL_IF_NO_PEER_CERT=false
          - RABBITMQ_DEFAULT_USER=merlinu
          - RABBITMQ_DEFAULT_VHOST=/merlinu
          - RABBITMQ_DEFAULT_PASS=guest
        volumes:
          - ~/merlinu/cert_rabbitmq:/cert_rabbitmq
        networks:
          - mernet

      merlin:
        image: 'llnl/merlin'
        container_name: my-merlin
        tty: true
        volumes:
          - ~/merlinu/:/home/merlinu
        networks:
          - mernet
    ```

When running the RabbitMQ broker server, the config can be created with the default `merlin config` command. If you have already run the previous command, then remove the `~/.merlin/app.yaml` or `~/merlinu/.merlin/app.yaml` file and run the `merlin config` command again.

```bash
merlin config
```

The `app.yaml` file will need to be edited to add the RabbitMQ settings in the broker section. The `server:` field should be changed to `my-rabbit`. The RabbitMQ server will be accessed on the default TLS port, 5671.

???+ abstract "RabbitMQ app.yaml"

    ```yaml title="app.yaml"
    broker:
        name: rabbitmq
        server: my-rabbit
        password: ~/.merlin/rabbit.pass

    results_backend:
        name: redis
        server: my-redis
        port: 6379
        db_num: 0
    ```

To complete the config, create a password file:

```bash
touch ~/merlinu/.merlin/rabbit.pass
```

Then open the file and add the password `guest`.

The aliases defined previously can be used with this set of docker containers.

### Redis TLS Server

This optional section details the setup of a Redis server with TLS for Merlin. The Redis TLS configuration can be found in the [Security With Redis](../user_guide/configuration/external_server.md#security-with-rediss_1) section. A newer Redis (version 6 or greater) must be used to enable TLS.

A set of self-signed keys is created through the `tls-gen` package. These keys are then copied to a common directory for use in the Redis server and Python.

```bash
git clone https://github.com/michaelklishin/tls-gen.git
cd tls-gen/basic
make CN=my-redis CLIENT_ALT_NAME=my-redis SERVER_ALT_NAME=my-redis
make verify
mkdir -p ${HOME}/merlinu/cert_redis
cp result/* ${HOME}/merlinu/cert_redis
```

The configuration below does not use client verification (`--tls-auth-clients no`), so the SSL files do not need to be defined as shown in the [Security With Redis](../user_guide/configuration/external_server.md#security-with-rediss_1) section.

??? abstract "RabbitMQ & Redis TLS Docker Compose"

    ```yaml title="docker-compose.yml"
    version: '3'

    networks:
      mernet:
        driver: bridge

    services:
      redis:
        image: 'redis'
        container_name: my-redis
        command:
          - --port 0
          - --tls-port 6379
          - --tls-ca-cert-file /cert_redis/ca_certificate.pem
          - --tls-key-file /cert_redis/server_key.pem
          - --tls-cert-file /cert_redis/server_certificate.pem
          - --tls-auth-clients no
        ports:
          - "6379:6379"
        volumes:
          - "~/merlinu/cert_redis:/cert_redis"
        networks:
          - mernet

      rabbitmq:
        image: rabbitmq:3-management
        container_name: my-rabbit
        tty: true
        ports:
          - "15672:15672"
          - "15671:15671"
          - "5672:5672"
          - "5671:5671"
        volumes:
          - "~/merlinu/rabbitmq.conf:/etc/rabbitmq/rabbitmq.conf"
          - "~/merlinu/cert_rabbitmq:/cert_rabbitmq"
        networks:
          - mernet
    ```

The `rabbitmq.conf` file contains the configuration, including SSL, for the RabbitMQ server.

???+ abstract "RabbitMQ Config with SSL"

    ```title="rabbitmq.conf"
    default_vhost = /merlinu
    default_user = merlinu
    default_pass = guest
    listeners.ssl.default = 5671
    ssl_options.cacertfile = /cert_rabbitmq/ca_certificate.pem
    ssl_options.certfile = /cert_rabbitmq/server_certificate.pem
    ssl_options.keyfile = /cert_rabbitmq/server_key.pem
    ssl_options.verify = verify_none
    ssl_options.fail_if_no_peer_cert = false
    ```

Once this docker-compose file is run, the Merlin `app.yaml` file is changed to use the Redis TLS server `rediss` instead of `redis`.

diff --git a/docs/tutorial/3_hello_world.md b/docs/tutorial/3_hello_world.md
new file mode 100644
index 000000000..e5ac6802a
--- /dev/null
+++ b/docs/tutorial/3_hello_world.md
@@ -0,0 +1,478 @@

# Hello, World!

This hands-on module walks through the steps of building and running a simple Merlin workflow.

!!! info "Prerequisites"

    [2. Installation](2_installation.md)

!!! info "Estimated Time"

    30 minutes

!!! abstract "You Will Learn"

    - The components of a Merlin workflow specification.
    - How to run a simple Merlin workflow.
    - How to interpret the results of your workflow.

## Get Example Files

Merlin comes with a built-in command, `merlin example`, to easily get a basic workflow up and running. To see a list of all the examples provided with Merlin, you can run:

```bash
merlin example list
```

For this tutorial we will be using the [`hello`](../examples/hello.md) example. Run the following commands:

```bash
merlin example hello; cd hello/
```

This will create and move into a directory called `hello`, which contains these files:

- `my_hello.yaml` -- this spec file is partially blank. You will fill in the gaps as you follow this module's steps.
- `hello.yaml` -- this is a complete spec without samples. You can always reference it as an example.
- `hello_samples.yaml` -- same as before, but with samples added.
- `make_samples.py` -- this is a small python script that generates samples.
- `requirements.txt` -- this is a text file listing this workflow's python dependencies.

## Specification File

Central to Merlin is something called a specification file, or a "spec" for short. The spec defines all aspects of your workflow. The spec is formatted in yaml. If you're unfamiliar with yaml, it's worth [reading up on](https://www.tutorialspoint.com/yaml/yaml_quick_guide.htm) for a few minutes.

!!! warning

    Stray whitespace can break yaml; make sure your indentation is consistent.

Let's build our spec piece by piece. For each spec section listed below, fill in the blank yaml entries of `my_hello.yaml` with the given material.

### Section: `description`

Just what it sounds like. Name and briefly summarize your workflow.

[](../../merlin/examples/workflows/hello/hello.yaml) lines:1-3

### Section: `global.parameters`

Global parameters are constants that you want to vary across simulations. Steps that contain a global parameter, or that depend on other steps that contain a global parameter, are run for each index over parameter values. The label is the pattern for a filename that will be created for each value. For a more in-depth explanation of what parameters are, consult [Maestro's Docs](https://maestrowf.readthedocs.io/en/latest/Maestro/parameter_specification.html).

[](../../merlin/examples/workflows/hello/hello.yaml) lines:5-11

!!! note

    `%%` is a special token that defines where the value in the label is placed. In this case the parameter labels will be GREET.hello, GREET.hola, etc. The label can take a custom text format, so long as the `%%` token is included so the parameter's value can be substituted in the appropriate place.

So this will give us an English result and a Spanish result. You could add as many more languages as you want, as long as both parameters hold the same number of values.

### Section: `study`

This is where you define workflow steps. While the convention is to list steps as sequentially as possible, the only factor in determining step order is the dependency directed acyclic graph (DAG) created by the `depends` field.

```yaml
study:
- name: step_1
  description: say hello
  run:
    cmd: echo "$(GREET), $(WORLD)!"

- name: step_2
  description: print a success message
  run:
    cmd: print("Hurrah, we did it!")
    depends: [step_1]
    shell: /usr/bin/env python3
```

!!! tip

    The `-` denotes a list item in YAML. To add elements, simply add new elements prefixed with a hyphen.

`$(GREET)` and `$(WORLD)` expand the global parameters separately into their two values.
`$(step_1.workspace)` gets the path to the output workspace of `step_1`. The default value for `shell` is `/bin/bash`. In `step_2` we override this to use Python instead. Steps must be defined as nodes in a DAG, so no cyclical dependencies are allowed. Our step DAG currently looks like this:

<figure markdown>
  ![Basic 2-Step DAG](../assets/images/tutorial/hello_world/dag1.png)
  <figcaption>Basic 2-Step DAG</figcaption>
</figure>

Since our global parameters have 2 values, this is actually what the DAG looks like:

<figure markdown>
  ![2-Step DAG with Parameters](../assets/images/tutorial/hello_world/dag2.png)
  <figcaption>2-Step DAG with Parameters</figcaption>
</figure>

It looks like running `step_2` twice is redundant. Instead of doing that, we can collapse it back into a single step, by having it wait for both parameterized versions of `step_1` to finish. Add `_*` to the end of the step name in `step_1`'s depends entry. In other words, go from this:

```yaml
depends: [step_1]
```

...to this:

```yaml
depends: [step_1_*]
```

Now the DAG looks like this:

<figure markdown>
  ![2-Step Funnel DAG](../assets/images/tutorial/hello_world/dag3.png)
  <figcaption>2-Step Funnel DAG</figcaption>
</figure>
+ +Your full hello world spec `my_hello.yaml` should now look like this (an exact match of `hello.yaml`): + +???+ abstract "Full Hello Spec" + + + [hello.yaml](../../merlin/examples/workflows/hello/hello.yaml) + + +The order of the spec sections doesn't matter. + +!!! note + + At this point, `my_hello.yaml` is still maestro-compatible. The primary difference is that maestro won't understand anything in the `merlin` block, which we will add later. If you want to try it, run: + + ```bash + maestro run my_hello.yaml + ``` + +## Try It! + +First, we'll run Merlin locally. On the command line, run: + +```bash +merlin run --local my_hello.yaml +``` + +If your spec is bugless, you should see a few messages proclaiming successful step completion, like this (for now we'll ignore the warning): + +???+ success + + ``` + * + *~~~~~ + *~~*~~~* __ __ _ _ + / ~~~~~ | \/ | | (_) + ~~~~~ | \ / | ___ _ __| |_ _ __ + ~~~~~* | |\/| |/ _ \ '__| | | '_ \ + *~~~~~~~ | | | | __/ | | | | | | | + ~~~~~~~~~~ |_| |_|\___|_| |_|_|_| |_| + *~~~~~~~~~~~ + ~~~*~~~* Machine Learning for HPC Workflows + + + + [2023-12-19 17:41:02: INFO] Loading specification from path: /path/to/hello.yaml + [2023-12-19 17:41:02: WARNING] Workflow specification missing + encouraged 'merlin' section! Run 'merlin example' for examples. + Using default configuration with no sampling. + [2023-12-19 17:41:02: INFO] OUTPUT_PATH: hello + [2023-12-19 17:41:02: INFO] Study workspace is '/path/to/hello_20231219-174102'. + [2023-12-19 17:41:02: INFO] Reading app config from file /path/to/.merlin/app.yaml + [2023-12-19 17:41:02: INFO] Overriding default celery config with 'celery.override' in 'app.yaml': + visibility_timeout: 86400 + [2023-12-19 17:41:02: INFO] Calculating task groupings from DAG. + [2023-12-19 17:41:02: INFO] Converting graph to tasks. + [2023-12-19 17:41:02: INFO] Launching tasks. + WARNING:celery.backends.redis: + Setting ssl_cert_reqs=CERT_NONE when connecting to redis means that celery will not validate the identity of the redis broker when connecting. This leaves you vulnerable to man in the middle attacks. + + [2023-12-19 17:41:02: INFO] Executing step 'step_1_GREET.hello.WORLD.world' in '/path/to/hello_20231219-174102/step_1/GREET.hello.WORLD.world'... + [2023-12-19 17:41:02: INFO] Execution returned status OK. + [2023-12-19 17:41:02: INFO] Step 'step_1_GREET.hello.WORLD.world' in '/path/to/hello_20231219-174102/step_1/GREET.hello.WORLD.world' finished successfully. + [2023-12-19 17:41:02: INFO] Executing step 'step_1_GREET.hola.WORLD.mundo' in '/path/to/hello_20231219-174102/step_1/GREET.hola.WORLD.mundo'... + [2023-12-19 17:41:02: INFO] Execution returned status OK. + [2023-12-19 17:41:02: INFO] Step 'step_1_GREET.hola.WORLD.mundo' in '/path/to/hello_20231219-174102/step_1/GREET.hola.WORLD.mundo' finished successfully. + [2023-12-19 17:41:02: INFO] Executing step 'step_2' in '/path/to/hello_20231219-174102/step_2'... + [2023-12-19 17:41:02: INFO] Execution returned status OK. + [2023-12-19 17:41:02: INFO] Step 'step_2' in '/path/to/hello_20231219-174102/step_2' finished successfully. + ``` + +Great! But what happened? We can inspect the output directory to find out. + +Look for a directory named `hello_`. That's your output directory. Within, there should be a directory for each step of the workflow, plus one called `merlin_info`. The whole file tree looks like this: + +

<figure markdown>
  ![File Tree for Hello Example](../assets/images/tutorial/hello_world/merlin_output.png)
  <figcaption>File Tree for Hello Example</figcaption>
</figure>
+ +A lot of stuff, right? Here's what it means: + +* The 3 yaml files inside `merlin_info/` are called the provenance specs. They are copies of the original spec that was run, some showing under-the-hood variable expansions. + +* `MERLIN_FINISHED` files indicate that the step ran successfully. + +* `.sh` files contain the command for the step. + +* `.out` files contain the step's stdout. Look at one of these, and it should contain your "hello" message. + +* `.err` files contain the step's stderr. Hopefully empty, and useful for debugging. + +## Run Distributed! + +!!! warning "Important Note" + + Before trying this, make sure you've properly set up your Merlin config file `app.yaml`. If you can run `merlin info` and see no errors you should be good to go. Otherwise, see either the [Configuring Merlin](./2_installation.md#configuring-merlin) section of the installation step in the Tutorial or the [Configuration](../user_guide/configuration/index.md) page for more information. + +Now we will run the same workflow, but in parallel on our task server: + +```bash +merlin run my_hello.yaml +``` + +If your Merlin configuration is set up correctly, you should see something like this: + +!!! success "Output From Sending Tasks to the Server" + + ``` + * + *~~~~~ + *~~*~~~* __ __ _ _ + / ~~~~~ | \/ | | (_) + ~~~~~ | \ / | ___ _ __| |_ _ __ + ~~~~~* | |\/| |/ _ \ '__| | | '_ \ + *~~~~~~~ | | | | __/ | | | | | | | + ~~~~~~~~~~ |_| |_|\___|_| |_|_|_| |_| + *~~~~~~~~~~~ + ~~~*~~~* Machine Learning for HPC Workflows + + + + [2023-12-19 17:45:36: INFO] Loading specification from path: /path/to/hello.yaml + [2023-12-19 17:45:36: WARNING] Workflow specification missing + encouraged 'merlin' section! Run 'merlin example' for examples. + Using default configuration with no sampling. + [2023-12-19 17:45:36: INFO] OUTPUT_PATH: hello + [2023-12-19 17:45:36: INFO] Study workspace is '/path/to/hello_20231219-174536'. + [2023-12-19 17:45:36: INFO] Reading app config from file /path/to/.merlin/app.yaml + [2023-12-19 17:45:36: INFO] Overriding default celery config with 'celery.override' in 'app.yaml': + visibility_timeout: 86400 + [2023-12-19 17:45:36: INFO] Calculating task groupings from DAG. + [2023-12-19 17:45:36: INFO] Converting graph to tasks. + [2023-12-19 17:45:36: INFO] Launching tasks. + WARNING:celery.backends.redis: + Setting ssl_cert_reqs=CERT_NONE when connecting to redis means that celery will not validate the identity of the redis broker when connecting. This leaves you vulnerable to man in the middle attacks. + ``` + +That means we have launched our tasks! Now we need to launch the workers that will complete those tasks. Run this: + +```bash +merlin run-workers my_hello.yaml +``` + +Here's the expected Merlin output message for running workers: + +!!! success "Output From Running Workers" + + ``` + * + *~~~~~ + *~~*~~~* __ __ _ _ + / ~~~~~ | \/ | | (_) + ~~~~~ | \ / | ___ _ __| |_ _ __ + ~~~~~* | |\/| |/ _ \ '__| | | '_ \ + *~~~~~~~ | | | | __/ | | | | | | | + ~~~~~~~~~~ |_| |_|\___|_| |_|_|_| |_| + *~~~~~~~~~~~ + ~~~*~~~* Machine Learning for HPC Workflows + + + + [2023-12-19 17:46:46: INFO] Loading specification from path: /path/to/hello.yaml + [2023-12-19 17:46:46: WARNING] Workflow specification missing + encouraged 'merlin' section! Run 'merlin example' for examples. + Using default configuration with no sampling. 
+ [2023-12-19 17:46:46: INFO] Launching workers from '/path/to/hello.yaml' + [2023-12-19 17:46:46: INFO] Starting workers + [2023-12-19 17:46:46: INFO] Reading app config from file /path/to/.merlin/app.yaml + ``` + +Immediately after that, this will pop up: + +!!! success "Celery Workers Logs" + + ``` + -------------- celery@worker_name.%machine770 v5.3.4 (emerald-rush) + --- ***** ----- + -- ******* ---- Linux-4.18.0-513.9.1.1toss.t4.x86_64-x86_64-with-glibc2.28 2023-12-19 17:46:49 + - *** --- * --- + - ** ---------- [config] + - ** ---------- .> app: merlin:0x2aaab20619e8 + - ** ---------- .> transport: amqps://user:**@server:5671//user + - ** ---------- .> results: redis://user:**@server:6379/0 + - *** --- * --- .> concurrency: 36 (prefork) + -- ******* ---- .> task events: OFF (enable -E to monitor tasks in this worker) + --- ***** ----- + -------------- [queues] + .> [merlin]_merlin exchange=[merlin]_merlin(direct) key=[merlin]_merlin + + + [tasks] + . merlin.common.tasks.add_merlin_expanded_chain_to_chord + . merlin.common.tasks.expand_tasks_with_samples + . merlin.common.tasks.merlin_step + . merlin:chordfinisher + . merlin:queue_merlin_study + . merlin:shutdown_workers + + [2023-12-19 17:46:47,549: INFO] Connected to amqps://user:**@server:5671//user + [2023-12-19 17:46:47,599: INFO] mingle: searching for neighbors + [2023-12-19 17:46:48,807: INFO] mingle: sync with 2 nodes + [2023-12-19 17:46:48,807: INFO] mingle: sync complete + [2023-12-19 17:46:48,835: INFO] celery@worker_name.%machine770 ready. + ``` + +You may not see all of the info logs listed after the Celery C is displayed. If you'd like to see them you can change the Merlin workers' log levels with the `--worker-args` tag: + +```bash +merlin run-workers --worker-args "-l INFO" my_hello.yaml +``` + +The terminal you ran workers in is now being taken over by Celery, the powerful task queue library that Merlin uses internally. The workers will continue to report their task status here until their tasks are complete. + +Workers are persistent, even after work is done. Send a stop signal to all your workers with this command: + +```bash +merlin stop-workers +``` + +...and a successful worker stop will look like this, with the name of specific worker(s) reported: + +!!! success "Successful Worker Stop Output" + ``` + * + *~~~~~ + *~~*~~~* __ __ _ _ + / ~~~~~ | \/ | | (_) + ~~~~~ | \ / | ___ _ __| |_ _ __ + ~~~~~* | |\/| |/ _ \ '__| | | '_ \ + *~~~~~~~ | | | | __/ | | | | | | | + ~~~~~~~~~~ |_| |_|\___|_| |_|_|_| |_| + *~~~~~~~~~~~ + ~~~*~~~* Machine Learning for HPC Workflows + + + + [2020-03-06 09:20:08: INFO] Stopping workers... + [2020-03-06 09:20:08: INFO] Reading app config from file /path/to/.merlin/app.yaml + [2020-03-06 09:20:09: INFO] Overriding default celery config with 'celery.override' in 'app.yaml': + visibility_timeout: 86400 + [2020-03-06 09:20:10: INFO] Sending stop to these workers: ['celery@machine_name.%machine'] + [2020-03-06 09:20:10: WARNING] Got shutdown from remote + ``` + +## Using Samples + +It's a little boring to say "hello world" in just two different ways. Let's instead say hello to many people! + +To do this, we'll need samples. Specifically, we'll change `WORLD` from a global parameter to a sample. While parameters are static, samples are generated dynamically, and can be more complex data types. In this case, `WORLD` will go from being "world" or "mundo" to being a randomly-generated name. + +First, we remove the global parameter `WORLD` so it does not conflict with our new sample. 
Parameters now look like this:

[](../../merlin/examples/workflows/hello/hello_samples.yaml) lines:9-12

Next we'll add two new blocks to our spec: the `env` block and the `merlin` block.

### Section: `env`

To set up custom environment variables and other values that can be used throughout our spec, we need to introduce a new `env` block to our spec file. Any variable defined here will remain constant throughout the spec.

For this example, we'll add the following `env` block:

[](../../merlin/examples/workflows/hello/hello_samples.yaml) lines:5-7

This makes `N_SAMPLES` into a user-defined variable that you can use elsewhere in your spec.

### Section: `merlin`

In addition to the `env` block, we'll also need to add the `merlin` block to our spec:

[](../../merlin/examples/workflows/hello/hello_samples.yaml) lines:27-32

As you may have guessed, the `merlin` block is an exclusively Merlin feature. This block provides a way to generate samples for your workflow. In this case, a sample is the name of a person.

For simplicity we give `column_labels` the name `WORLD`, just like before.

It's also important to note that `$(SPECROOT)` and `$(MERLIN_INFO)` are [Reserved Variables](../user_guide/variables.md#user-variables). The `$(SPECROOT)` variable is a shorthand for the directory path of the spec file and the `$(MERLIN_INFO)` variable is a shorthand for the directory holding the provenance specs and sample generation results. More information on Merlin variables can be found on the [Variables](../user_guide/variables.md) page.

### The `make_samples.py` Script

In the [Get Example Files](#get-example-files) section above we mentioned the `make_samples.py` file. It's good practice to shift larger chunks of code to external scripts, and that's exactly what this file is doing for us. This file will handle our sample generation by randomly selecting names using two external Python libraries: the [Names library](https://pypi.org/project/names/) and the [NumPy library](https://numpy.org/). Let's make sure those libraries are installed now:

```bash
pip3 install -r requirements.txt
```

The `make_samples.py` file should be kept at the same location as your spec file and its contents should look like so:

[make_samples.py](../../merlin/examples/workflows/hello/make_samples.py)

Since our environment variable `N_SAMPLES` is set to 3, the sample-generating command that calls this script in our `merlin` block should churn out 3 different names.

### Running With Samples

Before we run our study, let's take a look at our DAG now that we've added samples:

<figure markdown>
  ![DAG With Samples](../assets/images/tutorial/hello_world/dag4.png)
  <figcaption>DAG With Samples</figcaption>
</figure>

Every sample that's generated in Merlin will run for each parameter set. So, since we have one parameter `GREET` with two values, `hello` and `hola` (two parameter sets), and three sample names, we'll get six different runs of `step_1`.

With the modifications to the `global.parameters` block and the additions of the `env` and `merlin` blocks, your new and improved `my_hello.yaml` should now match `hello_samples.yaml`:

???+ abstract "Full Hello Samples Spec"

    [hello_samples.yaml](../../merlin/examples/workflows/hello/hello_samples.yaml)

Run the workflow again!

Once finished, this is what the insides of `step_1` look like:

<figure markdown>
  ![Successful Step 1 With Samples](../assets/images/tutorial/hello_world/merlin_output2.png)
  <figcaption>Successful Step 1 With Samples</figcaption>
</figure>

Numerically-named directories like `00`, `01`, and `02` are sample directories. Instead of storing sample output in a single flattened location, Merlin stores them in a tree-like sample index, which helps get around file system constraints when working with massive amounts of data.

Lastly, let's flex Merlin's muscle a bit and scale up our workflow to 1000 samples. To do this, you could internally change the value of `N_SAMPLES` in the spec from 3 to 1000. Or you could modify the value at the command line like so:

```bash
merlin run my_hello.yaml --vars N_SAMPLES=1000
```

Don't forget to start your workers if they're not still running:

```bash
merlin run-workers my_hello.yaml
```

Once again, to send a warm stop signal to your workers, run:

```bash
merlin stop-workers
```

Congratulations! You concurrently greeted 1000 friends in English and Spanish!

diff --git a/docs/tutorial/4_run_simulation.md b/docs/tutorial/4_run_simulation.md
new file mode 100644
index 000000000..7e27c746b
--- /dev/null
+++ b/docs/tutorial/4_run_simulation.md
@@ -0,0 +1,331 @@

# Run a Real Simulation

!!! abstract "Summary"

    This module aims to do a parameter study on a well-known benchmark problem for viscous incompressible fluid flow.

!!! info "Prerequisites"

    - [0. Before You Start](./0_prerequisites.md)
    - [2. Installation](./2_installation.md)
    - [3. Hello, World!](./3_hello_world.md)

!!! info "Estimated Time"

    60 minutes

!!! abstract "You Will Learn"

    - How to run the OpenFOAM simulation using Merlin.
    - How to use machine learning on OpenFOAM results using Merlin.

## Introduction

We aim to do a parameter study on the lid-driven cavity problem.

| ![Fig 1. Lid-driven cavity problem setup](../assets/images/tutorial/run_simulation/setup.png) | ![Fig 2. Example of a flow in steady state](../assets/images/tutorial/run_simulation/lid-driven-stable.png) |
| :-------------------------------------: | :---------------------------------------: |
| Fig 1. Lid-driven cavity problem setup | Fig 2. Example of a flow in steady state |

In this problem, we have a viscous fluid within a square cavity that has three no-slip walls and one moving wall (the moving lid). We are interested in learning how varying the viscosity and lid speed affects the average enstrophy and kinetic energy of the fluid after it reaches steady state. We will be using the velocity squared as a proxy for kinetic energy.

This module will be going over:

- Setting up our inputs using the `merlin` block
- Running multiple simulations in parallel
- Combining the outputs of these simulations into an array
- Predictive modeling and visualization

### Before Moving On

Check that the virtual environment with Merlin installed is activated and that your configuration shows no errors:

```bash
merlin info
```

This is covered in more depth in the [Verifying Installation](./2_installation.md#checkingverifying-installation) section of the Installation module and at the [Configuration](../user_guide/configuration/index.md) page.

There are two ways to do this example: with Docker and without Docker. To go through the version with Docker, get the necessary files for this module by running:

```bash
merlin example openfoam_wf ; cd openfoam_wf/
```

For the version without Docker you should run:

```bash
merlin example openfoam_wf_no_docker ; cd openfoam_wf_no_docker/
```

!!! note

    From here on, this tutorial will focus solely on the Docker version of running OpenFOAM. However, the Docker version of this tutorial is almost identical to the no-Docker version. If you're using the no-Docker version, you can still follow along; just check the `openfoam_no_docker_template.yaml` file in each step to see what differs.

In the `openfoam_wf` directory you should see the following:

<figure markdown>
  ![Fig 3. openfoam_wf Directory Structure](../assets/images/tutorial/run_simulation/openfoam_wf_output.png)
  <figcaption>Fig 3. openfoam_wf Directory Structure</figcaption>
</figure>

- `openfoam_wf.yaml` -- this spec file is partially blank. You will fill in the gaps as you follow this module's steps.
- `openfoam_wf_template.yaml` -- this is a complete spec file. You can always reference it as an example.
- `scripts` -- this directory contains all the necessary scripts for this module.
    - We'll be exploring these scripts as we go through the tutorial.
- `requirements.txt` -- this is a text file listing this workflow's python dependencies.

## Specification File

We are going to build a spec file that produces this DAG:

<figure markdown>
  ![Fig 4. OpenFOAM DAG](../assets/images/tutorial/run_simulation/openfoam_dag.png)
  <figcaption>Fig 4. OpenFOAM DAG</figcaption>
</figure>
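
If it helps to see the DAG in spec form before filling in the details, here is a bare-bones sketch of the `study` block this DAG implies. The step names follow the walkthrough below, but every `cmd` body is a placeholder, so treat this as orientation rather than the actual spec:

```yaml
study:
  - name: setup
    description: install Python dependencies and fetch the cavity deck
    run:
      cmd: echo "setup commands go here"

  - name: sim_runs
    description: run one OpenFOAM simulation per sample
    run:
      cmd: echo "simulation commands go here"
      depends: [setup]
      task_queue: simqueue

  - name: combine_outputs
    description: gather every run's outputs into one array
    run:
      cmd: echo "combine commands go here"
      depends: [sim_runs_*]

  - name: learn
    description: train and plot a model from the combined outputs
    run:
      cmd: echo "learning commands go here"
      depends: [combine_outputs]
```

The real spec you'll assemble below adds the `env`, `merlin`, and `resources` blocks around this skeleton.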
+ +**To start, open** `openfoam_wf.yaml` **using your favorite text editor.** + +It should look something like this: + +???+ abstract "Initial Contents of the Spec" + + + [openfoam_wf.yaml](../../merlin/examples/workflows/openfoam_wf/openfoam_wf.yaml) + + +### Variables + +First we specify some variables to make our life easier. Locate the `env` block in our yaml spec: + + +[](../../merlin/examples/workflows/openfoam_wf/openfoam_wf.yaml) lines:9-14 + + +The `OUTPUT_PATH` variable is set to tell Merlin where you want your output directory to be written. The default is the current working directory. + +We'll fill out the next two variables as we go. + +### Samples and Scripts + +One Merlin best practice is to copy any scripts your workflow may use from your `SPECROOT` directory into the `MERLIN_INFO` directory. This is done to preserve the original scripts in case they are modified during the time Merlin is running. We will do that first. We'll put this in the Merlin sample generation section, since it runs before anything else. + +Edit the `samples` section of the `merlin` block to look like the following: + + +```yaml +merlin: + samples: + generate: + cmd: | + cp -r $(SPECROOT)/scripts $(MERLIN_INFO)/ + + # Generates the samples + python $(MERLIN_INFO)/scripts/make_samples.py -n 10 -outfile=$(MERLIN_INFO)/samples + file: $(MERLIN_INFO)/samples.npy + column_labels: [LID_SPEED, VISCOSITY] +``` + +We will be using the scripts directory a lot so we'll set the variable `SCRIPTS` to `$(MERLIN_INFO)/scripts` for convenience. We would also like to have a more central control over the number of samples generated so we'll create an `N_SAMPLES` variable: + + +[](../../merlin/examples/workflows/openfoam_wf/openfoam_wf_template.yaml) lines:9-14 + + +and update the `samples` section of the `merlin` block to be: + + +[](../../merlin/examples/workflows/openfoam_wf/openfoam_wf_template.yaml) lines:17-26 + + +Just like in the [Using Samples](./3_hello_world.md#using-samples) step of the hello world module, we generate samples using the `merlin` block. We are only concerned with how the variation of two initial conditions, lidspeed and viscosity, affects outputs of the system. These are the `column_labels`. The `make_samples.py` script is designed to make log uniform random samples. Now, we can move on to the steps of our study block. + +### Setting Up + +Our first step in our study block is concerned with making sure we have all the required python packages for this workflow. The specific packages are found in the `requirements.txt` file. + +We will also need to copy the lid driven cavity deck from the OpenFOAM docker container and adjust the write controls. This last part is scripted already for convenience. + +Locate the `setup` step in the study block and edit it to look like the following: + + +[](../../merlin/examples/workflows/openfoam_wf/openfoam_wf_template.yaml) lines:37-47 + + +This step does not need to be parallelized so we will assign it to lower concurrency (a setting that controls how many workers can be running at the same time on a single node). + +Locate the `resources` section in the `merlin` block, then edit the concurrency and add the setup step: + +```yaml +resources: + workers: + nonsimworkers: + args: -l INFO --concurrency 1 + steps: [setup] +``` + +The `resources` section of the `merlin` block is where you can control the behavior of your workers. Here we're defining one worker named `nonsimworkers` and providing it with some arguments. 
Now we can move on to the steps of our study block.

### Setting Up

Our first step in our study block is concerned with making sure we have all the required python packages for this workflow. The specific packages are found in the `requirements.txt` file.

We will also need to copy the lid driven cavity deck from the OpenFOAM docker container and adjust the write controls. This last part is scripted already for convenience.

Locate the `setup` step in the study block and edit it to look like the following:

[](../../merlin/examples/workflows/openfoam_wf/openfoam_wf_template.yaml) lines:37-47

This step does not need to be parallelized, so we will assign it to a worker with low concurrency (a setting that controls how many worker processes can be running at the same time on a single node).

Locate the `resources` section in the `merlin` block, then edit the concurrency and add the setup step:

```yaml
resources:
    workers:
        nonsimworkers:
            args: -l INFO --concurrency 1
            steps: [setup]
```

The `resources` section of the `merlin` block is where you can control the behavior of your workers. Here we're defining one worker named `nonsimworkers` and providing it with some arguments. The `-l INFO` option sets its log level to `INFO` (the standard log level) and the `--concurrency 1` option means that only one of these workers can spin up per node.

!!! warning

    If the `--concurrency` option is omitted, the Celery library will default to using however many cores are on the node (see [Celery's docs](https://docs.celeryq.dev/en/stable/userguide/workers.html#concurrency) for more information).

In addition to providing arguments to this worker, we're also telling it to manage the tasks that will be produced by the `setup` step. What this *actually* means is that this worker will watch the `task_queue` associated with the `setup` step. Since we didn't provide a `task_queue` value for this step, the `task_queue` value will default to `merlin` (which you'll be able to see in the `openfoam_wf.expanded.yaml` spec in the `merlin_info/` directory after we run this).

### Running the Simulation

Moving on to the `sim_runs` step, we want to:

1. Copy the cavity deck from the `MERLIN_INFO` directory into each of the current step's subdirectories
2. Edit the default input values (lidspeed and viscosity) in these cavity decks using the `sed` command
3. Run the simulation using the `run_openfoam` executable through the OpenFOAM docker container
4. Post-process the results (also using the `run_openfoam` executable)

This part should look like:

[](../../merlin/examples/workflows/openfoam_wf/openfoam_wf_template.yaml) lines:49-71

This step runs many simulations in parallel, so it will run faster if we assign it a worker with a higher concurrency. Navigate back to the `resources` section in the `merlin` block:

```yaml
resources:
    workers:
        nonsimworkers:
            args: -l INFO --concurrency 1
            steps: [setup]
        simworkers:
            args: -l INFO --concurrency 10 --prefetch-multiplier 1 -Ofair
            steps: [sim_runs]
```

Since we defined a `task_queue` value in the `sim_runs` step, when we tell `simworkers` to watch the `sim_runs` step they will really be watching the `simqueue` queue.

The quantities of interest are the average enstrophy and kinetic energy at each cell. The enstrophy is calculated through an OpenFOAM post-processing function of the flow fields, while the kinetic energy is approximated by using the square of the velocity vector at each grid point. The velocity field is output as a matter of course when running the default solver for this particular problem.

The `run_openfoam` executable calculates the appropriate timestep `deltaT` so that we have a Courant number of less than 1. It also uses the `icoFoam` solver on the cavity decks and gives us VTK files that are helpful for visualizing the flow fields using visualization tools such as [VisIt](https://visit-dav.github.io/visit-website/index.html) or [ParaView](https://www.paraview.org/).

### Combining Outputs

Navigate to the next step in our `study` block, `combine_outputs`. The purpose of this step is to extract the data from each of the simulation runs from the previous step (`sim_runs`) and combine them for future use.

The `combine_outputs.py` script in the `$(SCRIPTS)` directory is provided for convenience and takes two inputs. The first informs it of the base directory of the `sim_runs` directory and the second specifies the subdirectories for each run. The script then goes into each of the directories and combines the velocity and enstrophy for each timestep of each run into a .npz file.
[](../../merlin/examples/workflows/openfoam_wf/openfoam_wf_template.yaml) lines:73-78

The `$(MERLIN_PATHS_ALL)` variable is a [Reserved Variable](../user_guide/variables.md#reserved-variables) that denotes a space-delimited string of all of the sample paths.

This step depends on all the previous step's simulation runs, which is why we have the `_*`. However, it does not need to be parallelized, so we assign it to the `nonsimworkers` in the `workers` section of the `merlin` block.

```yaml
workers:
    nonsimworkers:
        args: -l INFO --concurrency 1
        steps: [setup, combine_outputs]
```

### Machine Learning and Visualization

In the `learn` step, we want to:

1. Post-process the .npz file from the previous step
2. Learn the mapping between our inputs and chosen outputs
3. Graph important features

The provided `learn.py` script does all of the above. It outputs the trained sklearn model and a png of the graphs plotted in the current directory.

[](../../merlin/examples/workflows/openfoam_wf/openfoam_wf_template.yaml) lines:80-85

This step is also dependent on the previous step for the .npz file and will only need one worker, so we will assign it to `nonsimworkers`:

```yaml
nonsimworkers:
    args: -l INFO --concurrency 1
    steps: [setup, combine_outputs, learn]
```

### Putting It All Together

By the end, your `openfoam_wf.yaml` should look like the template version in the same directory:

???+ abstract "Complete Spec File"

    [openfoam_wf_template.yaml](../../merlin/examples/workflows/openfoam_wf/openfoam_wf_template.yaml)

## Run the Workflow

Now that you are done with the specification file, use the following commands from inside the `openfoam_wf` directory to run the workflow on our task server.

!!! note

    Running with fewer samples is one of the best ways to debug

Create the DAG and send tasks to the server with:

```bash
merlin run openfoam_wf.yaml
```

Open a new terminal window, then start the workers that will consume the tasks we just queued by using:

```bash
merlin run-workers openfoam_wf.yaml
```

But wait! We realize that 10 samples is not enough to train a good model. We would like to restart with 100 samples instead of 10 (this should take about 6 minutes).

After sending the workers to start on their queues, we first need to stop the workers:

```bash
merlin stop-workers --spec openfoam_wf.yaml
```

!!! tip

    Using the `--spec` option with the `merlin stop-workers` command will tell Merlin to only stop workers from a specific YAML spec

We stopped these tasks from running, but if we were to run the workflow again (with 100 samples instead of 10), we would continue running the 10 samples first! This is because the queues are still filled with the previous attempt's tasks. This can be seen with:

```bash
merlin status openfoam_wf.yaml
```

We need to purge these queues first in order to repopulate them with the appropriate tasks. This is where we use the `merlin purge` command:

```bash
merlin purge openfoam_wf.yaml
```

Now we are free to repopulate the queues with the 100 samples. In the terminal window that's not designated for our workers, we'll queue up tasks again, this time with 100 samples:

```bash
merlin run openfoam_wf.yaml --vars N_SAMPLES=100
```

Then in our window for workers, we'll execute:

```bash
merlin run-workers openfoam_wf.yaml
```
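While the workers churn through the new tasks, you can optionally keep an eye on the queues draining with the same status command we used above:

```bash
merlin status openfoam_wf.yaml
```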
To see your results, look inside the `learn` output directory. You should see a png that looks like this:

<figure markdown>
  ![Fig 5. Output](../assets/images/tutorial/run_simulation/prediction.png)
  <figcaption>Fig 5. Output</figcaption>
</figure>
!!! info "Related Articles"

    - [OpenFOAM v6 User Guide - 2.1 Lid-driven cavity flow](https://cfd.direct/openfoam/user-guide/v6-cavity/)
    - [The Complete Guide to Docker & OpenFOAM](https://www.cfdengine.com/blog/how-to-install-openfoam-anywhere-with-docker/)

diff --git a/docs/tutorial/5_advanced_topics.md b/docs/tutorial/5_advanced_topics.md
new file mode 100644
index 000000000..93d606322
--- /dev/null
+++ b/docs/tutorial/5_advanced_topics.md
@@ -0,0 +1,430 @@

# Advanced Topics

!!! info "Prerequisites"

    - [0. Before You Start](./0_prerequisites.md)
    - [2. Installation](./2_installation.md)
    - [3. Hello, World!](./3_hello_world.md)
    - [4. Run a Real Simulation](./4_run_simulation.md)
    - Python virtual environment containing the following packages
        - merlin
        - pandas
        - faker

!!! info "Estimated Time"

    15 minutes

!!! abstract "You Will Learn"

    - Run workflows using HPC batch schedulers
    - Distribute workflows across multiple batch allocations and machines
    - Set up iterative workflow specs suited for optimization and dynamic sampling applications

## Setup

The code for the following examples can be obtained from the command line by invoking:

```bash
merlin example hpc_demo
```

This will copy the three Merlin workflow specifications from this section and the supporting Python scripts. Each specification may need some modification to adapt it to the batch scheduler you will be using. The dynamic sampling workflow will also need a modification to set the path of the virtual environment, which is set as a variable in the `env` block.

## Interfacing with HPC Systems

Another block is added to the Merlin workflow specification when running on HPC systems: the `batch` block. This block contains information about the batch scheduler system, such as the batch type, the batch queue to use, and the banks to charge. There are additional optional arguments for addressing any special configurations or launch command arguments, varying based on batch type. In addition, the shell type used by each step's `cmd` scripts can be specified here. While the number of nodes in a batch allocation can be defined here, it will be overridden in the worker config.

```yaml
batch:
    # Required keys:
    type: flux
    bank: testbank
    queue: pbatch

    # Optional keys:
    flux_path:
    flux_start_opts:
    flux_exec_workers:

    launch_pre:
    launch_args:
    worker_launch:

    shell:  # e.g. /bin/bash, /bin/tcsh, python, /usr/bin/env perl, etc.

    nodes:  # The number of nodes to use for all workers.
            # This can be overridden in the workers config.
            # If this is unset the number of nodes will be
            # queried from the environment; failing that, the
            # number of nodes will be set to 1.
```

Inside the study step specifications are a few additional keys that become more useful on HPC systems: `nodes`, `procs`, and `task_queue`. Adding the actual study steps to the above batch block specifies the resources each step's processes will take.

```yaml
study:
    - name: sim-runs
      description: Run simulations
      run:
          cmd: $(LAUNCHER) echo "$(VAR1) $(VAR2)" > simrun.out
          nodes: 4
          procs: 144
          task_queue: sim_queue

    - name: post-process
      description: Post-Process simulations on second allocation
      run:
          cmd: |
              cd $(sim-runs.workspace)/$(MERLIN_SAMPLE_PATH)
              $(LAUNCHER)  # post-processing command elided in this example
          nodes: 1
          procs: 36
          depends: [sim-runs]
          task_queue: post_proc_queue
```
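For context, `$(LAUNCHER)` is a reserved variable that Merlin expands into the scheduler's parallel launch command using the step's `nodes` and `procs` values. As a rough illustration only (the exact command and flags depend on the batch type and your system; the block above uses `flux`), on a Slurm machine the `sim-runs` command could expand to something like:

```bash
# hypothetical expansion of $(LAUNCHER) for a slurm batch type,
# using the step's nodes/procs settings
srun -N 4 -n 144 echo "$(VAR1) $(VAR2)" > simrun.out
```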
In addition to the `batch` block is the `resources` section inside the `merlin` block. This can be used to put together custom Celery workers. Here you can override batch types and node counts on a per-worker basis to accommodate steps with different resource requirements. In addition, this is where the `task_queue` becomes useful, as it groups the different allocation types, which can be assigned to each worker here by specifying step names.

```yaml
merlin:

    resources:
        task_server: celery

        # Flag to determine if multiple workers can pull tasks
        # from overlapping queues. (default = False)
        overlap: False

        # Customize workers. Workers can have any user-defined name
        # (e.g., simworkers, learnworkers, ...)
        workers:
            simworkers:
                args:               # celery worker args (optional)
                steps: [sim-runs]   # [all] if none specified
                nodes: 4            # optional
                machines: [host1]   # optional
```

Arguments to Celery itself can also be defined here with the `args` key. Of particular interest will be:

| Celery argument         | Value       |
| ----------------------- | ----------- |
| `--concurrency`         | num_threads |
| `--prefetch-multiplier` | num_tasks   |
| `-O fair`               |             |

Concurrency can be used to run multiple workers in an allocation, and is thus recommended to be set to the number of simulations or step work items that fit into the number of nodes in the batch allocation in which these workers are spawned.

!!! note

    Some schedulers, such as `flux`, can support more jobs than the node has resources for. This may not impact the throughput, but it can prevent over-subscription errors that might otherwise stop the workflow.

The prefetch multiplier is more related to packing tasks into the time of the allocation. For long running tasks it is recommended to set this to 1. For short running tasks, it can reduce the overhead of talking to the RabbitMQ server by requesting `concurrency x prefetch-multiplier` tasks at a time from the server.

The `-O fair` option enables workers running tasks from different queues to run on the same allocation.

The example block below extends the previous one with workers configured for long running simulation jobs as well as shorter running post-processing tasks that can cohabit an allocation:

```yaml
merlin:

    resources:
        task_server: celery

        overlap: False

        # Customize workers
        workers:
            simworkers:
                args: --concurrency 1
                steps: [sim-runs]
                nodes: 4
                machines: [host1]

            postworkers:
                args: --concurrency 4 --prefetch-multiplier 2
                steps: [post-proc-runs]
                nodes: 1
                machines: [host1]
```

Putting it all together and adding some variables/samples, we have an HPC batch enabled study specification. In this demo workflow, `sample_names` generates many single core jobs, with concurrency set to 36 for this particular machine that has 36 cores per node. The `collect` step, on the other hand, consists of a single job that uses all cores on the node, and is assigned to a queue whose worker has a concurrency of 1.
+ + +???+ abstract "HPC Batch Enabled Study Spec" + + ```yaml + description: + name: hpc_demo + description: Demo running a workflow on HPC machines + + env: + variables: + OUTPUT_PATH: ./name_studies + + # Collect individual sample files into one for further processing + COLLECT: $(SPECROOT)/sample_collector.py + + # Process single iterations' results + POST_PROC: $(SPECROOT)/sample_processor.py + + # Process all iterations + CUM_POST_PROC: $(SPECROOT)/cumulative_sample_processor.py + + # Number of threads for post proc scripts + POST_NPROCS: 36 + PYTHON: + + batch: + type: flux + bank: testbank + queue: pdebug + shell: /bin/bash + nodes: 1 + + ######################################## + # Study definition + ######################################## + study: + - name: sample_names + description: Record samples from the random name generator + run: + cmd: | + $(LAUNCHER) echo "$(NAME)" + $(LAUNCHER) echo "$(NAME)" > name_sample.out + nodes: 1 + procs: 1 + task_queue: name_queue + + - name: collect + description: Collect all samples generated + run: + cmd: | + echo $(MERLIN_GLOB_PATH) + echo $(sample_names.workspace) + + ls $(sample_names.workspace)/$(MERLIN_GLOB_PATH)/name_sample.out | xargs $(PYTHON) $(COLLECT) -out collected_samples.txt --np $(POST_NPROCS) + + nodes: 1 + procs: 1 + depends: [sample_names_*] + task_queue: post_proc_queue + + - name: post-process + description: Post-Process collection of samples, counting occurrences of unique names + run: + cmd: | + $(PYTHON) $(POST_PROC) $(collect.workspace)/collected_samples.txt --results iter_$(ITER)_results.json + + nodes: 1 + procs: 1 + depends: [collect] + task_queue: post_proc_queue + + ######################################## + # Worker and sample configuration + ######################################## + merlin: + + resources: + task_server: celery + + overlap: False + + workers: + nameworkers: + args: --concurrency 36 --prefetch-multiplier 3 + steps: [sample_names] + nodes: 1 + machines: [quartz] + + postworkers: + args: --concurrency 1 --prefetch-multiplier 1 + steps: [post-process] + nodes: 1 + machines: [quartz] + + ################################################### + samples: + column_labels: [NAME] + file: $(MERLIN_INFO)/samples.csv + generate: + cmd: | + $(PYTHON) $(SPECROOT)/faker_sample.py -n 200 -outfile=$(MERLIN_INFO)/samples.csv + ``` + +## Multi-Machine Workflows + +Spreading this workflow across multiple machines requires a small modification of the above workflow: simply add additional host names to the `machines` list in the worker config. A caveat for this feature is that all host systems will need to have access to the same workspace/filesystem. + +The following resource block demonstrates usage of one host for larger simulation steps, and a second host for the smaller post processing steps. In this case you simply need to get an allocation on each host, then execute `run-workers` on each. The `run` command only needs to be run one time from any host to send the tasks to the central task queue server. 
+ +```yaml +######################################## +# Worker and sample configuration +######################################## +merlin: + + resources: + task_server: celery + + overlap: False + + # Customize workers + workers: + simworkers: + args: --concurrency 1 + steps: [sim-runs] + nodes: 4 + machines: [host1] + + postworkers: + args: --concurrency 4 --prefetch-multiplier 2 + steps: [post-proc-runs] + nodes: 1 + machines: [host2] +``` + +## Dynamic Task Queuing and Sampling + +Iterative workflows, such as optimization or machine learning, can be implemented in Merlin via recursive workflow specifications that use dynamic task queuing. The example spec below is a simple implementation of this using an iteration counter `$(ITER)` and a predetermined limit `$(MAX_ITER)` to limit the number of times to generate new samples and spawn a new instantiation of the workflow. The iteration counter takes advantage of the ability to override workflow variables on the command line. + +???+ abstract "Iterative Spec" + + ```yaml + description: + name: dynamic_sampling_demo + description: Demo dynamic sampling workflow + + env: + variables: + OUTPUT_PATH: ./name_studies + ITER_OUTPUT: $(SPECROOT)/$(OUTPUT_PATH)/iter_outputs # Iteration and cumulative results + COLLECT: $(SPECROOT)/sample_collector.py + POST_PROC: $(SPECROOT)/sample_processor.py # Process single iterations' results + CUM_POST_PROC: $(SPECROOT)/cumulative_sample_processor.py # Process all iterations + POST_NPROCS: 36 # Number of threads for post proc scripts + PYTHON: + ITER: 1 + MAX_ITER: 10 + + batch: + type: flux + bank: testbank + queue: pdebug + shell: /bin/bash + nodes: 1 + + ######################################## + # Study definition + ######################################## + study: + - name: sample_names + description: Record samples from the random name generator + run: + cmd: | + $(LAUNCHER) echo "$(NAME)" + $(LAUNCHER) echo "$(NAME)" > name_sample.out + nodes: 1 + procs: 1 + task_queue: name_queue + + - name: collect + description: Collect all samples generated + run: + cmd: | + echo $(MERLIN_GLOB_PATH) + echo $(sample_names.workspace) + + ls $(sample_names.workspace)/$(MERLIN_GLOB_PATH)/name_sample.out | xargs $(PYTHON) $(COLLECT) -out collected_samples.txt --np $(POST_NPROCS) + + nodes: 1 + procs: 1 + depends: [sample_names_*] + task_queue: post_proc_queue + + - name: post-process + description: Post-Process collection of samples, counting occurrences of unique names + run: + cmd: | + $(PYTHON) $(POST_PROC) $(collect.workspace)/collected_samples.txt --results $(ITER_OUTPUT)/iter_$(ITER)_results.json + + nodes: 1 + procs: 1 + depends: [collect] + task_queue: post_proc_queue + + - name: run-more-samples + description: Generate new set of samples and rerun, or generate some descriptive plots/statistics + run: + cmd: | + if [ $(ITER) -ge $(MAX_ITER) ] ; then + echo "done" + $(PYTHON) $(CUM_POST_PROC) $(ITER_OUTPUT)/iter_*_results.json --np $(POST_NPROCS) --hardcopy $(ITER_OUTPUT)/cumulative_results.png + else + next_iter=$(ITER) + ((next_iter=next_iter+1)) + echo "Starting iteration " $next_iter + cd $(SPECROOT) + merlin run $(SPECROOT)/faker_demo.yaml --vars ITER=$next_iter + fi + nodes: 1 + procs: 1 + depends: [post-process] + task_queue: post_proc_queue + + ######################################## + # Worker and sample configuration + ######################################## + merlin: + + resources: + task_server: celery + + overlap: False + + # Customize workers NOTE: abuse this for scaling study: prefetch mult 
increase
        #       - celery->rabbit query overhead for fast jobs
        workers:
            nameworkers:
                args: --concurrency 36 --prefetch-multiplier 3
                steps: [sample_names]
                nodes: 1
                machines: [borax, quartz]

            # NOTE: specifying the wrong step leaves an orphaned queue -> purge first!
            # Also, an invalid host name appears to fail silently.
            postworkers:
                args: --concurrency 1 --prefetch-multiplier 1
                steps: [post-process]
                nodes: 1
                machines: [borax, quartz]

        ###################################################
        samples:
            column_labels: [NAME]
            file: $(MERLIN_INFO)/samples.csv
            generate:
                cmd: |
                    $(PYTHON) $(SPECROOT)/faker_sample.py -n 200 -outfile=$(MERLIN_INFO)/samples.csv
    ```

This workflow specification is intended to be invoked within an allocation of nodes on your HPC cluster, e.g. within an `sxterm`. The last step is to queue up new samples for the next iteration, `merlin run faker_demo.yaml ...`. Notice how we don't need another `run-workers` call since the workers from the first instantiation are still alive. Thus the new samples will immediately start processing on the existing allocation.

Another change in this workflow relative to the single stage version is managing the workspaces and outputs. The strategy used here is to create a new directory for collecting each iteration's final outputs, `$(ITER_OUTPUT)`, facilitating collective post-processing at the end without having to worry about traversing into each iteration's local workspaces.

The workflow itself isn't doing anything practical; it's simply repeatedly sampling from a fake name generator in an attempt to count the number of unique names that are possible.
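Since `ITER` and `MAX_ITER` are ordinary variables in the `env` block, kicking off the first instantiation might look like the following (assuming the spec is saved as `faker_demo.yaml`, the name referenced by the `run-more-samples` step):

```bash
# queue the first iteration's tasks, overriding the iteration counters
merlin run faker_demo.yaml --vars ITER=1 MAX_ITER=10

# start workers once; later iterations reuse them
merlin run-workers faker_demo.yaml
```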
The figure below shows results from running 20 iterations, with the number of unique names faker can generate appearing to be slightly more than 300.

<figure markdown>
  ![Unique Name Results](../assets/images/tutorial/advanced_topics/cumulative_results.png)
  <figcaption>Unique Name Results</figcaption>
</figure>
diff --git a/docs/tutorial/6_contribute.md b/docs/tutorial/6_contribute.md
new file mode 100644
index 000000000..245486745
--- /dev/null
+++ b/docs/tutorial/6_contribute.md
@@ -0,0 +1,112 @@

# Contribute to Merlin

!!! info "Estimated Time"

    10 minutes

!!! abstract "You Will Learn"

    - How to post issues to the Merlin repo.
    - Guidelines for contributing to Merlin.

## Issues

Found a bug? Have an idea? Or just want to ask a question? [Create a new issue](https://github.com/LLNL/merlin/issues/new/choose) on GitHub.

### Bug Reports

To report a bug, navigate to [Issues](https://github.com/LLNL/merlin/issues), click "New Issue", then click "Bug report". Then fill out a few fields such as "Describe the bug" and "Expected behavior". Try to fill out every field, as it will help us figure out your bug sooner.

### Feature Requests

We are still adding new features to Merlin. To suggest one, navigate to [Issues](https://github.com/LLNL/merlin/issues), click "New Issue", then click "Feature request". Then fill out a few fields such as "What problem is this feature looking to solve?"

### Questions

!!! note

    Who knows? Your question may already be answered in the [FAQ](../faq.md).

We encourage questions to be asked in a collaborative setting: on GitHub, direct any questions to [General Questions](https://github.com/LLNL/merlin/issues/new?labels=question&template=question.md&title=%5BQ%2FA%5D+) in Issues.

For more ways to reach out with questions, see the [Contact](../contact.md) page.

## Contributing

Merlin is an open source project, so contributions are welcome. Contributions can be anything from bug fixes and documentation to new core features.

Merlin uses a rough approximation of the Git Flow branching model. The `develop` branch contains the latest contributions, and `main` is always tagged and points to the latest stable release.

If you're a contributor, try to test and run by branching off of `develop`. That's where all the magic is happening (and where we hope bugs stop).

More detailed information on contributing can be found on the [Contributing](../user_guide/contributing.md) page.

### How to Contribute

Contributing to Merlin is as easy as following these steps:

1. [Fork the Merlin repository](https://github.com/LLNL/merlin/fork)

2. [Clone your forked repository](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository)

3. Move into the `merlin/` directory:

    ```bash
    cd merlin/
    ```

4. Ensure you're on the `develop` branch by running:

    ```bash
    git checkout develop
    ```

5. Create a new virtual environment and install an editable version of Merlin with:

    ```bash
    make install-dev
    ```

    !!! note

        The name of this virtual environment will be `venv_merlin_py_x_y` where `x` and `y` represent the Python version that was used to create the virtual environment. For example, if you had Python 3.10.8 loaded when you ran this command, you'd get a virtual environment named `venv_merlin_py_3_10`.

6. Activate the environment that was just created with the below command. You'll have to modify `x` and `y` here to match the Python version that your virtual environment was created with.

    === "bash"

        ```bash
        source venv_merlin_py_x_y/bin/activate
        ```

    === "csh"

        ```csh
        source venv_merlin_py_x_y/bin/activate.csh
        ```

7. Create a new branch for your changes.
    Typically branch names will start with one of the following prefixes: `feature`, `bugfix`, `refactor`, or `docs`. The following command will create a new branch for you and switch to it:

    ```bash
    git switch -c <prefix>/<branch_name>
    ```

8. Make your changes

9. Once you've made all of your changes, run the following command from the root of the repository to ensure the style of your code matches Merlin's standard:

    ```bash
    make fix-style
    ```

10. Ensure all of the tests pass:

    ```bash
    make tests
    ```

11. Summarize your changes in the `[Unreleased]` section in the `CHANGELOG.md` file

12. [Send us a pull request](https://github.com/LLNL/merlin/pulls) from your fork. Make sure you're requesting a pull request from your branch to the `develop` branch on Merlin's home repository.

Happy contributing!

diff --git a/docs/tutorial/7_port_application.md b/docs/tutorial/7_port_application.md
new file mode 100644
index 000000000..64796b6b5
--- /dev/null
+++ b/docs/tutorial/7_port_application.md
@@ -0,0 +1,57 @@

# Port Your Own Application

!!! info "Prerequisites"

    - [2. Installation](./2_installation.md)
    - [3. Hello, World!](./3_hello_world.md)
    - [4. Run a Real Simulation](./4_run_simulation.md)

!!! info "Estimated Time"

    15 minutes

!!! abstract "You Will Learn"

    - Tips for building workflows
    - Tips for scaling
    - Debugging

## Tips For Porting Your App, Building Workflows

The first step of building a new workflow, or porting an existing app to a workflow, is to describe it as a set of discrete, ideally focused, steps. Decoupling the steps and making them generic when possible will facilitate more rapid composition of future workflows. This will also require mapping out the dependencies and parameters that get passed between/shared across these steps.

Setting up a template using tools such as [cookiecutter](https://cookiecutter.readthedocs.io/en/stable/) can be useful for more production-style workflows that will be frequently reused. Additionally, make use of the built-in examples accessible from the Merlin command line with `merlin example`.

Use dry runs (`merlin run --dry --local`) to prototype without actually populating the task broker's queues. Similarly, once the dry run prototype looks good, try it on a small number of parameters before throwing millions at it.

Merlin inherits much of the input language and workflow specification philosophy from [Maestro](https://maestrowf.readthedocs.io/en/latest/). Thus a good first step is to learn to use that tool.

Make use of [exit keys](../user_guide/variables.md#step-return-variables) such as `MERLIN_RESTART` or `MERLIN_RETRY` in your step logic.

## Tips For Debugging Your Workflows

The scripts defined in the workflow steps are also written to the output directories; this is a useful debugging tool as it can both catch parameter and variable replacement errors and provide a quick way to reproduce, edit, and retry the step offline before fixing the step in the workflow specification. The `.out` and `.err` files log all of the output to catch any runtime errors.

If you launch your workers using a bash script with an output file defined, or provide the `--logfile` argument to your workers, the logs from your workers will be sent to a log file. These logs will show the execution logs for every step in your workflow that the Celery workers process. If you're having issues with your workflow, you may need to grep for `WARNING` and `ERROR` in the worker logs to identify the problem.
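For instance, a hypothetical debugging session might look like the following (the spec and log file names are placeholders; `--worker-args` passes the quoted arguments straight to the Celery workers):

```bash
# start workers, sending their logs to a file
merlin run-workers my_spec.yaml --worker-args "--logfile /path/to/worker.log"

# later, search the worker logs for problems
grep -E "WARNING|ERROR" /path/to/worker.log
```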
When a bug crops up in a running study with many parameters, there are a few other commands to make use of. Rather than trying to spam `Ctrl-c` to kill all the workers, you will want to instead use Merlin's built-in command to stop the workers for that workflow:

```bash
merlin stop-workers --spec <spec_name>.yaml
```

This should then be followed up with Merlin's built-in command to clear out the task queue:

```bash
merlin purge <spec_name>.yaml
```

This will prevent the same buggy tasks from continuing to run the next time `run-workers` is invoked.

## Tips For Scaling Workflows

Most of the worst bottlenecks you'll encounter when scaling up your workflow are caused by the file system. These can come from using too much space or creating too many files, even in a single workflow if you're not careful. There is a certain number of inodes created just based upon the sample counts, even without accounting for the steps being executed. This can be mitigated by avoiding reading/writing to the file system when possible. If file creation is unavoidable, you may need to consider adding cleanup steps to your workflow: dynamically pack up the previous step in a tarball, transfer the files to another file system or archival system, or even just delete them.

## Misc Tips

Avoid reliance upon storage at the `$(SPECROOT)` level. This is particularly dangerous if using symlinks, as it can violate the provenance of what was run, possibly ruining the utility of the dataset that was generated. It is preferred to make local copies of any input decks, supporting scripts, and data sets inside the workflow's workspace, as sketched below. This of course has limits regarding shared/system libraries that any programs running in the steps may need; in that case, alternate means of recording this information, such as a log file, may be needed.
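Echoing the pattern from the OpenFOAM tutorial, a sketch of such a copy step in the sample generation section might look like this (`input.deck` is a hypothetical input file):

```yaml
merlin:
    samples:
        generate:
            cmd: |
                # preserve provenance by copying inputs into the workspace
                cp -r $(SPECROOT)/scripts $(MERLIN_INFO)/
                cp $(SPECROOT)/input.deck $(MERLIN_INFO)/
```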
diff --git a/docs/source/tutorial.rst b/docs/tutorial/index.md
similarity index 51%
rename from docs/source/tutorial.rst
rename to docs/tutorial/index.md
index 0b69f9553..f762e88b2 100644
--- a/docs/source/tutorial.rst
+++ b/docs/tutorial/index.md
@@ -1,9 +1,8 @@
-Tutorial
-========
+# Tutorial

-.. admonition:: Estimated time
+!!! info "Estimated Time"

-   * 3 hours
+    3 hours

 Grab your laptop and coffee, and dive into this 7-module tutorial to become a Merlin expert.

@@ -14,25 +13,4 @@
 quasi-real-life physicsy simulation that couples a physics application with visualization and machine learning.
 You'll also learn how to use some advanced features and help make Merlin better.
-Finally we offer some tips and tricks for porting and scaling up your application.
-
-.. :doc:`0. Prerequisites`
-
-.. toctree::
-   :maxdepth: 1
-   :caption: Before you begin:
-
-   modules/before
-
-.. toctree::
-   :maxdepth: 1
-   :numbered:
-   :caption: Tutorial modules:
-
-   modules/introduction
-   modules/installation/installation
-   modules/hello_world/hello_world
-   modules/run_simulation/run_simulation
-   modules/advanced_topics/advanced_topics
-   modules/contribute
-   modules/port_your_application
+Finally we offer some tips and tricks for porting and scaling up your application.
\ No newline at end of file

diff --git a/docs/user_guide/celery.md b/docs/user_guide/celery.md
new file mode 100644
index 000000000..cc2d67953
--- /dev/null
+++ b/docs/user_guide/celery.md
@@ -0,0 +1,205 @@

# Celery With Merlin

Merlin uses [Celery](http://www.celeryproject.org), a Python-based distributed task management system. Merlin uses Celery to queue work, which is then processed by Celery workers.

Merlin queues tasks to the broker, which receives and routes them. Merlin by default is configured to use [RabbitMQ](https://www.rabbitmq.com/) as the broker, but [Redis](https://redis.io/) can be used as well.

Celery has many functions: it defines the interface to the task broker, the backend results database, and the workers that will run the tasks.

As discussed in the [Configuration](./configuration/index.md) page, the broker and backend are configured through [the app.yaml file](./configuration/index.md#the-appyaml-file). A configuration for the RabbitMQ AMQP server is shown below.

???+ abstract "Config File for RabbitMQ Broker and Redis Backend"

    ```yaml title="app.yaml"
    celery:
        # directory where Merlin looks for the following:
        # mysql-ca-cert.pem rabbit-client-cert.pem rabbit-client-key.pem redis.pass
        certs: /path/to/celery/config

    broker:
        # can be rabbitmq, redis, rediss, redis+sock, amqps, or amqp
        name: rabbitmq
        #username: # defaults to your username unless changed here
        password: ~/.merlin/rabbit-password
        # server URL
        server: server.domain.com

        ### for rabbitmq connections ###
        #vhost: # defaults to your username unless changed here

        ### for redis+sock connections ###
        #socketname: the socket name your redis connection can be found on.
        #path: The path to the socket.

        ### for redis/rediss connections ###
        #port: The port number redis is listening on (default 6379)
        #db_num: The database number to connect to.

        # ssl security
        #keyfile: /var/ssl/private/client-key.pem
        #certfile: /var/ssl/amqp-server-cert.pem
        #ca_certs: /var/ssl/myca.pem
        # This is optional and can be required, optional or none
        # (required is the default)
        #cert_reqs: required


    results_backend:
        # Can be redis, rediss, mysql, db+ or memcached server
        # Only a few of these are directly configured by merlin
        name: redis

        dbname: dbname
        username: username
        # name of file where redis password is stored.
        password: redis.pass
        server: server.domain.com
        # merlin will generate this key if it does not exist yet,
        # and will use it to encrypt all data over the wire to
        # your redis server.
        encryption_key: ~/.merlin/encrypt_data_key
        port: 6379
        db_num: 0

        # ssl security
        #keyfile: /var/ssl/private/client-key.pem
        #certfile: /var/ssl/amqp-server-cert.pem
        #ca_certs: /var/ssl/myca.pem
        # This is optional and can be required, optional or none
        # (required is the default)
        #cert_reqs: required
    ```

## Using Celery Commands With Merlin

Typically, Merlin will handle all interactions with Celery behind the scenes. However, if you'd like to run Celery commands directly, you can.

The Celery command needs application configuration for the specific module that includes Celery; this is specified using the `-A <module>` syntax. All Celery commands should include the `-A` argument. The correct syntax for interacting with your Merlin module is:

```bash
celery -A merlin <celery command>
```

The `merlin run` command will define the tasks from the steps in the YAML file and then send them to the broker through the Celery broker interface. If these tasks are no longer needed or are incorrect, they can be purged by using one of these commands:

=== "General Purge"

    The following is equivalent to the [`merlin purge`](./command_line.md#purge-merlin-purge) command.

    ```bash
    celery -A merlin -Q <queues> purge
    ```

    !!! example

        ```bash
        celery -A merlin -Q merlin,queue2,queue3 purge
        ```

=== "RabbitMQ Purge"

    This will *not* work if you're using a broker other than RabbitMQ AMQP.

    ```bash
    celery -A merlin amqp queue.purge <queue>
    ```

    !!! example

        ```bash
        celery -A merlin amqp queue.purge merlin
        ```

=== "RabbitMQ Queue Deletion"

    It's recommended to save this as a last resort if purging does not work for some reason.

    ```bash
    celery -A merlin amqp queue.delete <queue>
    ```

    !!! example

        ```bash
        celery -A merlin amqp queue.delete merlin
        ```
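Purging and deleting queues are destructive operations. For read-only checks, a couple of standard Celery inspection commands (plain Celery CLI, nothing Merlin-specific) can also be pointed at the Merlin app:

```bash
# ping the connected worker nodes
celery -A merlin status

# list the tasks each worker is currently executing
celery -A merlin inspect active
```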
## Configuring Celery Workers

The common configurations used for the Celery workers in the [Celery Workers Guide](https://docs.celeryproject.org/en/latest/userguide/workers.html) are not the best for HPC applications. Here are some parameters you may want to use for HPC-specific workflows.

These options can be altered by setting the `args` for a worker entry in the `merlin.resources` section of your YAML spec file, as sketched below.

!!! note

    Merlin uses Celery's [prefork pool](https://celery.school/celery-worker-pools#heading-prefork), so modifying the `--concurrency` value will modify the number of concurrent processes that are started, *not* the number of threads.

    *Celery will set the concurrency to be the number of CPUs on the node by default.*
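For instance, the HPC-leaning values discussed below could be set on a worker like so (the worker and step names are illustrative):

```yaml
merlin:
    resources:
        workers:
            simworkers:
                args: --concurrency 2 --prefetch-multiplier 1 -O fair
                steps: [sim_runs]
```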
The number of processes to use on each node of the HPC allocation is set through the `--concurrency` keyword. A good choice for this is the number of simulations that can be run per node or the number of cores on the machine, whichever is smaller.

```bash
celery -A merlin worker --concurrency <num_threads>
```

!!! example "Concurrency for Simple 1D Short Running Sim"

    If the HPC simulation is a simple 1D short running sim, then on Lassen you might want to use all cores on a node:

    ```bash
    celery -A merlin worker --concurrency 44
    ```

!!! example "Concurrency for Limited Processes"

    If the HPC simulation will take the whole node, you may want to limit this to only a few processes:

    ```bash
    celery -A merlin worker --concurrency 2
    ```

The `--prefetch-multiplier` argument sets how many tasks are requested from the task server per worker process. If `--concurrency` is 2 and `--prefetch-multiplier` is 3, then 6 tasks will be requested from the task server by the worker processes. Since HPC tasks are generally not short running tasks, the recommendation is to set this to 1.

```bash
celery -A merlin worker --prefetch-multiplier <num_tasks>
```

!!! example

    ```bash
    celery -A merlin worker --prefetch-multiplier 1
    ```

The `-O fair` option is another parameter used for long running Celery tasks. With this set, Celery will only send tasks to processes that are available to run them.

```bash
celery -A merlin worker -O fair
```

The `-n` option allows the workers to be given a unique name so multiple workers running tasks from different queues may share the allocation resources. The names are automatically set to `<queue_name>.%h`, where `<queue_name>` comes from the `task_queue` config or `merlin` (the default), and `%h` will resolve to the hostname of the compute node.

```bash
celery -A merlin worker -n <worker_name>
```

!!! example "Naming the Worker"

    ```bash
    celery -A merlin worker -n merlin.%h
    ```

!!! example "Naming the Worker After a Queue"

    ```bash
    celery -A merlin worker -n queue_1.%h
    ```

On TOSS3 nodes, the CPU affinity can be set for the worker processes. This is enabled by setting the environment variable `CELERY_AFFINITY` to the number of CPUs to skip.

!!! example

    Setting `CELERY_AFFINITY` to 4 will skip 4 CPUs between each Celery worker process.

    ```bash
    export CELERY_AFFINITY=4
    ```

diff --git a/docs/user_guide/command_line.md b/docs/user_guide/command_line.md
new file mode 100644
index 000000000..46bfdc7e4
--- /dev/null
+++ b/docs/user_guide/command_line.md
@@ -0,0 +1,684 @@

# Command Line Interface

The Merlin library defines a number of commands to help configure your server and manage and monitor your workflow.

This page will detail every command available with Merlin.

## Merlin

The entrypoint to everything related to executing Merlin commands.

**Usage:**

```bash
merlin [OPTIONS] COMMAND [ARGS] ...
```

**Options:**

| Name | Type | Description | Default |
| ------------ | ------- | ----------- | ------- |
| `-h`, `--help` | boolean | Show this help message and exit | `False` |
| `--version` | boolean | Show program's version number and exit | `False` |
| `-lvl`, `--level` | choice(`ERROR` \| `WARNING` \| `INFO` \| `DEBUG`) | Level of logging messages to be output. The smaller the number, the more output is produced: 4 - `ERROR`, 3 - `WARNING`, 2 - `INFO` (default), 1 - `DEBUG` | INFO |
See the [Configuration Commands](#configuration-commands), [Workflow Management Commands](#workflow-management-commands), and [Monitoring Commands](#monitoring-commands) below for more information on every command available with the Merlin library.

## Configuration Commands

Since running Merlin in a distributed manner requires the [configuration](./configuration/index.md) of a centralized server, Merlin comes equipped with three commands to help users get this set up:

- *[config](#config-merlin-config)*: Create the skeleton `app.yaml` file needed for configuration
- *[info](#info-merlin-info)*: Ensure stable connections to the server(s)
- *[server](#server-merlin-server)*: Spin up containerized servers

### Config (`merlin config`)

Create a default [config (app.yaml) file](./configuration/index.md#the-appyaml-file) in the `${HOME}/.merlin` directory using the `config` command. This file can then be edited for your system configuration.

See more information on how to set this file up at the [Configuration](./configuration/index.md) page.

**Usage:**

```bash
merlin config [OPTIONS]
```

**Options:**

| Name | Type | Description | Default |
| ------------ | ------- | ----------- | ------- |
| `-h`, `--help` | boolean | Show this help message and exit | `False` |
| `--task_server` | string | Select the appropriate configuration for the given task server. Currently only "celery" is implemented. | "celery" |
| `-o`, `--output_dir` | path | Output the configuration in the given directory. This file can then be edited and copied into `${HOME}/.merlin`. | None |
| `--broker` | string | Write the initial `app.yaml` config file for either a `rabbitmq` or `redis` broker. The default is `rabbitmq`. The backend will be `redis` in both cases. The redis backend in the `rabbitmq` config shows the use of encryption for the backend. | "rabbitmq" |

**Examples:**

!!! example "Create an `app.yaml` File at `~/.merlin`"

    ```bash
    merlin config
    ```

!!! example "Create an `app.yaml` File at a Custom Path"

    ```bash
    merlin config -o /Documents/configuration/
    ```

!!! example "Create an `app.yaml` File With a Redis Broker"

    ```bash
    merlin config --broker redis
    ```

### Info (`merlin info`)

Information about your Merlin and Python configuration can be printed out by using the `info` command. This is helpful for debugging. Included in this command is a server check which will check for server connections. The connection check will time out after 60 seconds.

**Usage:**

```bash
merlin info [OPTIONS]
```

**Options:**

| Name | Type | Description | Default |
| ------------ | ------- | ----------- | ------- |
| `-h`, `--help` | boolean | Show this help message and exit | `False` |

### Server (`merlin server`)

Create a local containerized server for Merlin to connect to. Merlin server creates and configures a server in the current directory. This allows multiple instances of Merlin server to exist for different studies or uses.

Merlin server has a list of commands for interacting with the broker and results server. These commands allow the user to manage and monitor the existing server and create instances of servers if needed.

More information on configuring with Merlin server can be found at the [Merlin Server Configuration](./configuration/merlin_server.md) page.

**Usage:**

```
merlin server [OPTIONS] COMMAND [ARGS] ...
```
**Options:**

| Name | Type | Description | Default |
| ------------ | ------- | ----------- | ------- |
| `-h`, `--help` | boolean | Show this help message and exit | `False` |

**Commands:**

| Name | Description |
| ------------ | ----------- |
| [init](#server-init-merlin-server-init) | Initialize the files needed for Merlin server |
| [status](#server-status-merlin-server-status) | Check the status of your Merlin server |
| [start](#server-start-merlin-server-start) | Start the containerized Merlin server |
| [stop](#server-stop-merlin-server-stop) | Stop the Merlin server |
| [restart](#server-restart-merlin-server-restart) | Restart an instance of the Merlin server |
| [config](#server-config-merlin-server-config) | Configure the Merlin server |
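In practice these subcommands are often used in sequence. A minimal local-server session might look like:

```bash
merlin server init      # create the server configuration files
merlin server start     # spin up the containerized server
merlin server status    # verify that it's running
merlin server stop      # shut it down when finished
```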
#### Server Init (`merlin server init`)

!!! note

    If there is an existing subdirectory containing a Merlin server configuration, then only missing files will be replaced. However, it is recommended that users back up their local configurations prior to running this command.

The `init` subcommand initializes a new instance of Merlin server by creating configurations for the other subcommands.

A main Merlin server configuration subdirectory is created at `~/.merlin/server/`. It contains the local Merlin configuration, as well as configurations for the different containerized services that Merlin server supports, which currently includes Singularity (Docker and Podman are planned for the future).

A local Merlin server configuration subdirectory called `merlin_server/` will also be created in your current working directory when this command is run. This will include a container for Merlin server and associated configuration files that might be used to start the server. For example, for a Redis server, a `redis.conf` file will contain settings which will be dynamically loaded when the Redis server is run. This local configuration will also contain information about currently running containers.

**Usage:**

```bash
merlin server init [OPTIONS]
```

**Options:**

| Name | Type | Description | Default |
| ------------ | ------- | ----------- | ------- |
| `-h`, `--help` | boolean | Show this help message and exit | `False` |

#### Server Status (`merlin server status`)

The `status` subcommand checks the status of the Merlin server.

**Usage:**

```bash
merlin server status [OPTIONS]
```

**Options:**

| Name | Type | Description | Default |
| ------------ | ------- | ----------- | ------- |
| `-h`, `--help` | boolean | Show this help message and exit | `False` |

#### Server Start (`merlin server start`)

!!! warning

    Newer versions of Redis have started requiring the environment variable `LC_ALL` to be set in order for this to work. To set this properly, run:

    ```bash
    export LC_ALL="C"
    ```

    If this is not set, the `merlin server start` command may seem to hang until you manually terminate it.

The `start` subcommand starts the Merlin server using the container located in the local Merlin server configuration.

**Usage:**

```bash
merlin server start [OPTIONS]
```

**Options:**

| Name | Type | Description | Default |
| ------------ | ------- | ----------- | ------- |
| `-h`, `--help` | boolean | Show this help message and exit | `False` |

#### Server Stop (`merlin server stop`)

The `stop` subcommand stops any existing container being managed and monitored by Merlin server.

**Usage:**

```bash
merlin server stop [OPTIONS]
```

**Options:**

| Name | Type | Description | Default |
| ------------ | ------- | ----------- | ------- |
| `-h`, `--help` | boolean | Show this help message and exit | `False` |

#### Server Restart (`merlin server restart`)

The `restart` subcommand performs a `stop` command followed by a `start` command on the Merlin server.

**Usage:**

```bash
merlin server restart [OPTIONS]
```

**Options:**

| Name | Type | Description | Default |
| ------------ | ------- | ----------- | ------- |
| `-h`, `--help` | boolean | Show this help message and exit | `False` |

#### Server Config (`merlin server config`)

The `config` subcommand edits configurations for the Merlin server. There are multiple options to allow for different configurations.

**Usage:**

```bash
merlin server config [OPTIONS]
```

**Options:**

| Name | Type | Description | Default |
| ------------ | ------- | ----------- | ------- |
| `-h`, `--help` | boolean | Show this help message and exit | `False` |
| `-ip`, `--ipaddress` | string | Set the bound IP address for Merlin server | None |
| `-p`, `--port` | integer | Set the bound port for Merlin server | None |
| `-pwd`, `--password` | filename | Set the password file for Merlin server | None |
| `--add-user` | string string | Add a new user for Merlin server. This requires a space-delimited username and password as input. | None |
| `--remove-user` | string | Remove an existing user from Merlin server | None |
| `-d`, `--directory` | path | Set the working directory for Merlin server | None |
| `-ss`, `--snapshot-seconds` | integer | Set the number of seconds before each snapshot | None |
| `-sc`, `--snapshot-changes` | integer | Set the number of database changes before each snapshot | None |
| `-sf`, `--snapshot-file` | filename | Set the name of the snapshot file | None |
| `-am`, `--append-mode` | choice(`always` \| `everysec` \| `no`) | Set the append-only mode | None |
| `-af`, `--append-file` | filename | Set the name of the file for the server append/change file | None |

**Examples:**

!!! example "Configure The Port and Password"

    ```bash
    merlin server config -p 5879 -pwd /Documents/redis.pass
    ```

!!! example "Add A User and Set Snapshot File"

    ```bash
    merlin server config --add-user custom_username custom_password -sf /Documents/snapshot
    ```

## Workflow Management Commands

The Merlin library provides several commands for setting up and managing your Merlin workflow:

- *[example](#example-merlin-example)*: Download pre-made workflow specifications that can be modified for your own workflow needs
- *[purge](#purge-merlin-purge)*: Clear any tasks that are currently living in the central server
- *[restart](#restart-merlin-restart)*: Restart a workflow
- *[run](#run-merlin-run)*: Send tasks to the central server
- *[run workers](#run-workers-merlin-run-workers)*: Start up workers that will execute the tasks that exist on the central server
- *[stop workers](#stop-workers-merlin-stop-workers)*: Stop existing workers

### Example (`merlin example`)

If you want to obtain an example workflow, use Merlin's `merlin example` command. First, view all of the example workflows that are available with:

```bash
merlin example list
```

This will list the available example workflows and a description for each one.
To select one:

```bash
merlin example <example_name>
```

This will copy the example workflow to the current working directory. It is also possible to specify another path to copy to:

```bash
merlin example <example_name> -p path/to/dir
```

If the specified directory does not exist, Merlin will automatically create it.

This will generate the example workflow at the specified location, ready to be run.

For more information on these examples, visit the [Examples](../examples/index.md) page.

**Usage:**

```bash
merlin example [OPTIONS] [list | <example_name>]
```

**Options:**

| Name | Type | Description | Default |
| ------------ | ------- | ----------- | ------- |
| `-h`, `--help` | boolean | Show this help message and exit | `False` |
| `-p`, `--path` | path | A directory path to download the example to | Current Working Directory |

### Purge (`merlin purge`)

!!! warning

    Any tasks reserved by workers will not be purged from the queues. All workers must first be stopped so the tasks can be returned to the task server; then they can be purged.

    In short, you probably want to use [`merlin stop-workers`](#stop-workers-merlin-stop-workers) before running `merlin purge`.

If you've executed the [`merlin run`](#run-merlin-run) command and sent tasks to the server, this command can be used to remove those tasks from the server. If there are no tasks currently on the server, then this command will not do anything.

**Usage:**

```
merlin purge [OPTIONS] SPECIFICATION
```

**Options:**

| Name | Type | Description | Default |
| ------------ | ------- | ----------- | ------- |
| `-h`, `--help` | boolean | Show this help message and exit | `False` |
| `-f` | boolean | Purge tasks without confirmation | `False` |
| `--steps` | List[string] | A space-delimited list of steps from the specification file to purge | `['all']` |
| `--vars` | List[string] | A space-delimited list of variables to override in the spec file. Ex: `--vars MY_QUEUE=hello` | None |

**Examples:**

!!! example "Purge All Queues From Spec File"

    The following command will purge all queues that exist in `my_specification.yaml`:

    ```bash
    merlin purge my_specification.yaml
    ```

!!! example "Purge Specific Steps From Spec File"

    The following command will purge any queues associated with `step_1` and `step_3` in `my_specification.yaml`:

    ```bash
    merlin purge my_specification.yaml --steps step_1 step_3
    ```

!!! example "Purge Queues Without Confirmation"

    The following command will ignore the confirmation prompt that's provided and purge the queues:

    ```bash
    merlin purge -f my_specification.yaml
    ```

### Restart (`merlin restart`)

To restart a previously started Merlin workflow, use the `restart` command and the path to the root of the Merlin workspace that was generated during the previously run workflow. This will define the tasks and queue them on the task server, also called the broker.

Merlin currently writes a file called `MERLIN_FINISHED` to the directory of each step that finished successfully. It uses this to determine which steps to skip during execution of a restarted workflow.

**Usage:**

```bash
merlin restart [OPTIONS] WORKSPACE
```

**Options:**

| Name | Type | Description | Default |
| ------------ | ------- | ----------- | ------- |
| `-h`, `--help` | boolean | Show this help message and exit | `False` |
| `--local` | string | Run tasks sequentially in your current shell | "distributed" |

**Examples:**
example "Restart an Existing Workflow" + + ```bash + merlin restart my_study_20240102-143903/ + ``` + +!!! example "Restart an Existing Workflow Locally" + + ```bash + merlin restart my_study_20240102-143903/ --local + ``` + +### Run (`merlin run`) + +To run a Merlin workflow use the `run` command and the path to the input yaml file ``. This will define the tasks and queue them on the task server also called the broker. + +**Usage:** + +```bash +merlin run [OPTIONS] SPECIFICATION +``` + +**Options:** + +| Name | Type | Description | Default | +| ------------ | ------- | ----------- | ------- | +| `-h`, `--help` | boolean | Show this help message and exit | `False` | +| `--local` | string | Run tasks sequentially in your current shell | "distributed" | +| `--vars` | List[string] | A space-delimited list of variables to override in the spec file. This list should be given after the spec file is provided. Ex: `--vars LEARN=/path/to/new_learn.py EPOCHS=3` | None | +| `--samplesfile` | choice(`.npy` \| `.csv` \| `.tab`) | Specify a file containing samples. This file should be given after the spec file is provided. | None | +| `--dry` | boolean | Do a [Dry Run](./running_studies.md#dry-runs) of your workflow | `False` | +| `--no-errors` | boolean | Silence the errors thrown when flux is not present | `False` | +| `--pgen` | filename | Specify a parameter generator filename to override the `global.parameters` block of your spec file | None | +| `--pargs` | string | A string that represents a single argument to pass a custom parameter generation function. Reuse `--parg` to pass multiple arguments. [Use with `--pgen`] | None | + +**Examples:** + +!!! example "Basic Run Example" + + ```bash + merlin run my_specification.yaml + ``` + +!!! example "Pass A Parameter Generator File to Run" + + ```bash + merlin run my_specification.yaml --pgen /path/to/pgen.py + ``` + +!!! example "Pass A Samples File to Run" + + ```bash + merlin run my_specification.yaml --samplesfile /path/to/samplesfile.csv + ``` + +!!! example "Do A Dry Run of Your Workflow Locally" + + ```bash + merlin run my_specification.yaml --dry --local + ``` + +### Run Workers (`merlin run-workers`) + +The tasks queued on the broker by the [`merlin run`](#run-merlin-run) command are run by a collection of workers. These workers can be run local in the current shell or in parallel on a batch allocation. The workers are launched using the `run-workers` command which reads the configuration for the worker launch from the `` file. + +Within the `` file, the `batch` and `merlin.resources.workers` sections are both used to configure the worker launch. The top level `batch` section can be overridden in the `merlin.resources.workers` section. Parallel workers should be scheduled using the system's batch scheduler (see the section describing [Distributed Runs](./running_studies.md#distributed-runs) for more info). + +Once the workers are running, tasks from the broker will be processed. + +**Usage:** + +```bash +merlin run-workers [OPTIONS] SPECIFICATION +``` + +**Options:** + +| Name | Type | Description | Default | +| ------------ | ------- | ----------- | ------- | +| `-h`, `--help` | boolean | Show this help message and exit | `False` | +| `--echo` | boolean | Echo the Celery workers run command to stdout and don't start any workers | `False` | +| `--worker-args` | string | Pass arguments (all wrapped in quotes) to the Celery workers. Should be given after the input spec. 
| None |
+| `--steps` | List[string] | The specific steps in the input spec that you want to run the corresponding workers for. Should be given after the input spec. | `['all']` |
+| `--vars` | List[string] | A space-delimited list of variables to override in the spec file. This list should be given after the spec file is provided. Ex: `--vars SIMWORKER=new_sim_worker` | None |
+| `--disable-logs` | boolean | Disable logs for Celery workers. **Note:** Having the `-l` flag in your workers' args section will overwrite this flag for that worker. | `False` |
+
+**Examples:**
+
+!!! example "Basic Worker Launch"
+
+    ```bash
+    merlin run-workers my_specification.yaml
+    ```
+
+!!! example "Worker Launch for Just Certain Steps"
+
+    ```bash
+    merlin run-workers my_specification.yaml --steps step_1 step_3
+    ```
+
+!!! example "Worker Launch with Worker Args Passed"
+
+    ```bash
+    merlin run-workers my_specification.yaml --worker-args "-l INFO --concurrency 4"
+    ```
+
+### Stop Workers (`merlin stop-workers`)
+
+!!! warning
+
+    If you've named workers identically across workflows (you shouldn't), only one might get the signal. In this case, you can send it again.
+
+Send out a stop signal to some or all connected workers. By default, a stop will be sent to all connected workers across all workflows, having them shut down softly. This behavior can be modified with certain options.
+
+**Usage:**
+
+```bash
+merlin stop-workers [OPTIONS]
+```
+
+**Options:**
+
+| Name | Type | Description | Default |
+| ------------ | ------- | ----------- | ------- |
+| `-h`, `--help` | boolean | Show this help message and exit | `False` |
+| `--spec` | filename | Target only the workers named in the `merlin` block of the spec file given here | None |
+| `--queues` | List[string] | Takes a space-delimited list of specific queues as input and will stop all workers watching these queues | None |
+| `--workers` | List[regex] | A space-delimited list of regular expressions representing workers to stop | None |
+| `--task_server` | string | Task server type for which to stop the workers. Currently only "celery" is implemented. | "celery" |
+
+**Examples:**
+
+!!! example "Stop All Workers Across All Workflows"
+
+    ```bash
+    merlin stop-workers
+    ```
+
+!!! example "Stop Workers for a Certain Specification"
+
+    ```bash
+    merlin stop-workers --spec my_specification.yaml
+    ```
+
+!!! example "Stop Workers for Certain Queues"
+
+    ```bash
+    merlin stop-workers --queues queue_1 queue_2
+    ```
+
+!!! example "Stop Specific Workers Using Regex"
+
+    ```bash
+    merlin stop-workers --workers ".*@my_other_host*"
+    ```
+
+## Monitoring Commands
+
+The Merlin library comes equipped with commands to help monitor your workflow:
+
+- *[monitor](#monitor-merlin-monitor)*: Keep your allocation alive while tasks are being processed
+- *[query-workers](#query-workers-merlin-query-workers)*: Communicate with Celery to view information on active workers
+- *[status](#status-merlin-status)*: Communicate with Celery to view the status of queues in your workflow(s)
+
+### Monitor (`merlin monitor`)
+
+Batch submission scripts may not keep the batch allocation alive if there is not a blocking process in the submission script. The `merlin monitor` command addresses this by providing a blocking process that checks for tasks in the queues every `sleep` seconds (configurable with the `--sleep` option). When the queues are empty, the monitor will query Celery to see if any workers are still processing tasks from the queues. If no workers are processing any tasks from the queues and the queues are empty, the blocking process will exit and allow the allocation to end.
+
+The `monitor` functionality will check for Celery workers for up to 10 times the `sleep` interval before monitoring begins. This check loops whenever the queue(s) in the spec contain tasks but no running workers are detected, which protects against a failed worker launch.
+
+**Usage:**
+
+```bash
+merlin monitor [OPTIONS] SPECIFICATION
+```
+
+**Options:**
+
+| Name | Type | Description | Default |
+| ------------ | ------- | ----------- | ------- |
+| `-h`, `--help` | boolean | Show this help message and exit | `False` |
+| `--steps` | List[string] | A space-delimited list of steps in the input spec that you want to query. Should be given after the input spec. | `['all']` |
+| `--vars` | List[string] | A space-delimited list of variables to override in the spec file. This list should be given after the spec file is provided. Ex: `--vars SIMWORKER=new_sim_worker` | None |
+| `--sleep` | integer | The duration in seconds between checks for workers/tasks | 60 |
+| `--task_server` | string | Task server type for which to monitor the workers. Currently only "celery" is implemented. | "celery" |
+
+!!! example "Basic Monitor"
+
+    ```bash
+    merlin monitor my_specification.yaml
+    ```
+
+!!! example "Monitor Specific Steps"
+
+    ```bash
+    merlin monitor my_specification.yaml --steps step_1 step_3
+    ```
+
+!!! example "Monitor With a Shortened Sleep Interval"
+
+    ```bash
+    merlin monitor my_specification.yaml --sleep 30
+    ```
+
+### Query Workers (`merlin query-workers`)
+
+Check which workers are currently connected to the task server.
+
+This will broadcast a command to all connected workers and print the names of any that respond, along with the queues they're attached to. This is useful for interacting with workers, such as via `merlin stop-workers --workers`.
+
+**Usage:**
+
+```bash
+merlin query-workers [OPTIONS]
+```
+
+**Options:**
+
+| Name | Type | Description | Default |
+| ------------ | ------- | ----------- | ------- |
+| `-h`, `--help` | boolean | Show this help message and exit | `False` |
+| `--task_server` | string | Task server type for which to query workers. Currently only "celery" is implemented. | "celery" |
+| `--spec` | filename | Query for the workers named in the `merlin` block of the spec file given here | None |
+| `--queues` | List[string] | Takes a space-delimited list of queues as input. This will query for workers associated with the names of the queues you provide here. | None |
+| `--workers` | List[regex] | A space-delimited list of regular expressions representing workers to query | None |
+
+**Examples:**
+
+!!! example "Query All Active Workers"
+
+    ```bash
+    merlin query-workers
+    ```
+
+!!! example "Query Workers of Specific Queues"
+
+    ```bash
+    merlin query-workers --queues demo merlin
+    ```
+
+!!! example "Query Workers From Spec File"
+
+    ```bash
+    merlin query-workers --spec my_specification.yaml
+    ```
+
+!!! example "Query Workers Based on Their Name"
+
+    This will query a worker named `step_1_worker`:
+
+    ```bash
+    merlin query-workers --workers step_1_worker
+    ```
+
+!!! example "Query Workers Using Regex"
+
+    This will query only workers whose names start with `step`:
+
+    ```bash
+    merlin query-workers --workers ^step
+    ```
+
+### Status (`merlin status`)
+
+Check the status of the queues in your spec file to see if there are any tasks in them and any active workers watching them. 
+
+**Usage:**
+
+```bash
+merlin status [OPTIONS] SPECIFICATION
+```
+
+**Options:**
+
+| Name | Type | Description | Default |
+| ------------ | ------- | ----------- | ------- |
+| `-h`, `--help` | boolean | Show this help message and exit | `False` |
+| `--steps` | List[string] | A space-delimited list of steps in the input spec that you want to query. Should be given after the input spec. | `['all']` |
+| `--vars` | List[string] | A space-delimited list of variables to override in the spec file. This list should be given after the spec file is provided. Ex: `--vars QUEUE_NAME=new_queue_name` | None |
+| `--task_server` | string | Task server type. Currently only "celery" is implemented. | "celery" |
+| `--csv` | filename | The name of a csv file to dump the queue status report to | None |
+
+**Examples:**
+
+!!! example "Basic Status Check"
+
+    ```bash
+    merlin status my_specification.yaml
+    ```
+
+!!! example "Check the Status of Queues for Certain Steps"
+
+    ```bash
+    merlin status my_specification.yaml --steps step_1 step_3
+    ```
+
+!!! example "Dump the Status to a CSV File"
+
+    ```bash
+    merlin status my_specification.yaml --csv status_report.csv
+    ```
diff --git a/docs/user_guide/configuration/external_server.md b/docs/user_guide/configuration/external_server.md
new file mode 100644
index 000000000..9b86e3512
--- /dev/null
+++ b/docs/user_guide/configuration/external_server.md
@@ -0,0 +1,407 @@
+# External Server Configuration
+
+!!! warning
+
+    It's recommended that you read through the [Configuration Overview](./index.md) page before proceeding with this module.
+
+## Configuring the Broker
+
+In the `broker` section of your app.yaml, you will provide all of the necessary settings for Merlin to establish a connection with your broker.
+
+Here, we'll discuss:
+
+- The different [Broker Options](#broker-options) that are allowed
+- How the [password field is resolved](#resolving-the-broker-password-field)
+- The [URL](#broker-url) option
+
+### Broker Options
+
+Merlin allows for several different broker configurations. This section will detail each of these options and how to configure them. **We recommend using RabbitMQ as your broker.**
+
+#### RabbitMQ, AMQPS, and AMQP
+
+See the [RabbitMQ Documentation](https://rabbitmq.com/) for instructions on how to create a RabbitMQ server.
+
+Once your server is set up, we'll need six keys in the `broker` section of the app.yaml file:
+
+1. `name`: The name of the broker (options are `rabbitmq`, `amqps`, or `amqp`)
+2. `username`: Username for the RabbitMQ user that will be used for accessing the service
+3. `password`: The path to the file that's storing your RabbitMQ password
+4. `server`: A URL to the server that you're connecting to
+5. `port`: The port that your server is running on. If left undefined and the `name` setting is:
+    - `rabbitmq` or `amqps`: the default RabbitMQ TLS port (5671) will be used
+    - `amqp`: the default port (5672) will be used
+6. `vhost`: The vhost for your RabbitMQ service
+
+Using these settings, Merlin will construct a connection string of the form:
+
+```bash
+{conn}://{username}:{password}@{server}:{port}/{vhost}
+```
+
+Here `conn` is `amqps` (with ssl) when the `name` field is `rabbitmq` or `amqps`, and `amqp` (without ssl) when the `name` field is `amqp`. If you're using a connection option with ssl, see the [Security With RabbitMQ](#security-with-rabbitmq) section below.
+
+!!! example
+
+    Let's say we create a file in our `~/.merlin/` directory called `rabbit.pass`. 
In this file we'll store the password for our RabbitMQ server:
+
+    ```bash title="~/.merlin/rabbit.pass" linenums="1"
+    my-super-secret-password
+    ```
+
+    Now we'll update the `broker` section of our app.yaml file to be:
+
+    ```yaml title="~/.merlin/app.yaml"
+    broker:
+        name: rabbitmq
+        username: rabbit-username
+        password: ~/.merlin/rabbit.pass
+        server: server.domain.com
+        vhost: rabbit-vhost
+    ```
+
+    The connection string that Merlin generates will then become:
+
+    ```bash
+    amqps://rabbit-username:my-super-secret-password@server.domain.com:5671/rabbit-vhost
+    ```
+
+##### Security With RabbitMQ
+
+Merlin can only be configured to communicate with [RabbitMQ over an SSL connection](https://www.rabbitmq.com/ssl.html) and does not permit use of a RabbitMQ server configured without SSL. Therefore, the default value of the `broker_use_ssl` celery argument is `True` for RabbitMQ.
+
+The ssl keys can be given in the broker config as shown below:
+
+```yaml
+broker:
+    name: rabbitmq
+    username: rabbit-username
+    password: ~/.merlin/rabbit.pass
+    server: server.domain.com
+    vhost: rabbit-vhost
+
+    # ssl security
+    keyfile: /var/ssl/private/client-key.pem
+    certfile: /var/ssl/amqp-server-cert.pem
+    ca_certs: /var/ssl/myca.pem
+    # This is optional and can be required, optional or none
+    # (required is the default)
+    cert_reqs: required
+```
+
+This ssl config for rabbitmq/amqps brokers is then placed in the `broker_use_ssl` celery argument:
+
+```py
+broker_use_ssl = {
+    'keyfile': '/var/ssl/private/client-key.pem',
+    'certfile': '/var/ssl/amqp-server-cert.pem',
+    'ca_certs': '/var/ssl/myca.pem',
+    'cert_reqs': ssl.CERT_REQUIRED
+}
+```
+
+#### Redis and Rediss
+
+!!! note
+
+    If you're using Redis v6.0.0+ and would like to configure with ssl, you'll need to view the [Security With Rediss](#security-with-rediss) section below after completing this section.
+
+See the [Redis Documentation](https://redis.io/) for instructions on how to create a Redis server.
+
+Once your server is set up, we'll need five keys in the `broker` section of the app.yaml file:
+
+1. `name`: The name of the broker (here it will be `redis` if running *without* ssl, or `rediss` if running *with* ssl)
+2. `password`: The path to the file that's storing your Redis password
+3. `server`: A URL to the server that you're connecting to
+4. `port`: The port that your server is running on. Default is 6379.
+5. `db_num`: The database index (this will likely be 0).
+
+Using these settings, Merlin will construct a connection string of the form:
+
+```bash
+{name}://:{password}@{server}:{port}/{db_num}
+```
+
+If using ssl, see [Security With Rediss](#security-with-rediss) for additional setup instructions.
+
+!!! example
+
+    Let's say we create a file in our `~/.merlin/` directory called `redis.pass`. 
In this file we'll store the password for our Redis server:
+
+    ```bash title="~/.merlin/redis.pass" linenums="1"
+    my-super-secret-password
+    ```
+
+    Now we'll update the `broker` section of our app.yaml file to be:
+
+    ```yaml title="~/.merlin/app.yaml"
+    broker:
+        name: redis
+        password: ~/.merlin/redis.pass
+        server: server.domain.com
+        port: 6379
+        db_num: 0
+    ```
+
+    The connection string that Merlin generates will then become:
+
+    ```bash
+    redis://:my-super-secret-password@server.domain.com:6379/0
+    ```
+
+##### Security With Rediss
+
+When using Redis with ssl, aka rediss (only available with Redis v6.0.0+), there are some additional keys that you'll need to add to your `broker` section:
+
+```yaml
+broker:
+    name: rediss
+    password: ~/.merlin/redis.pass
+    server: server.domain.com
+    port: 6379
+    db_num: 0
+
+    # ssl security
+    keyfile: /var/ssl/private/client-key.pem
+    certfile: /var/ssl/amqp-server-cert.pem
+    ca_certs: /var/ssl/myca.pem
+    # This is optional and can be required, optional or none
+    # (required is the default)
+    cert_reqs: required
+```
+
+The ssl config for redis (rediss) brokers is then placed in the `broker_use_ssl` celery argument:
+
+```python
+broker_use_ssl = {
+    'ssl_keyfile': '/var/ssl/private/client-key.pem',
+    'ssl_certfile': '/var/ssl/amqp-server-cert.pem',
+    'ssl_ca_certs': '/var/ssl/myca.pem',
+    'ssl_cert_reqs': ssl.CERT_REQUIRED
+}
+```
+
+#### Redis+Socket
+
+Celery supports Redis connections using Unix domain sockets. For this setup, three keys are required in the `broker` section:
+
+1. `name`: The name of the broker. This will be `redis+socket` here.
+2. `path`: The path to the Unix domain socket file for Redis
+3. `db_num`: The database index
+
+Using these settings, Merlin will construct a connection string of the form:
+
+```bash
+redis+socket://{path}?virtual_host={db_num}
+```
+
+!!! example
+
+    Let's set the `broker` configuration to be:
+
+    ```yaml title="~/.merlin/app.yaml"
+    broker:
+        name: redis+socket
+        path: /tmp/username/redis.sock
+        db_num: 0
+    ```
+
+    The connection string that Merlin generates will then become:
+
+    ```bash
+    redis+socket:///tmp/username/redis.sock?virtual_host=0
+    ```
+
+### Resolving The Broker Password Field
+
+The `broker/password` field is simply the full path to a file containing your password for the user defined by `broker/username`.
+
+!!! example
+
+    Say the password to our server is `my-super-secret-password`. We'd simply take this password, place it in a file, and then link the path to the file in the `broker` section.
+
+    ```bash title="~/.merlin/password-file.pass" linenums="1"
+    my-super-secret-password
+    ```
+
+    ```yaml title="~/.merlin/app.yaml"
+    broker:
+        password: ~/.merlin/password-file.pass
+    ```
+
+### Broker URL
+
+A `url` option is available to specify the broker connection url, in which case the `server` setting is ignored. The url must include the entire connection url except for the ssl settings, provided the broker name is recognized by the ssl processing system. Currently the ssl system will only configure RabbitMQ and Redis servers.
+
+Using the `url` setting, Merlin will construct a connection string of the form:
+
+```bash
+{url}
+```
+
+!!! 
example
+
+    Say we use the default Redis server for our broker:
+
+    ```yaml title="~/.merlin/app.yaml"
+    broker:
+        url: redis://localhost:6379/0
+    ```
+
+    The connection string that Merlin generates will then become:
+
+    ```bash
+    redis://localhost:6379/0
+    ```
+
+## Configuring the Results Backend
+
+In the `results_backend` section of your app.yaml, you will provide all of the necessary settings for Merlin to establish a connection with your results backend.
+
+Here, we'll discuss:
+
+- The different [Results Backend Options](#results-backend-options) that are allowed
+- How the [password field is resolved](#resolving-the-results-backend-password-field)
+- The [URL](#results-backend-url) option
+
+### Results Backend Options
+
+Merlin allows for several different results backend configurations. This section will detail each of these options and how to configure them. **We recommend using Redis as your results backend.**
+
+#### Redis and Rediss
+
+!!! note
+
+    If you're using Redis v6.0.0+ and would like to configure with ssl, you'll need to view the [Security With Rediss](#security-with-rediss_1) section below after completing this section.
+
+The recommended option to use for your results backend is a Redis server. See the [Redis Documentation](https://redis.io/) for instructions on how to create a Redis server.
+
+Once your server is set up, we'll need six keys in the `results_backend` section of the app.yaml file:
+
+1. `name`: The name of the results backend (here it will be `redis` if running *without* ssl, or `rediss` if running *with* ssl)
+2. `password`: The path to the file that's storing your Redis password
+3. `server`: A URL to the server that you're connecting to
+4. `port`: The port that your server is running on. Default is 6379.
+5. `db_num`: The database index (this will likely be 0).
+6. `encryption_key`: The path to the encryption key (this is automatically generated by `merlin config`)
+
+Using these settings, Merlin will construct a connection string of the form:
+
+```bash
+{name}://:{password}@{server}:{port}/{db_num}
+```
+
+To further understand what the `encryption_key` is for, see [Security With Redis](#security-with-redis).
+
+If using ssl, see [Security With Rediss](#security-with-rediss_1) for additional setup instructions.
+
+!!! example
+
+    Let's say we create a file in our `~/.merlin/` directory called `redis.pass`. In this file we'll store the password for our Redis server:
+
+    ```bash title="~/.merlin/redis.pass" linenums="1"
+    my-super-secret-password
+    ```
+
+    Now we'll update the `results_backend` section of our app.yaml file to be:
+
+    ```yaml title="~/.merlin/app.yaml"
+    results_backend:
+        name: redis
+        password: ~/.merlin/redis.pass
+        server: server.domain.com
+        port: 6379
+        db_num: 0
+        encryption_key: ~/.merlin/encrypt_data_key
+    ```
+
+    The connection string that Merlin generates will then become:
+
+    ```bash
+    redis://:my-super-secret-password@server.domain.com:6379/0
+    ```
+
+##### Security With Redis
+
+Redis versions less than 6 do not natively support multiple users or SSL. We address security concerns here by redefining the core Celery routine that communicates with Redis to encrypt all data it sends to Redis and then decrypt anything it receives. Each user should have their own encryption key, as defined by `results_backend/encryption_key` in the app.yaml file. Merlin will generate a key if that key does not yet exist.
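+
+As a rough illustration of this encrypt-before-send idea, here's a minimal sketch using the [cryptography](https://cryptography.io/) package's Fernet recipe. This is not Merlin's actual implementation, and the helper names are hypothetical:
+
+```python
+from pathlib import Path
+
+from cryptography.fernet import Fernet
+
+# Per-user key location (matches the app.yaml default shown above)
+KEY_FILE = Path("~/.merlin/encrypt_data_key").expanduser()
+
+
+def get_key() -> bytes:
+    """Load the per-user encryption key, generating one on first use."""
+    if not KEY_FILE.exists():
+        KEY_FILE.write_bytes(Fernet.generate_key())
+    return KEY_FILE.read_bytes()
+
+
+def encrypt_payload(payload: bytes) -> bytes:
+    """Encrypt a result payload before it is sent to Redis."""
+    return Fernet(get_key()).encrypt(payload)
+
+
+def decrypt_payload(token: bytes) -> bytes:
+    """Decrypt a payload read back from Redis."""
+    return Fernet(get_key()).decrypt(token)
+```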
+
+##### Security With Rediss
+
+When using Redis with ssl, aka rediss (only available with Redis v6.0.0+), there are some additional keys that you'll need to add to your `results_backend` section:
+
+```yaml
+results_backend:
+    name: rediss
+    password: ~/.merlin/redis.pass
+    server: server.domain.com
+    port: 6379
+    db_num: 0
+    encryption_key: ~/.merlin/encrypt_data_key
+
+    # ssl security
+    keyfile: /var/ssl/private/client-key.pem
+    certfile: /var/ssl/amqp-server-cert.pem
+    ca_certs: /var/ssl/myca.pem
+    # This is optional and can be required, optional or none
+    # (required is the default)
+    cert_reqs: required
+```
+
+The ssl config for redis (rediss) results backends is then placed in the `redis_backend_use_ssl` celery argument:
+
+```python
+redis_backend_use_ssl = {
+    'ssl_keyfile': '/var/ssl/private/client-key.pem',
+    'ssl_certfile': '/var/ssl/amqp-server-cert.pem',
+    'ssl_ca_certs': '/var/ssl/myca.pem',
+    'ssl_cert_reqs': ssl.CERT_REQUIRED
+}
+```
+
+#### MySQL
+
+Coming soon!
+
+### Resolving The Results Backend Password Field
+
+The `results_backend/password` field is interpreted in the following way. First, it is treated as an absolute path to a file containing your backend password. If that path doesn't exist, it then looks for a file of that name under the directory defined under `celery/certs`. If that file doesn't exist, it then treats `results_backend/password` as the password itself.
+
+### Results Backend URL
+
+A `url` option is available to specify the results backend connection url, in which case the `server` setting is ignored. The url must include the entire connection url, including the ssl configuration.
+
+Using the `url` setting, Merlin will construct a connection string of the form:
+
+```bash
+{url}
+```
+
+!!! example
+
+    Say we use the default Redis server for our results backend:
+
+    ```yaml title="~/.merlin/app.yaml"
+    results_backend:
+        url: redis://localhost:6379/0
+    ```
+
+    The connection string that Merlin generates will then become:
+
+    ```bash
+    redis://localhost:6379/0
+    ```
+
+The `url` option can also be used to define a server that is not explicitly handled by the merlin configuration system.
+
+!!! example
+
+    Say we have a PostgreSQL database that we want to connect to. We can simply copy/paste the url into the `results_backend` section:
+
+    ```yaml title="~/.merlin/app.yaml"
+    results_backend:
+        url: db+postgresql://scott:tiger@localhost/mydatabase
+    ```
diff --git a/docs/user_guide/configuration/index.md b/docs/user_guide/configuration/index.md
new file mode 100644
index 000000000..e8a551da9
--- /dev/null
+++ b/docs/user_guide/configuration/index.md
@@ -0,0 +1,266 @@
+# Configuration
+
+!!! note
+
+    Merlin works best when [Celery](https://docs.celeryq.dev/en/stable/index.html) is configured to run with a [RabbitMQ](https://www.rabbitmq.com/) broker and a [Redis](https://redis.io/) backend. Merlin uses Celery chords, which require a results backend to be configured. The amqp (rpc RabbitMQ) backend does not support chords, but Redis, database, Memcached, and other backends do.
+
+The [Celery](https://docs.celeryq.dev/en/stable/index.html) library provides several ways to configure both your broker and your results backend. This page will go over why configuration is necessary and will detail the different configurations that Merlin supports.
+
+## Why is Configuration Necessary?
+
+As explained in the [User Guide Landing Page](../index.md), Merlin uses a central server to store tasks in a queue which workers will manage. 
To establish this functionality, Merlin uses the [Celery](https://docs.celeryq.dev/en/stable/index.html) library. Because of this, Merlin requires users to configure a broker and results backend.
+
+### What is a Broker?
+
+A broker is a message queue that acts as an intermediary between the sender of a task and the worker processes that execute the task. It facilitates the communication between different parts of a distributed system by passing messages (tasks) from producers (the code that generates tasks) to consumers (worker processes that execute tasks).
+
+The broker is responsible for queuing the tasks and delivering them to the appropriate worker processes. It allows for the decoupling of the task producer and the task consumer, enabling a more scalable and flexible architecture.
+
+[Celery supports various message brokers](https://docs.celeryq.dev/en/stable/getting-started/backends-and-brokers/index.html), including [RabbitMQ](https://www.rabbitmq.com/), [Redis](https://redis.io/), and others. You can configure Celery to use a specific broker based on your requirements (although we suggest using RabbitMQ).
+
+See the [Configuring the Broker and Results Backend](#configuring-the-broker-and-results-backend) section below for more information on configuring your broker.
+
+### What is a Results Backend?
+
+The results backend is a storage system where the results of executed tasks are stored. After a task is executed by a worker, the result is stored in the results backend, and the original task sender can retrieve the result later.
+
+The results backend enables the asynchronous nature of Celery. Instead of blocking and waiting for a task to complete, the sender can continue with other work and later retrieve the result from the results backend.
+
+[Celery supports various results backends](https://docs.celeryq.dev/en/stable/getting-started/backends-and-brokers/index.html), including databases (such as SQLAlchemy, Django ORM), message brokers (Redis, RabbitMQ), and others. You can configure Celery to use a specific results backend based on your requirements (although we suggest using Redis). However, since Merlin utilizes [Celery chords](https://docs.celeryq.dev/en/stable/userguide/canvas.html#chords) and the amqp (rpc RabbitMQ) server does not support chords, we cannot use RabbitMQ as a results backend.
+
+See the [Configuring the Broker and Results Backend](#configuring-the-broker-and-results-backend) section below for more information on configuring your results backend.
+
+## The app.yaml File
+
+In order to read in configuration options for your Celery settings, broker, and results backend, Merlin utilizes an app.yaml file.
+
+There's a built-in command with Merlin to set up a skeleton app.yaml for you:
+
+```bash
+merlin config
+```
+
+This command will create an app.yaml file in the `~/.merlin/` directory that looks like so:
+
+
+[app.yaml](../../../merlin/data/celery/app.yaml)
+
+
+As you can see, there are three key sections to Merlin's app.yaml file: `celery`, `broker`, and `results_backend`. The rest of this page will go into more depth on each.
+
+## The Celery Section
+
+In the `celery` section of your app.yaml, you can override any Celery settings that you may want to change.
+
+Merlin's default Celery configurations are as follows:
+
+??? 
abstract "Default Celery Configuration" + + ```yaml + accept_content: ['pickle'] # DO NOT MODIFY + result_accept_content: None # DO NOT MODIFY + enable_utc: True + imports: () + include: () + timezone: None + beat_max_loop_interval: 0 + beat_schedule: {} + beat_scheduler: celery.beat:PersistentScheduler + beat_schedule_filename: celerybeat-schedule + beat_sync_every: 0 + beat_cron_starting_deadline: None + broker_url: + broker_read_url: None + broker_write_url: None + broker_transport: None + broker_transport_options: {'visibility_timeout': 86400, 'max_connections': 100} + broker_connection_timeout: 4 + broker_connection_retry: True + broker_connection_retry_on_startup: None + broker_connection_max_retries: 100 + broker_channel_error_retry: False + broker_failover_strategy: None + broker_heartbeat: 120 + broker_heartbeat_checkrate: 3.0 + broker_login_method: None + broker_pool_limit: 0 + broker_use_ssl: + broker_host: + broker_port: + broker_user: + broker_password: + broker_vhost: + cache_backend: None + cache_backend_options: {} + cassandra_entry_ttl: None + cassandra_keyspace: None + cassandra_port: None + cassandra_read_consistency: None + cassandra_servers: None + cassandra_bundle_path: None + cassandra_table: None + cassandra_write_consistency: None + cassandra_auth_provider: None + cassandra_auth_kwargs: None + cassandra_options: {} + s3_access_key_id: None + s3_secret_access_key: None + s3_bucket: None + s3_base_path: None + s3_endpoint_url: None + s3_region: None + azureblockblob_container_name: celery + azureblockblob_retry_initial_backoff_sec: 2 + azureblockblob_retry_increment_base: 2 + azureblockblob_retry_max_attempts: 3 + azureblockblob_base_path: + azureblockblob_connection_timeout: 20 + azureblockblob_read_timeout: 120 + control_queue_ttl: 300.0 + control_queue_expires: 10.0 + control_exchange: celery # DO NOT MODIFY + couchbase_backend_settings: None + arangodb_backend_settings: None + mongodb_backend_settings: None + cosmosdbsql_database_name: celerydb + cosmosdbsql_collection_name: celerycol + cosmosdbsql_consistency_level: Session + cosmosdbsql_max_retry_attempts: 9 + cosmosdbsql_max_retry_wait_time: 30 + event_queue_expires: 60.0 + event_queue_ttl: 5.0 + event_queue_prefix: celeryev + event_serializer: json # DO NOT MODIFY + event_exchange: celeryev # DO NOT MODIFY + redis_backend_use_ssl: + redis_db: + redis_host: + redis_max_connections: 100000 + redis_username: + redis_password: + redis_port: + redis_socket_timeout: 120.0 + redis_socket_connect_timeout: None + redis_retry_on_timeout: False + redis_socket_keepalive: False + result_backend: + result_cache_max: -1 + result_compression: None + result_exchange: celeryresults + result_exchange_type: direct + result_expires: 1 day, 0:00:00 + result_persistent: None + result_extended: False + result_serializer: pickle # DO NOT MODIFY + result_backend_transport_options: {} + result_chord_retry_interval: 1.0 + result_chord_join_timeout: 3.0 + result_backend_max_sleep_between_retries_ms: 10000 + result_backend_max_retries: inf + result_backend_base_sleep_between_retries_ms: 10 + result_backend_always_retry: False + elasticsearch_retry_on_timeout: None + elasticsearch_max_retries: None + elasticsearch_timeout: None + elasticsearch_save_meta_as_text: True + security_certificate: None + security_cert_store: None + security_key: None + security_key_password: None + security_digest: sha256 + database_url: None + database_engine_options: None + database_short_lived_sessions: False + database_table_schemas: None + 
database_table_names: None + task_acks_late: True # DO NOT MODIFY + task_acks_on_failure_or_timeout: True # DO NOT MODIFY + task_always_eager: False # DO NOT MODIFY + task_annotations: None # DO NOT MODIFY + task_compression: None # DO NOT MODIFY + task_create_missing_queues: True # DO NOT MODIFY + task_inherit_parent_priority: False # DO NOT MODIFY + task_default_delivery_mode: 2 # DO NOT MODIFY + task_default_queue: merlin # DO NOT MODIFY + task_default_exchange: None # DO NOT MODIFY + task_default_exchange_type: direct # DO NOT MODIFY + task_default_routing_key: None # DO NOT MODIFY + task_default_rate_limit: None + task_default_priority: 5 # DO NOT MODIFY + task_eager_propagates: False + task_ignore_result: False + task_store_eager_result: False + task_protocol: 2 # DO NOT MODIFY + task_publish_retry: True + task_publish_retry_policy: {'interval_start': 10, 'interval_step': 10, 'interval_max': 60} + task_queues: None # DO NOT MODIFY + task_queue_max_priority: 10 # DO NOT MODIFY + task_reject_on_worker_lost: True # DO NOT MODIFY + task_remote_tracebacks: False + task_routes: (,) # DO NOT MODIFY + task_send_sent_event: False + task_serializer: pickle # DO NOT MODIFY + task_soft_time_limit: None + task_time_limit: None + task_store_errors_even_if_ignored: False + task_track_started: False + task_allow_error_cb_on_chord_header: False + worker_agent: None # DO NOT MODIFY + worker_autoscaler: celery.worker.autoscale:Autoscaler # DO NOT MODIFY + worker_cancel_long_running_tasks_on_connection_loss: True + worker_concurrency: None # DO NOT MODIFY; this will be set on a worker-by-worker basis that you can customize in your spec file + worker_consumer: celery.worker.consumer:Consumer # DO NOT MODIFY + worker_direct: False # DO NOT MODIFY + worker_disable_rate_limits: False + worker_deduplicate_successful_tasks: False + worker_enable_remote_control: True + worker_hijack_root_logger: True + worker_log_color: True + worker_log_format: [%(asctime)s: %(levelname)s] %(message)s + worker_lost_wait: 10.0 + worker_max_memory_per_child: None + worker_max_tasks_per_child: None + worker_pool: prefork + worker_pool_putlocks: True # DO NOT MODIFY + worker_pool_restarts: False + worker_proc_alive_timeout: 4.0 + worker_prefetch_multiplier: 4 # this can be modified on a worker-by-worker basis in your spec file + worker_redirect_stdouts: True # DO NOT MODIFY + worker_redirect_stdouts_level: WARNING + worker_send_task_events: False + worker_state_db: None + worker_task_log_format: [%(asctime)s: %(levelname)s] [%(task_name)s(%(task_id)s)] %(message)s + worker_timer: None + worker_timer_precision: 1.0 + deprecated_settings: set() + visibility_timeout: 86400 + ``` + +See [Celery's Configuration Settings](https://docs.celeryq.dev/en/stable/userguide/configuration.html#new-lowercase-settings) for more information on each of these settings. + +Overriding these settings is as simple as listing a new key-value pair in the `celery.override` section of your app.yaml. + +!!! example + + To change the `visibility_timeout` and `broker_pool_limit` settings, we'd modify the `celery.override` section of our app.yaml like so: + + ```yaml + celery: + override: + broker_pool_limit: 10 + visibility_timeout: 75000 + ``` + +## Configuring the Broker and Results Backend + +When it comes to configuring the `broker` and `results_backend` sections of your `app.yaml` file, configuration will depend on the type of user you are and what type of servers you wish to use. 
+
+For Livermore Computing (LC) users, we recommend configuring with either:
+
+- [Dedicated LaunchIT Servers](https://lc.llnl.gov/confluence/display/MERLIN/LaunchIT+Configuration)
+- [Containerized Servers](./merlin_server.md)
+
+For all other users, we recommend configuring with either:
+
+- [Dedicated External Servers](./external_server.md)
+- [Containerized Servers](./merlin_server.md)
diff --git a/docs/user_guide/configuration/merlin_server.md b/docs/user_guide/configuration/merlin_server.md
new file mode 100644
index 000000000..39cf1327d
--- /dev/null
+++ b/docs/user_guide/configuration/merlin_server.md
@@ -0,0 +1,177 @@
+# Merlin Server Configuration
+
+!!! warning
+
+    It's recommended that you read through the [Configuration Overview](./index.md) page before proceeding with this module.
+
+The `merlin server` command allows users easy access to containerized broker and results servers for Merlin workflows. This allows users to run Merlin without a dedicated external server.
+
+The main configuration will be stored by default in the `server/` subdirectory of the main Merlin configuration directory, `~/.merlin`. However, different server images can be created for different use cases or studies by simply creating a new directory to store local configuration files for Merlin server instances.
+
+This module will walk through how to initialize the server, start it, and ensure it's linked to Merlin.
+
+## Initializing the Server
+
+First, create and navigate into a directory to store your local Merlin configuration for a specific use case or study:
+
+```bash
+mkdir study1/ ; cd study1/
+```
+
+Afterwards, you can instantiate a Merlin server in this directory by running:
+
+```bash
+merlin server init
+```
+
+A main server configuration will be created in the `~/.merlin/server/` directory. This will have the following files:
+
+- `docker.yaml`
+- `merlin_server.yaml`
+- `podman.yaml`
+- `singularity.yaml`
+
+The main configuration in `~/.merlin/server/` deals with defaults and technical commands that might be used for setting up the Merlin server local configuration and its containers. Each container type has its own configuration file so that users can switch freely between different containerized services.
+
+In addition to the main server configuration, a local server configuration will be created in your current working directory in a folder called `merlin_server/`. This directory will contain:
+
+- `redis.conf`: The Redis configuration file that contains all of the settings to be used for our Redis server
+- `redis.pass`: A password for the Redis server that we'll start up next
+- `redis.users`: A file defining the users that are allowed to access the Redis server and their permissions
+- `redis_latest.sif`: A singularity file that contains the latest Redis Docker image that was pulled behind the scenes by Merlin
+
+The local configuration `merlin_server/` folder contains configuration files specific to a certain use case or run. In the case above, you can see that we have a Redis singularity container called `redis_latest.sif` with the Redis configuration file called `redis.conf`. This Redis configuration allows the user to tailor Redis to their needs without having to manage or edit the Redis container itself. When the server is run, this configuration is read dynamically, so settings can be changed between runs if needed.
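+
+For instance, a couple of common Redis settings one might adjust in the generated `redis.conf` between runs could look like the following (these are standard Redis directives; the values shown are purely illustrative):
+
+```bash title="merlin_server/redis.conf"
+port 6379          # port the containerized Redis server listens on
+timeout 0          # never close idle client connections
+maxmemory 100mb    # cap the memory the Redis server may use
+```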
+
+Once the Merlin server has been initialized in the local directory, the user can run other Merlin server commands, such as `start`, `status`, and `stop`, to interact with the Merlin server. A detailed list of commands can be found in the [Merlin Server](../command_line.md#server-merlin-server) section of the [Command Line](../command_line.md) page.
+
+!!! note
+
+    Running `merlin server init` again will *not* override any existing configuration that the user might have set or edited; it will only create any missing files with the existing defaults. *However,* it is highly advised that users back up their configuration in case an error occurs where configuration files are overwritten.
+
+## Starting the Server and Linking it to Merlin
+
+!!! bug
+
+    For LC users, servers cannot be started outside your home (`~/`) directory.
+
+!!! warning
+
+    Newer versions of Redis require the environment variable `LC_ALL` to be set in order for this to work. To set this properly, run:
+
+    ```bash
+    export LC_ALL="C"
+    ```
+
+    If this is not set, the `merlin server start` command may seem to run forever until you manually terminate it.
+
+After initializing the server, starting the server is as simple as running:
+
+```bash
+merlin server start
+```
+
+You can check that the server was started properly with:
+
+```bash
+merlin server status
+```
+
+The `merlin server start` command will add new files to the local configuration `merlin_server/` folder:
+
+- `merlin_server.pf`: A process file containing information regarding the Redis process
+- `app.yaml`: A new app.yaml file configured specifically for the containerized Redis server that we just started
+
+To have Merlin read this server configuration:
+
+=== "Copy Configuration to CWD"
+
+    ```bash
+    cp merlin_server/app.yaml . 
+    ```
+
+=== "Make This Server Configuration Your Main Configuration"
+
+    If you're going to use the server configuration as your main configuration, it's a good idea to make a backup of your current server configuration (if you have one):
+
+    ```bash
+    mv ~/.merlin/app.yaml ~/.merlin/app.yaml.bak
+    ```
+
+    From here, simply copy the server configuration to your `~/.merlin/` folder:
+
+    ```bash
+    cp merlin_server/app.yaml ~/.merlin/app.yaml
+    ```
+
+You can check that Merlin recognizes the containerized server connection with:
+
+```bash
+merlin info
+```
+
+If your servers are running and set up properly, this should output something similar to this:
+
+???+ success

+    ```bash
+           *
+       *~~~~~
+      *~~*~~~*      __  __           _ _
+     /   ~~~~~     |  \/  |         | (_)
+         ~~~~~     | \  / | ___ _ __| |_ _ __
+        ~~~~~*     | |\/| |/ _ \ '__| | | '_ \
+       *~~~~~~~    | |  | |  __/ |  | | | | | |
+      ~~~~~~~~~~   |_|  |_|\___|_|  |_|_|_| |_|
+     *~~~~~~~~~~~
+       ~~~*~~~*    Machine Learning for HPC Workflows
+
+
+
+    Merlin Configuration
+    -------------------------
+
+     config_file        | /path/to/app.yaml
+     is_debug           | False
+     merlin_home        | /path/to/.merlin
+     merlin_home_exists | True
+     broker server      | redis://default:******@127.0.0.1:6379/0
+     broker ssl         | False
+     results server     | redis://default:******@127.0.0.1:6379/0
+     results ssl        | False
+
+    Checking server connections:
+    ----------------------------
+    broker server connection: OK
+    results server connection: OK
+
+    Python Configuration
+    -------------------------
+
+     $ which python3
+    /path/to/python3
+
+     $ python3 --version
+    Python x.y.z
+
+     $ which pip3
+    /path/to/pip3
+
+     $ pip3 --version
+    pip x.y.z from /path/to/pip (python x.y)
+
+    "echo $PYTHONPATH"
+    ```
+
+## Stopping the Server
+
+Once you're done using your containerized server, it can be stopped with:
+
+```bash
+merlin server stop
+```
+
+You can check that it's no longer running with:
+
+```bash
+merlin server status
+```
diff --git a/docs/user_guide/contributing.md b/docs/user_guide/contributing.md
new file mode 100644
index 000000000..922baad76
--- /dev/null
+++ b/docs/user_guide/contributing.md
@@ -0,0 +1,193 @@
+# Contributing
+
+Welcome to the Merlin developer documentation! This module provides instructions for contributing to Merlin.
+
+## Getting Started
+
+Follow the [Developer Setup](./installation.md#developer-setup) documentation to set up your Merlin development environment (we recommend using the [Make Setup](./installation.md#make-setup)).
+
+Once your development environment is set up, ensure you're on the development branch:
+
+```bash
+git checkout develop
+```
+
+Then create a new branch for whatever you're working on. Typically branch names will start with one of the following prefixes: `feature`, `bugfix`, `refactor`, or `docs`. The following command will create a new branch for you and switch to it:
+
+```bash
+git switch -c <branch prefix>/<your branch name>
+```
+
+Merlin follows a [gitflow workflow](https://www.atlassian.com/git/tutorials/comparing-workflows/gitflow-workflow). Updates to the develop branch are made via pull requests.
+
+## Developer Guide
+
+This section provides Merlin's guide for contributing features/bugfixes to Merlin.
+
+### Pull Request Checklist
+
+!!! warning
+
+    All pull requests must pass `make tests` prior to consideration!
+
+To expedite review, please ensure that pull requests...
+
+- Are from a meaningful branch name (e.g. 
`feature/cool_thing`)
+
+- Are being merged into the [appropriate branch](https://www.atlassian.com/git/tutorials/comparing-workflows/gitflow-workflow) (likely [Merlin's develop branch](https://github.com/LLNL/merlin/tree/develop))
+
+- Include testing for any new features
+
+    - unit tests in `tests/unit`
+    - integration tests in `tests/integration`
+
+- Include descriptions of the changes
+
+    - a summary in the pull request
+    - details in the `[Unreleased]` section of the `CHANGELOG.md`
+
+- Ran `make fix-style` to adhere to style guidelines
+
+    - it's best practice to run `make check-style` too, to ensure no further linter changes need to be manually fixed
+
+- Pass `make tests`; output included in pull request
+
+### Testing
+
+All pull requests must pass unit and integration tests. To ensure that they do, run:
+
+```bash
+make tests
+```
+
+All pull requests that include bugfixes or new features must have new tests in `tests/unit` and/or `tests/integration`. See the [README](https://github.com/LLNL/merlin/blob/develop/tests/README.md) in the test suite for instructions on how to write your tests. You can also view the [Reference Guide](../api_reference/index.md) to see API docs for the test suite.
+
+### Python Code Style Guide
+
+This section documents Merlin's style guide. Unless otherwise specified, [PEP-8](https://www.python.org/dev/peps/pep-0008/) is the preferred coding style, along with [PEP-0257](https://www.python.org/dev/peps/pep-0257/) for docstrings.
+
+!!! note
+
+    Running the following command from the root of the Merlin repository should automatically fix most styling issues:
+
+    ```bash
+    make fix-style
+    ```
+
+Merlin has style checkers configured. They can be run from the Makefile:
+
+```bash
+make check-style
+```
+
+### Adding New Features to YAML Spec File
+
+!!! note "Block vs Property"
+
+    To provide clarity for the following section, we need to discuss what's meant by a "block" vs a "property" of the [YAML spec](./specification.md).
+
+    **Block:** A block is anything at the first level of a YAML spec. Merlin comes equipped with 7 blocks: `description`, `environment`, `global.parameters`, `batch`, `study`, `merlin`, and `user`.
+
+    **Property:** A property is any keyword defined within a block.
+
+    Here's an example:
+
+    ```yaml
+    description:                                    # Block - description
+        name: hello                                 # Property - name
+        description: a very simple merlin workflow  # Property - description
+
+    global.parameters:                              # Block - global.parameters
+        GREET:                                      # Property - GREET
+            values : ["hello","hola"]               # Property - values
+            label  : GREET.%%                       # Property - label
+        WORLD:                                      # Property - WORLD
+            values : ["world","mundo"]              # Property - values
+            label  : WORLD.%%                       # Property - label
+    ```
+
+In order to conform to Maestro's verification format introduced in Maestro v1.1.7, we now use [json schema](https://json-schema.org/) validation to verify our spec file.
+
+If you are adding a new feature to Merlin that requires a new block within the yaml spec file, or a new property within a block, then you will need to update the `merlinspec.json` file located in the `merlin/spec/` directory. You may also want to add additional verifications within the `specification.py` file located in the same directory.
+
+!!! note
+
+    If you add custom verifications beyond the pattern checking that the json schema checks for, then you should also add tests for this verification in the `test_specification.py` file located in the `merlin/tests/unit/spec/` directory. 
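+
+For context, the schema check itself is what the [jsonschema](https://pypi.org/project/jsonschema/) package provides. As a minimal, hypothetical sketch (the sample block and error handling here are illustrative, not Merlin's actual code), validating a `description` block against the `DESCRIPTION` entry of `merlinspec.json` could look like:
+
+```python
+import json
+
+from jsonschema import ValidationError, validate
+
+# Load the schema for one block from merlinspec.json
+with open("merlin/spec/merlinspec.json", "r") as schema_file:
+    description_schema = json.load(schema_file)["DESCRIPTION"]
+
+block = {"name": "hello", "description": "a very simple merlin workflow"}
+
+try:
+    validate(instance=block, schema=description_schema)  # raises on a schema mismatch
+except ValidationError as err:
+    print(f"Spec validation failed: {err.message}")
+```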
+
+#### Adding a New Property
+
+To add a new property to a block in the yaml file, you need to create a template for that property and place it in the correct block in `merlinspec.json`.
+
+!!! tip
+
+    For help with json schema formatting, check out the [step-by-step getting started guide](https://json-schema.org/learn/getting-started-step-by-step.html).
+
+!!! example
+
+    Say I wanted to add a new property called `example` that's an integer within the `description` block. I would modify the `description` block in the `merlinspec.json` file to look like so:
+
+    ```json title="merlinspec.json" hl_lines="7"
+    {
+      "DESCRIPTION": {
+        "type": "object",
+        "properties": {
+          "name": {"type": "string", "minLength": 1},
+          "description": {"type": "string", "minLength": 1},
+          "example": {"type": "integer", "minimum": 1}
+        },
+        "required": ["name", "description"]
+      }
+      .
+      .
+      .
+    }
+    ```
+
+That's all that's required for adding a new property. If you want to add your own custom verifications, make sure to create [unit tests](#testing) for them.
+
+#### Adding a New Block
+
+Adding a new block is slightly more complicated than adding a new property. You will not only have to update the `merlinspec.json` schema file but also add calls to verify that block within `specification.py`.
+
+To add a block to the json schema, you will need to define the template for that entire block.
+
+!!! tip
+
+    For help with json schema formatting, check out the [step-by-step getting started guide](https://json-schema.org/learn/getting-started-step-by-step.html).
+
+!!! example
+
+    Say I wanted to create a block called `country` with two properties labeled `name` and `population` that are both required. It would look like so:
+
+    ```json title="merlinspec.json"
+    {
+      .
+      .
+      .
+      "COUNTRY": {
+        "type": "object",
+        "properties": {
+          "name": {"type": "string", "minLength": 1},
+          "population": {
+            "anyOf": [
+              {"type": "string", "minLength": 1},
+              {"type": "integer", "minimum": 1}
+            ]
+          }
+        },
+        "required": ["name", "population"]
+      }
+    }
+    ```
+
+    Here, `name` can only be a string but `population` can be either a string or an integer.
+
+The next step is to enable this block in the schema validation process of `specification.py`. To do this we need to:
+
+1. Create a new method called `verify_<block name>()` within the `MerlinSpec` class
+2. Call the `YAMLSpecification.validate_schema()` method provided to us via [Maestro](https://github.com/LLNL/maestrowf/blob/develop/maestrowf/specification/yamlspecification.py#L400) in your new method
+3. Add a call to `verify_<block name>()` inside the `verify()` method
+
+If you add your own custom verifications on top of this, please add [unit tests](#testing) for them.
diff --git a/docs/user_guide/docker.md b/docs/user_guide/docker.md
new file mode 100644
index 000000000..4b4789dea
--- /dev/null
+++ b/docs/user_guide/docker.md
@@ -0,0 +1,332 @@
+# Docker
+
+Merlin has a simple Dockerfile description for running a container with all requirements installed.
+
+## Build the Container
+
+The Docker container can be built from the top-level Merlin directory:
+
+```bash
+docker build -t merlin .
+```
+
+This will create a `merlin:latest` image in your docker image collection with a user "merlinu" and a WORKDIR set to `/home/merlinu`. You can verify that the image was created by listing your Docker images:
+
+```bash
+docker images
+```
+
+## Run the Container
+
+Here we'll discuss:
+
+- Starting a broker and results backend server
+- Pulling and running Merlin
+- Linking Merlin to the broker and results backend server
+- Aliasing the docker `merlin` and `celery` commands
+- Running an example
+
+### Starting a Broker/Results Server
+
+Before we can run Merlin, we first need to start servers for the broker and results backend (see [Why is Configuration Necessary?](./configuration/index.md#why-is-configuration-necessary)). The broker server can be either Redis or RabbitMQ. For this demonstration a Redis server will be used. The results backend will always be a Redis server.
+
+Pull the Redis image:
+
+```bash
+docker pull redis
+```
+
+Name and run the Redis container:
+
+```bash
+docker run -d -p 6379:6379 --name my-redis redis
+```
+
+Our Redis server that we'll use for both the broker and results backend is now running.
+
+### Starting Merlin
+
+Now that our Redis server is set up, all that's left is to start Merlin and link the server.
+
+First, let's create a local working directory:
+
+```bash
+mkdir $HOME/merlinu ; cd $HOME/merlinu
+```
+
+Next, pull the Merlin image:
+
+```bash
+docker pull llnl/merlin
+```
+
+!!! tip
+
+    A shell can be started in the container by using the `--entrypoint` option. If the user would like to examine the container contents, they can use a shell as the entry point.
+
+    ```bash
+    docker run --rm -ti --volume "$HOME/merlinu":/home/merlinu --entrypoint="/bin/bash" merlin
+    ```
+
+Now we'll run Merlin in detached mode to provide both the `merlin` and `celery` commands. Here we'll also link the Redis server we started with the `--link` option and provide a local output directory with the `--volume` argument (it's recommended that a fixed directory be used for `--volume`):
+
+```bash
+docker run --rm -td --name my-merlin --link my-redis --volume "$HOME/merlinu":/home/merlinu llnl/merlin
+```
+
+To finish linking the server, edit `$HOME/merlinu/.merlin/app.yaml` so that the broker and results backend `server` variables are `my-redis`.
+
+Finally, we can alias the docker commands for `merlin` and `celery` to make things easier for us:
+
+```bash
+alias merlin="docker exec my-merlin merlin" ; alias celery="docker exec my-merlin celery"
+```
+
+Congratulations, Merlin is now running!
+
+### Running an Example
+
+To test that Merlin is running properly, we can grab a [built-in Merlin example](../examples/index.md) and try running it. For this demonstration we'll get the [feature_demo example](../examples/feature_demo.md) with:
+
+```bash
+merlin example feature_demo
+```
+
+We can first do a dry run without workers:
+
+```bash
+merlin run feature_demo/feature_demo.yaml --dry --local
+```
+
+If this ran successfully, you should see an output directory named `studies/feature_demo_`. 
Then, running + +```bash +tree studies/feature_demo_ +``` + +...should provide the following output: + +???+ success + + ```bash + studies/feature_demo_/ + ├── collect + │ └── X2.0.5 + │ └── collect_X2.0.5.sh + ├── hello + │ └── X2.0.5 + │ ├── 00 + │ │ └── hello_X2.0.5.sh + │ ├── 01 + │ │ └── hello_X2.0.5.sh + │ ├── 02 + │ │ └── hello_X2.0.5.sh + │ ├── 03 + │ │ └── hello_X2.0.5.sh + │ ├── 04 + │ │ └── hello_X2.0.5.sh + │ ├── 05 + │ │ └── hello_X2.0.5.sh + │ ├── 06 + │ │ └── hello_X2.0.5.sh + │ ├── 07 + │ │ └── hello_X2.0.5.sh + │ ├── 08 + │ │ └── hello_X2.0.5.sh + │ └── 09 + │ └── hello_X2.0.5.sh + ├── learn + │ └── X2.0.5 + │ └── learn_X2.0.5.sh + ├── make_new_samples + │ └── N_NEW.10 + │ └── make_new_samples_N_NEW.10.sh + ├── merlin_info + │ ├── cmd.err + │ ├── cmd.out + │ ├── cmd.sh + │ ├── feature_demo.expanded.yaml + │ ├── feature_demo.orig.yaml + │ ├── feature_demo.partial.yaml + │ ├── samples.npy + │ └── scripts + │ ├── features.json + │ ├── hello_world.py + │ ├── pgen.py + │ └── __pycache__ + │ └── pgen.cpython-310.pyc + ├── predict + │ └── N_NEW.10.X2.0.5 + │ └── predict_N_NEW.10.X2.0.5.sh + ├── python2_hello + │ └── X2.0.5 + │ └── python2_hello_X2.0.5.sh + ├── python3_hello + │ └── X2.0.5 + │ └── python3_hello_X2.0.5.sh + ├── translate + │ └── X2.0.5 + │ └── translate_X2.0.5.sh + └── verify + └── N_NEW.10.X2.0.5 + └── verify_N_NEW.10.X2.0.5.sh + ``` + +Now that we know a dry run works properly, we can try a real run with workers. To do this, run the following two commands ([`merlin run`](./command_line.md#run-merlin-run) and [`merlin run-workers`](./command_line.md#run-workers-merlin-run-workers)) in any order you choose: + +=== "Queue Tasks" + + Define the tasks and load them on the broker with: + + ```bash + merlin run feature_demo/feature_demo.yaml + ``` + +=== "Start Workers" + + Start workers to pull tasks from the server and run them in the container with: + + ```bash + merlin run-workers feature_demo/feature_demo.yaml + ``` + +Once all tasks are done processing, running the tree command on the workspace should show: + +???+ success + + ```bash + studies/feature_demo_20240108-091708/ + ├── collect + │ └── X2.0.5 + │ ├── collect_X2.0.5.err + │ ├── collect_X2.0.5.out + │ ├── collect_X2.0.5.sh + │ ├── files_to_collect.txt + │ ├── MERLIN_FINISHED + │ └── results.json + ├── hello + │ └── X2.0.5 + │ ├── 00 + │ │ ├── hello_world_output_0.json + │ │ ├── hello_X2.0.5.err + │ │ ├── hello_X2.0.5.out + │ │ ├── hello_X2.0.5.sh + │ │ └── MERLIN_FINISHED + │ ├── 01 + │ │ ├── hello_world_output_1.json + │ │ ├── hello_X2.0.5.err + │ │ ├── hello_X2.0.5.out + │ │ ├── hello_X2.0.5.sh + │ │ └── MERLIN_FINISHED + │ ├── 02 + │ │ ├── hello_world_output_2.json + │ │ ├── hello_X2.0.5.err + │ │ ├── hello_X2.0.5.out + │ │ ├── hello_X2.0.5.sh + │ │ └── MERLIN_FINISHED + │ ├── 03 + │ │ ├── hello_world_output_3.json + │ │ ├── hello_X2.0.5.err + │ │ ├── hello_X2.0.5.out + │ │ ├── hello_X2.0.5.sh + │ │ └── MERLIN_FINISHED + │ ├── 04 + │ │ ├── hello_world_output_4.json + │ │ ├── hello_X2.0.5.err + │ │ ├── hello_X2.0.5.out + │ │ ├── hello_X2.0.5.sh + │ │ └── MERLIN_FINISHED + │ ├── 05 + │ │ ├── hello_world_output_5.json + │ │ ├── hello_X2.0.5.err + │ │ ├── hello_X2.0.5.out + │ │ ├── hello_X2.0.5.sh + │ │ └── MERLIN_FINISHED + │ ├── 06 + │ │ ├── hello_world_output_6.json + │ │ ├── hello_X2.0.5.err + │ │ ├── hello_X2.0.5.out + │ │ ├── hello_X2.0.5.sh + │ │ └── MERLIN_FINISHED + │ ├── 07 + │ │ ├── hello_world_output_7.json + │ │ ├── hello_X2.0.5.err + │ │ ├── hello_X2.0.5.out + │ │ ├── hello_X2.0.5.sh + │ │ └── 
MERLIN_FINISHED + │ ├── 08 + │ │ ├── hello_world_output_8.json + │ │ ├── hello_X2.0.5.err + │ │ ├── hello_X2.0.5.out + │ │ ├── hello_X2.0.5.sh + │ │ └── MERLIN_FINISHED + │ └── 09 + │ ├── hello_world_output_9.json + │ ├── hello_X2.0.5.err + │ ├── hello_X2.0.5.out + │ ├── hello_X2.0.5.sh + │ └── MERLIN_FINISHED + ├── learn + │ └── X2.0.5 + │ ├── learn_X2.0.5.err + │ ├── learn_X2.0.5.out + │ ├── learn_X2.0.5.sh + │ ├── MERLIN_FINISHED + │ └── random_forest_reg.pkl + ├── make_new_samples + │ └── N_NEW.10 + │ ├── grid_10.npy + │ ├── make_new_samples_N_NEW.10.err + │ ├── make_new_samples_N_NEW.10.out + │ ├── make_new_samples_N_NEW.10.sh + │ └── MERLIN_FINISHED + ├── merlin_info + │ ├── cmd.err + │ ├── cmd.out + │ ├── cmd.sh + │ ├── feature_demo.expanded.yaml + │ ├── feature_demo.orig.yaml + │ ├── feature_demo.partial.yaml + │ ├── samples.npy + │ └── scripts + │ ├── features.json + │ ├── hello_world.py + │ ├── pgen.py + │ └── __pycache__ + │ └── pgen.cpython-310.pyc + ├── predict + │ └── N_NEW.10.X2.0.5 + │ ├── MERLIN_FINISHED + │ ├── prediction_10.npy + │ ├── predict_N_NEW.10.X2.0.5.err + │ ├── predict_N_NEW.10.X2.0.5.out + │ └── predict_N_NEW.10.X2.0.5.sh + ├── python2_hello + │ └── X2.0.5 + │ ├── MERLIN_FINISHED + │ ├── python2_hello_X2.0.5.err + │ ├── python2_hello_X2.0.5.out + │ └── python2_hello_X2.0.5.sh + ├── python3_hello + │ └── X2.0.5 + │ ├── MERLIN_FINISHED + │ ├── python3_hello_X2.0.5.err + │ ├── python3_hello_X2.0.5.out + │ └── python3_hello_X2.0.5.sh + ├── translate + │ └── X2.0.5 + │ ├── MERLIN_FINISHED + │ ├── results.npz + │ ├── translate_X2.0.5.err + │ ├── translate_X2.0.5.out + │ └── translate_X2.0.5.sh + └── verify + └── N_NEW.10.X2.0.5 + ├── FINISHED + ├── MERLIN_FINISHED + ├── verify_N_NEW.10.X2.0.5.err + ├── verify_N_NEW.10.X2.0.5.out + └── verify_N_NEW.10.X2.0.5.sh + ``` + +For more information on what's going on in this example, see the [Feature Demo Example page](../examples/feature_demo.md). diff --git a/docs/user_guide/index.md b/docs/user_guide/index.md new file mode 100644 index 000000000..990ab1424 --- /dev/null +++ b/docs/user_guide/index.md @@ -0,0 +1,23 @@ +# Merlin User Guide + +## What is Merlin? + +Merlin is a distributed task queuing system, designed to allow complex HPC workflows to scale to large numbers of simulations (we've done 100 Million on the Sierra Supercomputer). + +Why would you want to run that many simulations? To become your own Big Data generator. + +Data sets of this size can be large enough to train deep neural networks that can mimic your HPC application, to be used for such things as design optimization, uncertainty quantification and statistical experimental inference. Merlin's been used to study inertial confinement fusion, extreme ultraviolet light generation, structural mechanics and atomic physics, to name a few. + +## How Does Merlin Work? + +In essence, Merlin coordinates complex workflows through a persistent external queue server that lives outside of your HPC systems, but that can talk to nodes on your cluster(s). As jobs spin up across your ecosystem, workers on those allocations pull work from a central server, which coordinates the task dependencies for your workflow. Since this coordination is done via direct connections to the workers (i.e. not through a file system), your workflow can scale to very large numbers of workers, which means a very large number of simulations with very little overhead. 
+
+Furthermore, since the workers pull their instructions from the central server, you can do a lot of other neat things, like having multiple batch allocations contribute to the same work (think surge computing), or specializing workers to different machines (think CPU workers for your application and GPU workers that train your neural network). Another neat feature is that these workers can add more work back to the central server, which enables a variety of dynamic workflows, such as may be necessary for the intelligent sampling of design spaces or reinforcement learning tasks.
+
+Merlin does all of this by leveraging some key HPC and cloud computing technologies, building off open source components. It uses [Maestro](https://maestrowf.readthedocs.io/) to provide an interface for describing workflows, as well as for defining workflow task dependencies. It translates those dependencies into concrete tasks via [Celery](https://docs.celeryproject.org/), which can be configured for a variety of backend technologies ([RabbitMQ](https://www.rabbitmq.com) and [Redis](https://redis.io) are currently supported). Although not a hard dependency, we encourage the use of [Flux](http://flux-framework.org) for interfacing with HPC batch systems, since it can scale to a very large number of jobs.
+
+The integrated system looks a little something like this:
+
+![Merlin Architecture](../assets/images/merlin_arch.png)
+
+For more details, check out the rest of the user guide.
diff --git a/docs/user_guide/installation.md b/docs/user_guide/installation.md
new file mode 100644
index 000000000..d73d51db5
--- /dev/null
+++ b/docs/user_guide/installation.md
@@ -0,0 +1,271 @@
+# Installation
+
+The Merlin library can be installed by using [virtual environments and pip](#installing-with-virtual-environments--pip) or [Spack](#installing-with-spack).
+
+Contributors to Merlin should follow the [Developer Setup](#developer-setup) below.
+
+Once Merlin has been installed, the installation needs to be configured. See the [Configuration](./configuration/index.md) page for instructions on how to configure Merlin.
+
+## Installing With Virtual Environments & Pip
+
+The most common way to install Merlin is via pip. To accomplish this method of installation we'll need to:
+
+1. Set up and activate a virtual environment
+2. Install Merlin with pip
+
+### Creating A Virtual Environment
+
+We'll be creating a Python [virtual environment](https://virtualenv.pypa.io/en/stable/) in this section. To do this, run:
+
+```bash
+python3 -m venv venv
+```
+
+!!! warning
+
+    A virtual environment will need to be created for each system type. It's recommended to name the virtual environment `venv_` to make it easier to switch between them. This documentation will use `venv` for simplicity to reference the virtual environment.
+
+!!! tip
+
+    Virtual environments provide an isolated environment for working on Python projects to avoid dependency conflicts.
+
+### Activating A Virtual Environment
+
+Once the virtual environment is created it can be activated like so:
+
+=== "bash"
+
+    ```bash
+    source venv/bin/activate
+    ```
+
+=== "csh"
+
+    ```csh
+    source venv/bin/activate.csh
+    ```
+
+This will set the Python and pip paths to the virtual environment at `venv/bin/python` and `venv/bin/pip`, respectively.
+
+The virtual environment name should now display in the terminal, which means it is active. Any calls to pip will install to the virtual environment.
+
+!!! tip
+
+    To verify that Python and pip are pointing to the virtual environment, run
+
+    ```bash
+    which python pip
+    ```
+
+### Pip Installing Merlin
+
+Ensure your virtual environment is activated. Once it is, Merlin can be installed using pip:
+
+```bash
+pip3 install merlin
+```
+
+Verify that Merlin installed properly by ensuring the following command can run:
+
+```bash
+merlin --version
+```
+
+All set up? See the [Configuration](./configuration/index.md) page for instructions on how to configure Merlin or check out the [Tutorial](../tutorial/index.md)!
+
+More information on Merlin commands can be found at the [Command Line Interface](command_line.md) page.
+
+### Deactivating A Virtual Environment
+
+Virtualenvs can be exited via the following:
+
+```bash
+deactivate
+```
+
+## Installing With Spack
+
+The virtualenv method is not the only way to install Merlin in its own Python environment. The Spack method will build Python and all required modules for a specific set of configuration options. These options include the compiler version, system type, and Python version. Merlin will then be installed against this specific build, allowing for multiple Python versions on a single system without the need for a virtualenv. The py-merlin package builds with Python 3.6+.
+
+### Checkout Spack
+
+Get the latest version of Spack from GitHub. This is independent of Merlin, so make sure Merlin and Spack are in separate directories.
+
+```bash
+git clone https://github.com/spack/spack.git
+```
+
+Move into the Spack directory:
+
+```bash
+cd spack/
+```
+
+The Merlin Spack package is in the develop branch, so check out that branch:
+
+```bash
+git checkout develop
+```
+
+### Setup Spack
+
+Source the `setup-env.sh` or `setup-env.csh`. This will put Spack in your path and set up module access for later use. This should be done every time the modules are used.
+
+```bash
+source ./share/spack/setup-env.sh
+```
+
+Add compilers if you haven't already:
+
+```bash
+spack compiler add
+```
+
+To see the compilers:
+
+```bash
+spack compiler list
+```
+
+### Build Merlin
+
+Merlin can be built with the default compiler which, in general, is the newest gcc compiler, or the compiler can be specified. In the "Specified Compiler Install" example below, we're specifying gcc v7.1.0. Additionally, a different Python version can be specified as part of the package config. The "Python Version Install" example below shows how to build Merlin with Python v3.6.8:
+
+=== "Default Compiler Install"
+
+    ```bash
+    spack install py-merlin
+    ```
+
+=== "Specified Compiler Install"
+
+    ```bash
+    spack install py-merlin%gcc@7.1.0
+    ```
+
+=== "Python Version Install"
+
+    ```bash
+    spack install py-merlin^python@3.6.8
+    ```
+
+Building Merlin will take a *long* time, so be prepared to wait. It will build Python and all Python modules Merlin needs, including numpy.
+
+A tree of all of the packages and their dependencies needed to build the Merlin package can be shown by using the `spec` command:
+
+```bash
+spack spec py-merlin
+```
+
+### Activate Merlin
+
+To use Merlin you can activate the package.
+
+=== "Default Activation"
+
+    ```bash
+    spack activate py-merlin
+    ```
+
+=== "Specified Compiler Activation"
+
+    ```bash
+    spack activate py-merlin%gcc@7.1.0
+    ```
+
+=== "Python Version Activation"
+
+    ```bash
+    spack activate py-merlin^python@3.6.8
+    ```
+
+### Load Python
+
+The associated Python module can then be loaded into your environment. Note that this will only work if you have sourced the `setup-env.sh` or `setup-env.csh`.
+
+```bash
+module avail python
+```
+
+This will give you a list; the Spack version will have a long hash associated with its name.
+
+!!! example
+
+    ```bash
+    ------ /spack/share/spack/modules/linux-rhel7-x86_64 -------
+    python-3.6.8-gcc-8.1.0-4ilk3kn (L)
+    ```
+
+Now all that's left to do is select which Python to load:
+
+```bash
+module load python-3.6.8--
+```
+
+Using the example output above, we could choose to load Python like so:
+
+!!! example
+
+    ```bash
+    module load python-3.6.8-gcc-8.1.0-4ilk3kn
+    ```
+
+At this point the module-specific Python, Merlin, Maestro, and Celery will all be in your path.
+
+Congratulations, you're ready to use Merlin! See the [Configuration](./configuration/index.md) page for instructions on how to configure Merlin or check out the [Tutorial](../tutorial/index.md)!
+
+More information on Merlin commands can be found at the [Command Line Interface](command_line.md) page.
+
+## Developer Setup
+
+The developer setup can be done via pip or via make. This section will cover how to do both.
+
+Additionally, there is an alternative method to set up Merlin on supercomputers. See the [Spack](#installing-with-spack) section above for more details.
+
+### Make Setup
+
+Visit the [Merlin repository](https://github.com/LLNL/merlin/) on GitHub. [Create a fork of the repo](https://github.com/LLNL/merlin/fork) and [clone it](https://docs.github.com/en/get-started/quickstart/fork-a-repo#cloning-your-forked-repository) onto your system.
+
+Change directories into the Merlin repo:
+
+```bash
+cd merlin/
+```
+
+Install Merlin with the developer dependencies:
+
+```bash
+make install-dev
+```
+
+This will create a virtualenv, start it, and install Merlin and its dependencies for you.
+
+We can make sure it's installed by running:
+
+```bash
+merlin --version
+```
+
+If you don't see a version number, you may need to reactivate your virtualenv and try again.
+
+All set up? See the [Configuration](./configuration/index.md) page for instructions on how to configure Merlin or check out the [Tutorial](../tutorial/index.md)!
+
+More information on Merlin commands can be found at the [Command Line Interface](command_line.md) page.
+
+### Pip Setup
+
+[Create a virtual environment](#creating-a-virtual-environment) and [activate it](#activating-a-virtual-environment), then install with the additional developer dependencies:
+
+=== "GitHub"
+    ```bash
+    pip3 install -e "git+https://github.com/LLNL/merlin.git@develop#egg=merlin[dev]"
+    ```
+
+=== "PyPI"
+    ```bash
+    pip3 install "merlin[dev]"
+    ```
+
+All set up? See the [Configuration](./configuration/index.md) page for instructions on how to configure Merlin or check out the [Tutorial](../tutorial/index.md)!
+
+More information on Merlin commands can be found at the [Command Line Interface](command_line.md) page.
diff --git a/docs/user_guide/interpreting_output.md b/docs/user_guide/interpreting_output.md
new file mode 100644
index 000000000..a60e5e490
--- /dev/null
+++ b/docs/user_guide/interpreting_output.md
@@ -0,0 +1,1072 @@
+# Interpreting Output
+
+[Running Studies](./running_studies.md) with Merlin results in output workspaces being generated. This page is intended to help users understand the layout of the output workspace and the files contained within.
+
+## The Basics
+
+When a study is run using Merlin, an output workspace is created. This workspace is named after the `name` property in [the `description` block](./specification.md#the-description-block) of your spec file and will contain a timestamp of the run.
+
+!!! example
+
+    Say I have a spec file with a `description` block like so:
+
+    ```yaml
+    description:
+        name: name_of_my_study
+        description: This is an example showcasing naming conventions of output workspaces
+    ```
+
+    After running this study with [`merlin run`](./command_line.md#run-merlin-run), the output workspace would be named `name_of_my_study_`. Here, `` is a date of the form `YYYYMMDD-HHMMSS`. For example, it could be named `name_of_my_study_20240117-151836`.
+
+The output workspace will hold information about what was executed in your study and the output generated from it. This workspace is intended to help maintain workflow provenance and to provide users with an easy-to-navigate directory containing all of the results from their study.
+
+In this section we'll cover the basics of the output that Merlin generates from studies that are run. The two main topics for the basics are [the `merlin_info` subdirectory](#the-merlin_info-subdirectory) and [step workspaces](#step-workspaces).
+
+### The `merlin_info` Subdirectory
+
+The `merlin_info/` subdirectory is primarily used for workflow provenance. It's in this directory that you can track the exact specification file that was submitted and how variables were substituted.
+
+You'll find three different versions of your specification file in the `merlin_info/` subdirectory:
+
+1. `.orig.yaml`: An exact copy of the spec that was run with `merlin run`
+2. `.partial.yaml`: A copy of the original spec plus all of the default values for each block that Merlin will fill in for you if omitted. This spec will also show any changes to variables submitted via [Command Line Substitution](./running_studies.md#command-line-substitution)
+3. `.expanded.yaml`: The entire spec with all variables expanded and default values filled in
+
+A visual representation of a basic `merlin_info/` subdirectory is shown below.
+
+<figure markdown>
+  ![basic merlin_info subdirectory](../assets/images/interpreting_output/basic-merlin-info-workspace.png)
+  <figcaption>A Basic "merlin_info/" Subdirectory</figcaption>
+</figure>
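+
+Since the original and expanded specs sit side by side in `merlin_info/`, a plain `diff` is a quick way to see exactly which variables and defaults Merlin filled in for a run. Below is a minimal sketch; the study name `my_study` and the timestamp are hypothetical:
+
+```bash
+# Compare the spec as submitted against the fully expanded version
+diff studies/my_study_20240101-120000/merlin_info/my_study.orig.yaml \
+     studies/my_study_20240101-120000/merlin_info/my_study.expanded.yaml
+```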
+ +??? example "Example `merlin_info/` Contents" + + Say we have a script `say_hello.py` that says hello in a few different languages: + + ```python title="say_hello.py" + import argparse + + GREETINGS = { + "english": "Hi", + "spanish": "Hola", + "french": "Bonjour", + "german": "Hallo", + # Add more languages and greetings if you want + } + + def greet_in_language(language: str, name: str): + """ + Try to greet `name` in the language provided by `language`. + If the language is not supported, raise a KeyError. + + :param language: The language to say hello in + :param name: The name to say hello to + """ + # Make sure the name provided is supported by our script + try: + # Print the greeting with the provided name + print(f"{GREETINGS[language.lower()]}, {name}!") + except KeyError as exc: + raise KeyError(f"The language '{language.lower()}' is not yet supported by this script. Please choose one of the following languages instead: {GREETINGS.keys()}") from exc + + def main(): + parser = argparse.ArgumentParser(description="Greet someone in a specified language.") + parser.add_argument("--language", help="Specify the language for the greeting") + parser.add_argument("--name", help="Specify the name to greet") + + args = parser.parse_args() + + greet_in_language(args.language, args.name) + + if __name__ == "__main__": + main() + ``` + + We can then create a small spec file `output_workspace_demo.yaml` as a sibling file to `say_hello.py`. This spec file will look like so: + + ```yaml title="output_workspace_demo.yaml" + description: + name: $(STUDY_NAME) + description: An example showcasing the structure and contents of the output workspace + + env: + variables: + STUDY_NAME: output_workspace_demo + OUTPUT_PATH: ./studies + + study: + - name: say_hello + description: Run a script to say hello to a name + run: + cmd: | + # Here, "English" and "Kyle" are randomly selected parameter values + python3 $(SPECROOT)/say_hello.py --language English --name Kyle + ``` + + Running this study with: + + ```bash + merlin run --local output_workspace_demo.yaml + ``` + + ...will create an output workspace of the form: + + ```bash + studies/output_workspace_demo_/ + ├── merlin_info + │ ├── output_workspace_demo.expanded.yaml + │ ├── output_workspace_demo.orig.yaml + │ └── output_workspace_demo.partial.yaml + └── say_hello + ├── MERLIN_FINISHED + ├── say_hello.err + ├── say_hello.out + └── say_hello.sh + ``` + + We'll discuss the `say_hello/` directory in [Step Workspaces](#step-workspaces) so for now let's take a look at each of the yaml files in the `merlin_info/` directory. + + === "Orig" + + As we can see, the `.orig.yaml` file is an *exact* copy of the `output_workspace_demo.yaml` spec file. + + ```yaml title="output_workspace_demo.orig.yaml" + description: + name: $(STUDY_NAME) + description: An example showcasing the structure and contents of the output workspace + + env: + variables: + STUDY_NAME: output_workspace_demo + OUTPUT_PATH: ./studies + + study: + - name: say_hello + description: Run a script to say hello to a name + run: + cmd: | + # Here, "English" and "Kyle" are randomly selected parameter values + python3 $(SPECROOT)/say_hello.py --language English --name Kyle + ``` + + === "Partial" + + In the `.partial.yaml` file we'll see that there are blocks and properties that we didn't initially provide in our original spec file. These are default values that Merlin automatically substitutes into your workflow if you omit them. 
+ + ```yaml title="output_workspace_demo.partial.yaml" + description: + name: $(STUDY_NAME) + description: An example showcasing the structure and contents of the output workspace + + batch: + type: local + dry_run: false + shell: /bin/bash + + env: + variables: + STUDY_NAME: output_workspace_demo + OUTPUT_PATH: ./studies + + sources: + + labels: + + dependencies: + + study: + - name: say_hello + description: Run a script to say hello to a name + run: + cmd: | + # Here, "English" and "Kyle" are randomly selected parameter values + python3 $(SPECROOT)/say_hello.py --language English --name Kyle + task_queue: merlin + shell: /bin/bash + max_retries: 30 + + global.parameters: + + merlin: + resources: + task_server: celery + overlap: false + workers: + default_worker: + steps: [all] + nodes: + batch: + + samples: + + user: + ``` + + === "Expanded" + + In the `.expanded.yaml` file we'll see similar behavior to the `.partial.yaml` file where default values are substituted in. + + We'll also see the expansion of all variables. For example, we no longer have `$(STUDY_NAME)` as the name of the study; instead it's the substituted variable from the `env` block. Similarly, notice that we no longer see `$(SPECROOT)` in the command given to the `say_hello` step; instead we see the fully expanded path that `SPECROOT` represents (see [Reserved Variables](./variables.md#reserved-variables) for more information on this). + + ```yaml title="output_workspace_demo.expanded.yaml" hl_lines="2 27" + description: + name: output_workspace_demo + description: An example showcasing the structure and contents of the output workspace + + batch: + type: local + dry_run: false + shell: /bin/bash + + env: + variables: + STUDY_NAME: output_workspace_demo + OUTPUT_PATH: ./studies + + sources: + + labels: + + dependencies: + + study: + - name: say_hello + description: Run a script to say hello to a name + run: + cmd: | + # Here, "English" and "Kyle" are randomly selected parameter values + python3 /path/to/say_hello.py --language English --name Kyle + task_queue: merlin + shell: /bin/bash + max_retries: 30 + + global.parameters: + + merlin: + resources: + task_server: celery + overlap: false + workers: + default_worker: + steps: [all] + nodes: + batch: + + samples: + + user: + ``` + +In addition to the files mentioned above, it can be useful to copy the scripts that will be used in your workflow to the `merlin_info/` subdirectory. This helps ensure that everything used for the execution of a workflow remains in one workspace. + +??? example "Copying Scripts to `merlin_info/`" + + Let's use the same `say_hello` example as in "Example `merlin_info/` Contents". 
However, this time we'll add a step to copy scripts to the `merlin_info/` directory: + + ```yaml title="output_workspace_demo.yaml" hl_lines="9 12-17 23 24" + description: + name: $(STUDY_NAME) + description: An example showcasing the structure and contents of the output workspace + + env: + variables: + STUDY_NAME: output_workspace_demo + OUTPUT_PATH: ./studies + SCRIPTS: $(MERLIN_INFO)/scripts # (1) + + study: + - name: copy_scripts + description: Copy the scripts we need for this workflow to the "merlin_info" directory + run: + cmd: | # (2) + mkdir $(SCRIPTS) + cp $(SPECROOT)/say_hello.py $(SCRIPTS) + + - name: say_hello + description: Run a script to say hello to a name + run: + cmd: | # (3) + # Here, "English" and "Kyle" are randomly selected parameter values + python3 $(SCRIPTS)/say_hello.py --language English --name Kyle + depends: [copy_scripts] # (4) + ``` + + 1. Create a variable to point to where we'll copy our scripts. The `$(MERLIN_INFO)` variable is a [Reserved Variable](./variables.md#reserved-variables) that points to the `merlin_info/` subdirectory for the current run of the workflow. + 2. In this `copy_scripts` step we're creating the `merlin_info/scripts/` directory if it doesn't already exist and then copying our `say_hello.py` script there. + 3. In this command we modify the path to `say_hello.py` from `$(SPECROOT)/say_hello.py` to `$(SCRIPTS)/say_hello.py` for better workflow provenance. + 4. Since we need to wait for the `say_hello.py` script to exist in the `$(SCRIPTS)` path, this step now depends on the `copy_scripts` step. + + Running this study with: + + ```bash + merlin run --local output_workspace_demo.yaml + ``` + + ...provides us with an output workspace of the form: + + ```bash + studies/output_workspace_demo_20240118-111426/ + ├── copy_scripts + │ ├── copy_scripts.err + │ ├── copy_scripts.out + │ ├── copy_scripts.sh + │ └── MERLIN_FINISHED + ├── merlin_info + │ ├── output_workspace_demo.expanded.yaml + │ ├── output_workspace_demo.orig.yaml + │ ├── output_workspace_demo.partial.yaml + │ └── scripts + │ └── say_hello.py + └── say_hello + ├── MERLIN_FINISHED + ├── say_hello.err + ├── say_hello.out + └── say_hello.sh + ``` + + What we should take away here is the addition of the `scripts/say_hello.py` folder in the `merlin_info/` directory. With our scripts here we now have everything that was ran in this study in one single output workspace. + + We'll discuss the contents of `copy_scripts` and `say_hello` in the [Step Workspaces](#step-workspaces) section. + +The `merlin_info/` subdirectory is also used for tracking the generation of [Samples](./specification.md#samples) in your workflow (if any are used). The files created for sample generation will be discussed further in the [Output With Samples](#output-with-samples) section. + +### Step Workspaces + +In addition to [The `merlin_info/` Subdirectory](#the-merlin_info-subdirectory), when a study is ran a step workspace is created for each step in your workflow. These step workspaces will use the `name` properties of your steps as the names of the workspaces. + +Step workspaces will *always* contain a `.sh` file containing the command to run for the step. This file will be generated based on the `run` property of [the `study` block](./specification.md#the-study-block). + +In addition to the `.sh` file, three more files will appear for runs that aren't [Dry Runs](./running_studies.md#dry-runs): + +1. `.out`: contains the stdout generated from executing the `.sh` file. +2. 
`.err`: contains the stderr generated from executing the `.sh` file. This is one of the most useful places to look for debugging failures in your workflow. +3. `MERLIN_FINISHED`: an empty file to help Merlin mark that the step is complete. + + - If there were any errors while executing the `.sh` file, this file will not appear in the step's workspace. + - The presence of this file in a step workspace directory is useful when it comes to [Restarting Workflows](./running_studies.md#restarting-workflows). + +A visual representation of a basic step workspace is shown below. + +
+<figure markdown>
+  ![basic step workspace](../assets/images/interpreting_output/basic-step-workspace.png)
+  <figcaption>A Basic Step Workspace Hierarchy</figcaption>
+</figure>
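+
+Since `MERLIN_FINISHED` is only written when a step completes without failures, a short shell loop can flag the step workspaces that still need attention. Below is a minimal sketch, assuming a workspace laid out like the ones shown on this page (the study name and timestamp are hypothetical):
+
+```bash
+# List step workspaces that are missing a MERLIN_FINISHED file
+find studies/my_study_20240101-120000 -name "*.sh" -not -path "*/merlin_info/*" \
+    | xargs -n 1 dirname | sort -u \
+    | while read -r step_dir; do
+        [ -f "${step_dir}/MERLIN_FINISHED" ] || echo "unfinished: ${step_dir}"
+    done
+```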
+ +??? example "Exploring an Example Step Workspace" + + Let's use the `say_hello` spec from the "Copying Scripts to `merlin_info/`" example in [The `merlin_info/` subdirectory](#the-merlin_info-subdirectory) section to demonstrate what's contained in a step workspace. Here's the spec: + + ```yaml title="output_workspace_demo.yaml" + description: + name: $(STUDY_NAME) + description: An example showcasing the structure and contents of the output workspace + + env: + variables: + STUDY_NAME: output_workspace_demo + OUTPUT_PATH: ./studies + SCRIPTS: $(MERLIN_INFO)/scripts + + study: + - name: copy_scripts + description: Copy the scripts we need for this workflow to the "merlin_info" directory + run: + cmd: | + mkdir $(SCRIPTS) + cp $(SPECROOT)/say_hello.py $(SCRIPTS) + + - name: say_hello + description: Run a script to say hello to a name + run: + cmd: | + # Here, "English" and "Kyle" are randomly selected parameter values + python3 $(SCRIPTS)/say_hello.py --language English --name Kyle + depends: [copy_scripts] + ``` + + Running this study with: + + ```bash + merlin run --local output_workspace_demo.yaml + ``` + + ...provides us with an output workspace of the form: + + ```bash + studies/output_workspace_demo_20240118-111426/ + ├── copy_scripts + │ ├── copy_scripts.err + │ ├── copy_scripts.out + │ ├── copy_scripts.sh + │ └── MERLIN_FINISHED + ├── merlin_info + │ ├── output_workspace_demo.expanded.yaml + │ ├── output_workspace_demo.orig.yaml + │ ├── output_workspace_demo.partial.yaml + │ └── scripts + │ └── say_hello.py + └── say_hello + ├── MERLIN_FINISHED + ├── say_hello.err + ├── say_hello.out + └── say_hello.sh + ``` + + Here we'll focus on the `say_hello` step workspace. The `copy_scripts` workspace will be very similar but should have nothing in the `.out` or `.err` files as nothing was printed to stdout or stderr in that step. + + === "say_hello.sh" + + If we look at the `say_hello.sh` file, it should look similar to this: + + ```bash title="say_hello.sh" + #!/bin/bash + + # Here, "English" and "Kyle" are randomly selected parameter values + python3 /path/to/studies/output_workspace_demo_/merlin_info/scripts/say_hello.py --language English --name Kyle + ``` + + The first line will specify the shell that we're running in. After that, the script is a direct copy from the `cmd` of the `say_hello` step in our spec file. + + This is exactly what is executed when this step is ran. + + === "say_hello.out" + + Our `say_hello.py` script will print a message to stdout. Merlin will capture this output and store it here in the `say_hello.out` file. We can see this by viewing the contents of this file: + + ```bash title="say_hello.out" + Hi, Kyle! + ``` + + === "say_hello.err" + + For this example, the `say_hello.err` file should be empty since nothing should've been printed to stderr. 
However, if we were to re-run this study with a language that our script doesn't support, like Swedish, then the `say_hello.err` file would show us the error that occurred: + + ```bash title="say_hello.err" + Traceback (most recent call last): + File "/path/to/studies/output_workspace_demo_/merlin_info/scripts/say_hello.py", line 22, in greet_in_language + print(f"{GREETINGS[language.lower()]}, {name}!") + KeyError: 'swedish' + + The above exception was the direct cause of the following exception: + + Traceback (most recent call last): + File "/path/to/studies/output_workspace_demo_/merlin_info/scripts/say_hello.py", line 36, in + main() + File "/path/to/studies/output_workspace_demo_/merlin_info/scripts/say_hello.py", line 33, in main + greet_in_language(args.language, args.name) + File "/path/to/studies/output_workspace_demo_/merlin_info/scripts/say_hello.py", line 24, in greet_in_language + raise KeyError(f"The language '{language.lower()}' is not yet supported by this script. Please choose one of the following languages instead: {GREETINGS.keys()}") from exc + KeyError: "The language 'swedish' is not yet supported by this script. Please choose one of the following languages instead: dict_keys(['english', 'spanish', 'french', 'german'])" + ``` + + As we can see, the full traceback is displayed in the `.err` file here. This can be very useful when debugging your workflow. + + !!! note + + If your study raised a KeyError like this, you likely will *not* have a `MERLIN_FINISHED` file in your step output workspace since your step will have returned with a `MERLIN_SOFT_FAIL` [Return Code](./variables.md#step-return-variables). + + === "MERLIN_FINISHED" + + The `MERLIN_FINISHED` file will be present if the step finished executing with no failures. This file will always be empty and just serves as a way for Merlin to know that a step has completed. + +For best practice, if your step needs to output any files you should utilize the `$(WORKSPACE)` [Reserved Variable](./variables.md#reserved-variables) in order to have it write the output to the step's output workspace. + +??? example "Utilizing `$(WORKSPACE)` for File Management" + + Let's say we modify the `say_hello` script from the previous examples in this module to write the output to a file instead of printing it to stdout: + + ```python title="say_hello.py" hl_lines="11 22-24 32" + import argparse + + GREETINGS = { + "english": "Hi", + "spanish": "Hola", + "french": "Bonjour", + "german": "Hallo", + # Add more languages and greetings if you want + } + + def greet_in_language(language: str, name: str, outfile: str): + """ + Greet `name` in the language provided by `language` and write the output + to an outfile. If the language is not supported, raise a KeyError. + + :param language: The language to say hello in + :param name: The name to say hello to + :param outfile: The name of the file to write the greeting to + """ + # Make sure the name provided is supported by our script + try: + # Write the greeting with the provided name to an output file + with open(outfile, "w") as fp: + fp.write(f"{GREETINGS[language.lower()]}, {name}!") + except KeyError as exc: + raise KeyError(f"The language '{language.lower()}' is not yet supported by this script. 
Please choose one of the following languages instead: {GREETINGS.keys()}") from exc + + def main(): + parser = argparse.ArgumentParser(description="Greet someone in a specified language.") + parser.add_argument("--language", help="Specify the language for the greeting") + parser.add_argument("--name", help="Specify the name to greet") + parser.add_argument("--outfile", help="The file to write the hello sentence to") + + args = parser.parse_args() + + greet_in_language(args.language, args.name, args.outfile) + + if __name__ == "__main__": + main() + ``` + + Now let's modify our spec file to accommodate this change and write the output to the step's workspace using the `$(WORKSPACE)` key word: + + ```yaml title="output_workspace_demo.yaml" hl_lines="24" + description: + name: $(STUDY_NAME) + description: An example showcasing the structure and contents of the output workspace + + env: + variables: + STUDY_NAME: output_workspace_demo + OUTPUT_PATH: ./studies + SCRIPTS: $(MERLIN_INFO)/scripts + + study: + - name: copy_scripts + description: Copy the scripts we need for this workflow to the "merlin_info" directory + run: + cmd: | + mkdir $(SCRIPTS) + cp $(SPECROOT)/say_hello.py $(SCRIPTS) + + - name: say_hello + description: Run a script to say hello to a name + run: + cmd: | + # Here, "English" and "Kyle" are randomly selected parameter values + python3 $(SCRIPTS)/say_hello.py --language English --name Kyle --outfile $(WORKSPACE)/hello_output.txt + depends: [copy_scripts] + ``` + + If we run this study and look at the `say_hello` workspace, we'll see it now has an additional file `hello_output.txt`: + + ```bash + studies/output_workspace_demo_/say_hello/ + ├── hello_output.txt + ├── MERLIN_FINISHED + ├── say_hello.err + ├── say_hello.out + └── say_hello.sh + ``` + + This new file will contain the same sentence "Hi, Kyle!" that was previously output in `say_hello.out`. + +## Output With Parameters + +If you introduce parameter sets into your workflow then any steps that use parameters will have a slightly different output workspace format than steps without parameters. For each parameter set a subdirectory in the step's workspace is created. + +Parameter set subdirectories will use the `label` property from [the `global.parameters` block](./specification.md#the-globalparameters-block) to assist with the naming of the subdirectory. For each parameter used, the `label` value of each will be concatenated with a `.` symbol. This concatenated value will be appended to the step's name. + +In each parameter set subdirectory there will still be the typical `.sh`, `.out`, `.err`, and `MERLIN_FINISHED` files. + +??? 
example "Example Parameter Set Subdirectory Structure" + + Let's modify the `say_hello` example from the previous section to introduce parameters: + + ```yaml title="output_workspace_demo.yaml" hl_lines="11-17 31" + description: + name: $(STUDY_NAME) + description: An example showcasing the structure and contents of the output workspace + + env: + variables: + STUDY_NAME: output_workspace_demo + OUTPUT_PATH: ./studies + SCRIPTS: $(MERLIN_INFO)/scripts + + global.parameters: + LANGUAGE: + values: ["English", "Spanish", "French", "German"] + label: LANGUAGE.%% + NAME: + values: ["Kyle", "Cristiano", "Elise", "Ada"] + label: NAME.%% + + study: + - name: copy_scripts + description: Copy the scripts we need for this workflow to the "merlin_info" directory + run: + cmd: | + mkdir $(SCRIPTS) + cp $(SPECROOT)/say_hello.py $(SCRIPTS) + + - name: say_hello + description: Run a script to say hello to a name + run: + cmd: | + python3 $(SCRIPTS)/say_hello.py --language $(LANGUAGE) --name $(NAME) --outfile $(WORKSPACE)/hello_output.txt + depends: [copy_scripts] + ``` + + Here, the `label` value for each parameter will be of the form `PARAMETER_NAME.parameter_value`. So, since our `say_hello` step is using two parameters `LANGUAGE` and `NAME`, the names of our parameter set subdirectories will be `say_hello_LANGUAGE..NAME.`. + + We have four parameter sets here and therefore we will get four parameter set subdirectories: + + | Parameter Set | Subdirectory Label | + | ------------- | ------------------ | + | `{"LANGUAGE": "English", "NAME": "Kyle"}` | `say_hello_LANGUAGE.English.NAME.Kyle` | + | `{"LANGUAGE": "Spanish", "NAME": "Cristiano"}` | `say_hello_LANGUAGE.Spanish.NAME.Cristiano` | + | `{"LANGUAGE": "French", "NAME": "Elise"}` | `say_hello_LANGUAGE.French.NAME.Elise` | + | `{"LANGUAGE": "German", "NAME": "Ada"}` | `say_hello_LANGUAGE.German.NAME.Ada` | + + After running this study, your `say_hello` step workspace will look like so: + +
+    <figure markdown>
+      ![example workspace with parameter sets](../assets/images/interpreting_output/workspace-with-params.png)
+      <figcaption>An Example Workspace With Parameter Sets</figcaption>
+    </figure>
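+
+    With one subdirectory per parameter set, results can be gathered with a simple glob over the step workspace. Below is a minimal sketch, assuming the parameterized study above (the timestamp is hypothetical):
+
+    ```bash
+    # Print the greeting written by each parameter set subdirectory
+    for param_dir in studies/output_workspace_demo_20240118-111426/say_hello/*/; do
+        echo "--- $(basename "${param_dir}") ---"
+        cat "${param_dir}/hello_output.txt"
+    done
+    ```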
+
+## Output With Samples
+
+Adding [Samples](./specification.md#samples) to your workflow will have two side effects on the output workspace:
+
+1. Files will be added to [the `merlin_info/` subdirectory](#the-merlin_info-subdirectory)
+2. A sample hierarchy will be created in the step workspace for every step that uses samples
+
+This section will detail both additions to the output workspace.
+
+### Additions to `merlin_info`
+
+The `merlin_info/` subdirectory is helpful for tracking the generation of samples. Sample generation will produce three additional files in this directory:
+
+1. `cmd.sh`: The file that contains the command that was run to generate samples
+2. `cmd.out`: The stdout generated from executing `cmd.sh`
+3. `cmd.err`: The stderr generated from executing `cmd.sh`. This is one of the most useful places to look for debugging issues with sample generation.
+
+A visual representation of the `merlin_info/` subdirectory with sample generation is shown below.
+
+<figure markdown>
+  ![merlin info subdirectory with samples](../assets/images/interpreting_output/merlin-info-with-samples.png)
+  <figcaption>The "merlin_info/" Subdirectory With Sample Generation Files</figcaption>
+</figure>
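+
+Because `cmd.sh` is an ordinary shell script, a failed sample generation can be debugged by reading `cmd.err` and then re-running the command by hand. Below is a minimal sketch (the study name and timestamp are hypothetical):
+
+```bash
+# See what went wrong during sample generation...
+cat studies/my_study_20240101-120000/merlin_info/cmd.err
+
+# ...then re-run the generation command directly while iterating on a fix
+bash studies/my_study_20240101-120000/merlin_info/cmd.sh
+```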
+ +??? example "Exploring the `cmd` Files From Sample Generation" + + To use samples for this example, we first need a script to generate these samples. We'll create a `make_samples.py` script and store it in the same location as the spec file and the `say_hello.py` file. + + ```python title="make_samples.py" + import argparse + + import names + import numpy as np + + # argument parsing + parser = argparse.ArgumentParser(description="Make some samples (names of people).") + parser.add_argument("--number", type=int, action="store", help="the number of samples you want to make") + parser.add_argument("--filepath", type=str, help="output file") + args = parser.parse_args() + + # sample making + all_names = np.loadtxt(names.FILES["first:female"], dtype=str, usecols=0) + selected_names = np.random.choice(all_names, size=args.number) + + result = "" + name_list = list(selected_names) + result = "\n".join(name_list) + + with open(args.filepath, "w") as f: + f.write(result) + ``` + + Since this script uses some third party libraries ([`names`](https://pypi.org/project/names/) and [`numpy`](https://numpy.org/)), you'll need to install them to your current environment in order to run this example. If you're using a [virtual environment](./installation.md#installing-with-virtual-environments--pip), these can be installed with: + + ```bash + pip install names numpy + ``` + + Expanding on the previous examples, let's add in sample generation to our spec file. We'll hold off on actually *using* these samples until the next example: + + ```yaml title="output_workspace_demo.yaml" hl_lines="10 26 35-41" + description: + name: $(STUDY_NAME) + description: An example showcasing the structure and contents of the output workspace + + env: + variables: + STUDY_NAME: output_workspace_demo + OUTPUT_PATH: ./studies + SCRIPTS: $(MERLIN_INFO)/scripts + N_SAMPLES: 3 + + global.parameters: + LANGUAGE: + values: ["English", "Spanish", "French", "German"] + label: LANGUAGE.%% + NAME: + values: ["Kyle", "Cristiano", "Elise", "Ada"] + label: NAME.%% + + study: + - name: copy_scripts + description: Copy the scripts we need for this workflow to the "merlin_info" directory + run: + cmd: | # (1) + mkdir $(SCRIPTS) + cp $(SPECROOT)/say_hello.py $(SPECROOT)/make_samples.py $(SCRIPTS) + + - name: say_hello + description: Run a script to say hello to a name + run: + cmd: | + python3 $(SCRIPTS)/say_hello.py --language $(LANGUAGE) --name $(NAME) --outfile $(WORKSPACE)/hello_output.txt + depends: [copy_scripts] + + merlin: + samples: + generate: + cmd: | # (2) + python3 $(SPECROOT)/make_samples.py --filepath=$(MERLIN_INFO)/samples.csv --number=$(N_SAMPLES) + file: $(MERLIN_INFO)/samples.csv + column_labels: [NAME_FROM_SAMPLE] + ``` + + 1. We'll copy the `make_samples.py` file to the `merlin_info/scripts/` folder for provenance + 2. Even though we're copying the `make_samples.py` script to `$(SCRIPTS)` in the `copy_scripts` step, we still have to use the initial location of the `make_samples.py` file which in this case is `$(SPECROOT)`. This is required since the sample generation will happen before any steps are executed. 
+ + Running this example will produce the following `merlin_info/` subdirectory: + + ```bash + studies/output_workspace_demo_/merlin_info/ + ├── cmd.err + ├── cmd.out + ├── cmd.sh + ├── output_workspace_demo.expanded.yaml + ├── output_workspace_demo.orig.yaml + ├── output_workspace_demo.partial.yaml + ├── samples.csv + └── scripts + ├── make_samples.py + └── say_hello.py + ``` + + There are now a total of 5 new files compared to when we ran this without sample generation. Let's take a look at each: + + === "cmd.sh" + + The `cmd.sh` file will contain the exact command used to generate the samples that we saved to `samples.csv`: + + ```bash title="cmd.sh" + python3 /path/to/make_samples.py --filepath=/path/to/studies/output_workspace_demo_/merlin_info/samples.csv --number=3 + ``` + + === "cmd.out" + + The `cmd.out` file will contain any output sent to stdout from the `cmd.sh` script. For this example, there is no print statements in `make_samples.py` so this file will be empty. + + === "cmd.err" + + The `cmd.err` file will contain any output sent to stderr from the `cmd.sh` script. For this example, there aren't any errors raised or output sent to stderr so this file will be empty. + + === "samples.csv" + + The `samples.csv` file will contain the samples generated by executing the `cmd.sh` script that runs the `make_samples.py` file. For example, this file may look like: + + ```csv title="samples.csv" + JAZMIN + BERNADETTE + ODESSA + ``` + + === "make_samples.py" + + The `make_samples.py` file has been copied to the `scripts` folder for better workflow provenance. This will help us track exactly what was executed in this workflow. + +### The Sample Hierarchy + +After samples are generated (as was shown in the example of the last section) they can be used within steps of your workflow just like [parameters](./specification.md#the-globalparameters-block) or [variables](./variables.md). Similar to how parameters modify a step's output workspace, the use of samples will also affect the output workspace, albeit in a slightly different manner. When samples are used a sample hierarchy is created in the step workspace. + +The sample hierarchy structure will be based on the number of samples used in your workflow. This hierarchy will be numerically based (e.g. `00/`, `01/`, `02/`, etc.). + +??? example "Sample Hierarchy Demonstration" + + We'll add a new step to the spec file from "Exploring the `cmd` Files From Sample Generation" since it already generates samples for us. 
In this new step we'll utilize the `NAME_FROM_SAMPLE` sample:
+
+    ```yaml title="output_workspace_demo.yaml" hl_lines="28-31"
+    description:
+        name: $(STUDY_NAME)
+        description: An example showcasing the structure and contents of the output workspace
+
+    env:
+        variables:
+            STUDY_NAME: output_workspace_demo
+            OUTPUT_PATH: ./studies
+            SCRIPTS: $(MERLIN_INFO)/scripts
+            N_SAMPLES: 3
+
+    global.parameters:
+        LANGUAGE:
+            values: ["English", "Spanish", "French", "German"]
+            label: LANGUAGE.%%
+        NAME:
+            values: ["Kyle", "Cristiano", "Elise", "Ada"]
+            label: NAME.%%
+
+    study:
+        - name: copy_scripts
+          description: Copy the scripts we need for this workflow to the "merlin_info" directory
+          run:
+              cmd: |
+                  mkdir $(SCRIPTS)
+                  cp $(SPECROOT)/say_hello.py $(SPECROOT)/make_samples.py $(SCRIPTS)
+
+        - name: echo_samples
+          description: Echo out the sample names that we generated
+          run:
+              cmd: echo $(NAME_FROM_SAMPLE)
+
+        - name: say_hello
+          description: Run a script to say hello to a name
+          run:
+              cmd: |
+                  python3 $(SCRIPTS)/say_hello.py --language $(LANGUAGE) --name $(NAME) --outfile $(WORKSPACE)/hello_output.txt
+              depends: [copy_scripts]
+
+    merlin:
+        samples:
+            generate:
+                cmd: |
+                    python3 $(SPECROOT)/make_samples.py --filepath=$(MERLIN_INFO)/samples.csv --number=$(N_SAMPLES)
+            file: $(MERLIN_INFO)/samples.csv
+            column_labels: [NAME_FROM_SAMPLE]
+    ```
+
+    Since `N_SAMPLES` is 3, we're only generating 3 samples here. Therefore, after running this workflow the `echo_samples` step will have a single-level sample hierarchy with three entries: `00/`, `01/`, and `02/`.
+
+    A visual representation of this hierarchy is shown below.
+
+    <figure markdown>
+      ![example workspace with samples](../assets/images/interpreting_output/workspace-with-samples.png)
+      <figcaption>An Example Sample Hierarchy</figcaption>
+    </figure>
+ + In each sample directory, there are still the typical `.sh`, `.out`, `.err`, and `MERLIN_FINISHED` files. For this example, the only differences between these files will be the name of the sample that's being echoed. + +By default, Merlin allows for a maximum of 25 directories in each level of the sample hierarchy. This can be modified with the `level_max_dirs` key in the [`samples`](./specification.md#samples) property of the `merlin` block. + +??? example "Modifying The Hierarchy Structure" + + Let's use the same spec as the previous example, however, we'll up the number of samples to 50: + + ```yaml title="output_workspace_demo.yaml" hl_lines="10" + description: + name: $(STUDY_NAME) + description: An example showcasing the structure and contents of the output workspace + + env: + variables: + STUDY_NAME: output_workspace_demo + OUTPUT_PATH: ./studies + SCRIPTS: $(MERLIN_INFO)/scripts + N_SAMPLES: 50 + + global.parameters: + LANGUAGE: + values: ["English", "Spanish", "French", "German"] + label: LANGUAGE.%% + NAME: + values: ["Kyle", "Cristiano", "Elise", "Ada"] + label: NAME.%% + + study: + - name: copy_scripts + description: Copy the scripts we need for this workflow to the "merlin_info" directory + run: + cmd: | + mkdir $(SCRIPTS) + cp $(SPECROOT)/say_hello.py $(SPECROOT)/make_samples.py $(SCRIPTS) + + - name: echo_samples + description: Echo out the sample names that we generated + run: + cmd: echo $(NAME_FROM_SAMPLE) + + - name: say_hello + description: Run a script to say hello to a name + run: + cmd: | + python3 $(SCRIPTS)/say_hello.py --language $(LANGUAGE) --name $(NAME) --outfile $(WORKSPACE)/hello_output.txt + depends: [copy_scripts] + + merlin: + samples: + generate: + cmd: | + python3 $(SPECROOT)/make_samples.py --filepath=$(MERLIN_INFO)/samples.csv --number=$(N_SAMPLES) + file: $(MERLIN_INFO)/samples.csv + column_labels: [NAME_FROM_SAMPLE] + ``` + + If we run this, we'll see two levels to our sample hierarchy. The top level will have two directories `00/` and `01/` and the second level of each will have `00/`-`24/` for a total of 50 leaf directories (one for each sample) in the sample hierarchy tree. A visual representation of this structure is shown below. + +
+    <figure markdown>
+      ![two-level sample hierarchy](../assets/images/interpreting_output/two-level-sample-hierarchy.png)
+      <figcaption>A Two Level Sample Hierarchy</figcaption>
+    </figure>
+
+    This behavior happens because of Merlin's default maximum of 25 directories per level. Let's modify this maximum so that we can change the structure of our sample hierarchy. In the `merlin` block of our spec file, we'll add the `level_max_dirs` property:
+
+    ```yaml title="output_workspace_demo.yaml" hl_lines="12"
+    .
+    .
+    .
+
+    merlin:
+        samples:
+            generate:
+                cmd: |
+                    python3 $(SPECROOT)/make_samples.py --filepath=$(MERLIN_INFO)/samples.csv --number=$(N_SAMPLES)
+            file: $(MERLIN_INFO)/samples.csv
+            column_labels: [NAME_FROM_SAMPLE]
+            level_max_dirs: 10
+    ```
+
+    Now there will be five directories `00/`-`04/` at the top level of the hierarchy and ten subdirectories `00/`-`09/` in each, for a total of 50 leaf directories (since we still have 50 samples in our workflow). A visual representation of this is shown below.
+
+    <figure markdown>
+      ![modified hierarchy structure](../assets/images/interpreting_output/modified-hierarchy-structure.png)
+      <figcaption>Example of a Modified Hierarchy Structure</figcaption>
+    </figure>
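+
+    One way to sanity-check a reshaped hierarchy is to count its leaf directories: no matter what `level_max_dirs` is set to, the number of leaves should always equal `N_SAMPLES`. Below is a minimal sketch, assuming the 50-sample study above ran to completion (the timestamp is hypothetical):
+
+    ```bash
+    # Each sample leaf directory gets its own MERLIN_FINISHED file, so this should print 50
+    find studies/output_workspace_demo_20240118-111426/echo_samples -name MERLIN_FINISHED | wc -l
+    ```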
+
+## Output With Parameters and Samples
+
+One of the most powerful functionalities of Merlin is the ability to combine both parameters and samples in the same step. When this is done, each sample is run for every parameter set. As a result, the number of output directories increases compared to using *just* parameters or *just* samples.
+
+The structure of a step workspace with both parameters and samples will have parameter-labeled workspaces at the top level with a sample hierarchy nested below. A visual representation of this structure can be seen below.
+
+<figure markdown>
+  ![workspace with parameters and samples](../assets/images/interpreting_output/workspace-with-params-and-samples.png)
+  <figcaption>Layout of a Workspace With Parameters and Samples</figcaption>
+</figure>
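+
+In other words, a step that uses both ends up with one leaf directory per parameter set and sample pair, so the number of `MERLIN_FINISHED` files should equal the number of parameter sets multiplied by the number of samples. Below is a minimal sketch of that check (the example that follows uses 4 parameter sets and 2 samples, so it should print 8; the timestamp is hypothetical):
+
+```bash
+# Count the (parameter set, sample) combinations that ran to completion
+find studies/output_workspace_demo_20240118-111426/say_hello -name MERLIN_FINISHED | wc -l
+```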
+ +??? example "Example Step Workspace With Parameters & Samples" + + Let's modify the spec file from the "Modifying The Hierarchy Structure" example to have the `say_hello` step use both parameters and samples. To accomplish we'll need to: + + 1. Remove the `NAME` parameter as we'll use names from our samples instead + 2. Change the `column_label` of our sample to be `NAME` instead of `NAME_FROM_SAMPLE` + 3. Modify the `echo_samples` step to use `NAME` instead of `NAME_FROM_SAMPLE` + 4. Modify the `say_hello` step to add `$(MERLIN_SAMPLE_PATH)` to the path of the output file + + The modified spec file is shown below: + + ```yaml title="output_workspace_demo.yaml" hl_lines="10 16 29 35 44" + description: + name: $(STUDY_NAME) + description: An example showcasing the structure and contents of the output workspace + + env: + variables: + STUDY_NAME: output_workspace_demo + OUTPUT_PATH: ./studies + SCRIPTS: $(MERLIN_INFO)/scripts + N_SAMPLES: 2 # (1) + + global.parameters: + LANGUAGE: + values: ["English", "Spanish", "French", "German"] + label: LANGUAGE.%% + # (2) + + study: + - name: copy_scripts + description: Copy the scripts we need for this workflow to the "merlin_info" directory + run: + cmd: | + mkdir $(SCRIPTS) + cp $(SPECROOT)/say_hello.py $(SPECROOT)/make_samples.py $(SCRIPTS) + + - name: echo_samples + description: Echo out the sample names that we generated + run: + cmd: echo $(NAME) # (3) + + - name: say_hello + description: Run a script to say hello to a name + run: + cmd: | # (4) + python3 $(SCRIPTS)/say_hello.py --language $(LANGUAGE) --name $(NAME) --outfile $(WORKSPACE)/$(MERLIN_SAMPLE_PATH)/hello_output.txt + depends: [copy_scripts] + + merlin: + samples: + generate: + cmd: | + python3 $(SPECROOT)/make_samples.py --filepath=$(MERLIN_INFO)/samples.csv --number=$(N_SAMPLES) + file: $(MERLIN_INFO)/samples.csv + column_labels: [NAME] # (5) + ``` + + 1. We're dropping the number of samples from 50 back down to 2 for simplicity + 2. We're removing the `NAME` parameter as we'll be replacing it with the sample we're generating + 3. Change the variable reference from `NAME_FROM_SAMPLE` to `NAME` since we changed the `column_label` of our sample + 4. The `$(NAME)` reference here will now reference the sample since we removed the `NAME` parameter and changed the sample column label from `NAME_FROM_SAMPLE` to `NAME`. Additionally, the outfile path will now have the sample path embedded in it. + 5. 
We'll change the name from `NAME_FROM_SAMPLE` to `NAME` for readability + + Running this study will result in the following step workspace for `say_hello`: + + ```bash + studies/output_workspace_demo_/say_hello/ + ├── LANGUAGE.English + │ ├── 00 + │ │ ├── hello_output.txt + │ │ ├── MERLIN_FINISHED + │ │ ├── say_hello_LANGUAGE.English.err + │ │ ├── say_hello_LANGUAGE.English.out + │ │ └── say_hello_LANGUAGE.English.sh + │ └── 01 + │ ├── hello_output.txt + │ ├── MERLIN_FINISHED + │ ├── say_hello_LANGUAGE.English.err + │ ├── say_hello_LANGUAGE.English.out + │ └── say_hello_LANGUAGE.English.sh + ├── LANGUAGE.French + │ ├── 00 + │ │ ├── hello_output.txt + │ │ ├── MERLIN_FINISHED + │ │ ├── say_hello_LANGUAGE.French.err + │ │ ├── say_hello_LANGUAGE.French.out + │ │ └── say_hello_LANGUAGE.French.sh + │ └── 01 + │ ├── hello_output.txt + │ ├── MERLIN_FINISHED + │ ├── say_hello_LANGUAGE.French.err + │ ├── say_hello_LANGUAGE.French.out + │ └── say_hello_LANGUAGE.French.sh + ├── LANGUAGE.German + │ ├── 00 + │ │ ├── hello_output.txt + │ │ ├── MERLIN_FINISHED + │ │ ├── say_hello_LANGUAGE.German.err + │ │ ├── say_hello_LANGUAGE.German.out + │ │ └── say_hello_LANGUAGE.German.sh + │ └── 01 + │ ├── hello_output.txt + │ ├── MERLIN_FINISHED + │ ├── say_hello_LANGUAGE.German.err + │ ├── say_hello_LANGUAGE.German.out + │ └── say_hello_LANGUAGE.German.sh + └── LANGUAGE.Spanish + ├── 00 + │ ├── hello_output.txt + │ ├── MERLIN_FINISHED + │ ├── say_hello_LANGUAGE.Spanish.err + │ ├── say_hello_LANGUAGE.Spanish.out + │ └── say_hello_LANGUAGE.Spanish.sh + └── 01 + ├── hello_output.txt + ├── MERLIN_FINISHED + ├── say_hello_LANGUAGE.Spanish.err + ├── say_hello_LANGUAGE.Spanish.out + └── say_hello_LANGUAGE.Spanish.sh + ``` + + From this we see that both names that we generated (our samples in this example) were ran for all four languages that we specified (our parameter sets in this example). \ No newline at end of file diff --git a/docs/user_guide/running_studies.md b/docs/user_guide/running_studies.md new file mode 100644 index 000000000..91215c19e --- /dev/null +++ b/docs/user_guide/running_studies.md @@ -0,0 +1,520 @@ +# Running Studies + +Once you have created a [spec file](./specification.md) for your workflow, it can be ran either [locally](#local-runs) or in a [distributed manner](#distributed-runs). + +Additionally, any [user variable](./variables.md#user-variables) created in your spec file can be [modified from the command line](#command-line-substitution). + +Being able to run in a distributed manner and the support for command line substitution opens the door to [iterative runs](#iterative-runs) (see the [Iterative Demo](../examples/iterative.md) for a full demonstration of this process). + +## Local Runs + +!!! warning + + You should __*not*__ execute production runs locally. Instead, execute them [in a distributed manner](#distributed-runs). + + Local runs should be for testing workflow execution at very small scales. + +At the start of this [User Guide](./index.md) it's mentioned that Merlin uses a producer consumer model that relies on a centralized server. Sometimes it may be faster and easier to test the execution of a workflow without using this centralized server. This is where local runs come in handy. + +In a local run, tasks are executed sequentially in your current shell and are never sent to the broker. Therefore, workers do not need to be started for a local run to execute. + +Local runs are typically used when designing a workflow. 
They give users a quick way to run through their study and ensure every step executes as expected before doing larger production runs, which should be done [in a distributed manner](#distributed-runs).
+
+To run a study locally, use the `--local` option of the [`merlin run`](./command_line.md#run-merlin-run) command:
+
+```bash
+merlin run --local my_specification.yaml
+```
+
+## Distributed Runs
+
+!!! warning
+
+    Distributed runs __*require*__ a connection to both the broker and results server. See the [Configuration](./configuration/index.md) page for instructions on how to set up these connections if you haven't done so already.
+
+    If you can run `merlin info` and see no errors then your connection is working.
+
+Most of the runs done in Merlin will likely be distributed runs. These runs require:
+
+1. Tasks representing how to execute your workflow
+2. Workers to coordinate and execute these tasks
+
+In a distributed run, the [DAG](../faq.md#what-is-a-dag) created by [the `study` block](./specification.md#the-study-block) of your spec file is converted to [Celery tasks](https://docs.celeryq.dev/en/stable/userguide/tasks.html), which are then sent to queues that live on the [broker](./configuration/index.md#what-is-a-broker). To accomplish this, Merlin provides the [`merlin run`](./command_line.md#run-merlin-run) command:
+
+```bash
+merlin run my_specification.yaml
+```
+
+The diagram below displays the basic flow of this process:
+
+<figure markdown>
+  ![merlin run diagram](../assets/images/running_studies/merlin-run-diagram.png)
+  <figcaption>High-Level Overview of "merlin run"</figcaption>
+</figure>
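+
+After queuing tasks, it can be useful to confirm that they actually made it to the broker before starting any workers. Below is a minimal sketch of one way to do that, using the `merlin status` command (see the [Command Line Interface](./command_line.md) page for the full command list):
+
+```bash
+# Queue the tasks, then check on the state of the study
+merlin run my_specification.yaml
+merlin status my_specification.yaml
+```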
+ + +Once tasks are queued on the broker, they will remain there until we initiate workers to process them. + +You can specify workers in [the `merlin` block](./specification.md#the-merlin-block) of your spec file. To initiate these workers on your current node, you can use the [`merlin run-workers`](./command_line.md#run-workers-merlin-run-workers) command. However, workers are typically launched in parallel which will require an allocation on your preferred HPC system prior to using the `merlin run-workers` command. + +=== "Launch In Parallel" + + A parallel batch allocation launch is configured to run a single worker per node. This worker will then launch a number of worker processes to manage the tasks. The number of worker processes that are launched depends on the `--concurrency` value provided to the workers and the number of nodes in your allocation that are running workers. In math terms: `num_workers = concurrency * num_nodes`. By default the concurrency value will be the number of CPUs on the node but this can be configured by users (see the [Configuring Celery Workers](./celery.md#configuring-celery-workers) section for more details). + + A full [Slurm](../faq.md#what-is-slurm) batch submission script to run the workflow on 3 nodes is shown below. + + ```bash title="workers.sbatch" + #!/bin/bash + #SBATCH -N 3 + #SBATCH -J Merlin + #SBATCH -t 30:00 + #SBATCH -p pdebug + #SBATCH --mail-type=ALL + #SBATCH -o merlin_workers_%j.out + + # Assumes you are running this in the same dir as the yaml file. + YAML_FILE=input.yaml + + # Source the merlin virtualenv (if using csh, change this to 'activate.csh' in the statement below) + source /bin/activate + + # Print out the workers command + merlin run-workers ${YAML_FILE} --echo + + # Run the workers on the allocation + merlin run-workers ${YAML_FILE} + + # Delay until the workers cease running + merlin monitor + ``` + + This script can be submitted using: + + ```bash + sbatch workers.sbatch + ``` + + Below is a visual demonstration of this worker startup process. Steps 1 and 2 in this diagram are both handled by the bash script given above. + + !!! note + + The robots in this diagram represent the workers that are started on each node of the allocation. This is the default behavior when workers are spun up. If you'd like to modify how many nodes a worker possesses, that can be customized in [the `resources` section](./specification.md#resources) of the `merlin` block in your spec file. + +
+    <figure markdown>
+      ![parallel worker launch diagram](../assets/images/running_studies/parallel-launch.png)
+      <figcaption>Flowchart for Starting Workers In Parallel</figcaption>
+    </figure>
+ +=== "Launch On The Current Node" + + !!! warning + + Launching workers on the current node should be done with caution as the default behavior is for them to occupy all CPUs on a node. This can be modified with the `--concurrency` value provided to the workers. See the [Configuring Celery Workers](./celery.md#configuring-celery-workers) section for more details. + + Workers can be launched on the current node by running: + + ```bash + merlin run-workers my_specification.yaml + ``` + + Below is a visual demonstration of this worker startup process. The robot represents the workers started on the current node. + +
+    <figure markdown>
+      ![current node worker launch diagram](../assets/images/running_studies/current-node-launch.png)
+      <figcaption>Flowchart for Starting Workers On The Current Node</figcaption>
+    </figure>
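+
+Whichever launch method you use, you can check that your workers came up and are connected to the broker with the `merlin query-workers` command:
+
+```bash
+merlin query-workers
+```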
+
+The workers that you spin up will live on the node(s) of your allocation and communicate with both the broker (to pull tasks from the queues for execution) and the [results server](./configuration/index.md#what-is-a-results-backend) (to store task state information). The communication between a worker and the broker is two-way: the worker can push new tasks to the broker as the workflow executes and pull existing tasks from the broker that need to be executed. Below is a visual representation of this communication:
+
+<figure markdown>
+  ![worker/server communication diagram](../assets/images/running_studies/worker-server-communication.png)
+  <figcaption>Communication Between Workers and the Central Server</figcaption>
+</figure>
+ +If we put everything from this section together, we can see the full picture of the producer-consumer model that Merlin utilizes in workflow runs: + +
+<figure markdown>
+  ![producer-consumer model](../assets/images/running_studies/producer-consumer-model.png)
+  <figcaption>The Producer-Consumer Model for Merlin</figcaption>
+</figure>
+
+Once finished executing, workers can be stopped with the [`merlin stop-workers`](./command_line.md#stop-workers-merlin-stop-workers) command:
+
+```bash
+merlin stop-workers --spec my_specification.yaml
+```
+
+## Dry Runs
+
+!!! tip
+
+    It may be helpful to have an understanding of [The Basics](./interpreting_output.md#the-basics) of the study output workspace in order to fully grasp how dry runs work with Merlin.
+
+!!! note
+
+    If you wish to execute a workflow after dry-running it, simply use [`merlin restart`](./command_line.md#restart-merlin-restart) (to understand why this works, see the section below on [Restarting Workflows](#restarting-workflows)).
+
+'Dry run' means telling workers to create a study's workspace and all of its necessary subdirectories and scripts (with variables expanded) without actually executing the scripts.
+
+To dry-run a workflow, use `--dry` with [`merlin run`](./command_line.md#run-merlin-run):
+
+=== "Locally"
+
+    ```bash
+    merlin run --local --dry my_specification.yaml
+    ```
+
+=== "Distributed"
+
+    ```bash
+    merlin run --dry my_specification.yaml ; merlin run-workers my_specification.yaml
+    ```
+
+!!! example
+
+    Say we have the following spec file:
+
+    ```yaml title="dry_run_example.yaml"
+    description:
+        name: dry_run_example
+        description: An example workflow to demonstrate the dry run functionality in Merlin
+
+    study:
+        - name: step_1
+          description: Echo that we're in step_1
+          run:
+            cmd: echo "in step_1"
+
+        - name: step_2
+          description: Echo that we're in step_2
+          run:
+            cmd: echo "in step_2"
+            depends: [step_1]
+    ```
+
+    We can execute a dry run of this spec file with:
+
+    ```bash
+    merlin run --local --dry dry_run_example.yaml
+    ```
+
+    This will provide us with a workspace like so:
+
+    ```bash
+    dry_run_example_<timestamp>/
+    ├── merlin_info
+    │   ├── dry_run_example.expanded.yaml
+    │   ├── dry_run_example.orig.yaml
+    │   └── dry_run_example.partial.yaml
+    ├── step_1
+    │   └── step_1.sh
+    └── step_2
+        └── step_2.sh
+    ```
+
+    Notice how there are no `.out`, `.err`, or `MERLIN_FINISHED` files. This is because the `.sh` scripts are generated but *not* executed.
+
+You can also specify dry runs from the workflow specification file by setting the `dry_run` property of [the `batch` block](./specification.md#the-batch-block) to `True`:
+
+```yaml
+batch:
+    dry_run: True
+```
+
+If this is set then you won't need the `--dry` option when you use `merlin run`.
+
+!!! example
+
+    Using the same example as above, we can modify the spec file by adding the `batch` block with `dry_run` set to `True`:
+
+    ```yaml title="dry_run_example.yaml" hl_lines="5 6"
+    description:
+        name: dry_run_example
+        description: An example workflow to demonstrate the dry run functionality in Merlin
+
+    batch:
+        dry_run: True
+
+    study:
+        - name: step_1
+          description: Echo that we're in step_1
+          run:
+            cmd: echo "in step_1"
+
+        - name: step_2
+          description: Echo that we're in step_2
+          run:
+            cmd: echo "in step_2"
+            depends: [step_1]
+    ```
+
+    Now we don't even need to specify the `--dry` option when we run the workflow:
+
+    ```bash
+    merlin run --local dry_run_example.yaml
+    ```
+
+    This will produce the same workspace setup as the previous example.
+
+## Restarting Workflows
+
+!!! note
+
+    We recommend having an understanding of [The Basics](./interpreting_output.md#the-basics) of the study output workspace in order to fully grasp how restarting works with Merlin.
+
+Certain tasks in your workflow may fail at times for various reasons. Often this can be resolved by simply restarting the workflow and having the task run again.
+
+Much like a normal run of a workflow, a restart can happen either locally or in a distributed manner.
+
+=== "Distributed"
+
+    !!! warning
+
+        Make sure you have set up your [Configuration](./configuration/index.md) before doing a distributed restart.
+
+    Workflows can be restarted using the [`merlin restart`](./command_line.md#restart-merlin-restart) command:
+
+    ```bash
+    merlin restart study_workspace_<timestamp>/
+    ```
+
+=== "Local"
+
+    Workflows can be restarted locally using the `--local` option of the [`merlin restart`](./command_line.md#restart-merlin-restart) command:
+
+    ```bash
+    merlin restart study_workspace_<timestamp>/ --local
+    ```
+
+This command will re-queue all tasks in your workflow but will only execute the tasks that *do not* have a `MERLIN_FINISHED` file in their output workspace.
+
+!!! example
+
+    Say we had a workflow run where our third sample in `step_1` failed to run because of a timeout with Slurm, so `step_2` was never run. Our resulting workspace from this run may look like:
+
+    ```bash
+    example_study/
+    ├── merlin_info/
+    │   ├── example_study.expanded.yaml
+    │   ├── example_study.orig.yaml
+    │   └── example_study.partial.yaml
+    └── step_1/
+        ├── 00
+        │   ├── MERLIN_FINISHED
+        │   ├── step_1.slurm.err
+        │   ├── step_1.slurm.out
+        │   └── step_1.slurm.sh
+        ├── 01
+        │   ├── MERLIN_FINISHED
+        │   ├── step_1.slurm.err
+        │   ├── step_1.slurm.out
+        │   └── step_1.slurm.sh
+        └── 02
+            ├── step_1.slurm.err
+            ├── step_1.slurm.out
+            └── step_1.slurm.sh
+    ```
+
+    Restarting this workspace with:
+
+    ```bash
+    merlin restart example_study/
+    ```
+
+    ...would queue up all 3 samples again but only execute the third sample since it didn't have a `MERLIN_FINISHED` file. From there, the execution of the workflow would continue and `step_2` would run after the third sample finished processing.
+
+## Command Line Substitution
+
+Merlin allows for the command line substitution of [user variables](./variables.md#user-variables) defined in [the `env` block](./specification.md#the-env-block) of your spec file. This feature is applicable to any Merlin command that supports the `--vars` option. To check which commands support the `--vars` option, either view the [Command Line Interface](./command_line.md) page or use the `--help` option with any Merlin command to see what options are available.
+
+**Syntax**
+
+```bash
+merlin COMMAND --vars KEY1=value1 KEY2=value2
+```
+
+When CLI substitution is used, the `.orig.yaml` file in the `merlin_info/` subdirectory of the output workspace will not show the changes. Instead, you can see that the substitution happened by viewing the `.partial.yaml` and/or the `.expanded.yaml` files of the `merlin_info/` subdirectory.
+
+??? example "Using CLI Substitution"
+
+    In this example we'll use a spec file that calculates cylindrical tank flow rate with water as the liquid. Here we'll have two variables that we'll modify from the CLI, `RADIUS` and `HEIGHT`, to change the size of the tank:
+
+    ```yaml title="cylindrical_flow_rate.yaml" hl_lines="7 8"
+    description:
+        name: cylindrical_flow_rate
+        description: An example workflow to demonstrate CLI substitution with Merlin. This workflow is calculating cylindrical tank flow rate with water as the liquid.
+ + env: + variables: + RADIUS: 2 # Units are meters + HEIGHT: 5 # Units are meters + HEIGHT_DIFFERENCE: 1 # Assume a height difference for the flow (e.g., 1 meter) + GRAV_ACCELERATION: 9.8 # Assume gravitational acceleration g = 9.8 m/s^2 + + study: + - name: calculate_flow_rate + description: calculate the flow rate of the cylindrical tank + run: + cmd: | + import math + + # Calculate volume of the cylindrical tank + volume = math.pi * $(RADIUS)**2 * $(HEIGHT) + + # Calculate area of the opening + opening_area = math.pi * $(RADIUS)**2 + + # Calculate velocity of water flow + velocity = math.sqrt(2 * $(GRAV_ACCELERATION) * $(HEIGHT_DIFFERENCE)) + + # Calculate flow rate + flow_rate = opening_area * velocity + + # Write results to output file + with open(f"$(WORKSPACE)/flow_rate.out", "w") as flow_file: + flow_file.write(f"Volume of the cylindrical tank: {volume:.2f} cubic meters\nFlow rate of water: {flow_rate:.2f} cubic meters per second") + shell: /usr/bin/env python3 + ``` + + Running this normally with + + ```bash + merlin run --local cylindrical_flow_rate.yaml + ``` + + ...provides us with the following output: + + ```title="flow_rate.out" + Volume of the cylindrical tank: 62.83 cubic meters + Flow rate of water: 55.63 cubic meters per second + ``` + + Now let's make the tank bigger from the command line: + + ```bash + merlin run --local cylindrical_flow_rate.yaml --vars RADIUS=14 HEIGHT=20 + ``` + + We can see how this substitution does *not* show up in the `.orig.yaml` file of the `merlin_info/` subdirectory of the output workspace but it *does* show up in the `.partial.yaml` and `.expanded.yaml` files: + + === "Orig" + + ```yaml title="cylindrical_flow_rate.orig.yaml" hl_lines="7 8" + description: + name: cylindrical_flow_rate + description: An example workflow to demonstrate CLI substitution with Merlin. This workflow is calculating cylindrical tank flow rate with water as the liquid. + + env: + variables: + RADIUS: 2 # Units are meters + HEIGHT: 5 # Units are meters + HEIGHT_DIFFERENCE: 1 # Assume a height difference for the flow (e.g., 1 meter) + GRAV_ACCELERATION: 9.8 # Assume gravitational acceleration g = 9.8 m/s^2 + ``` + + === "Partial & Expanded" + + ```yaml hl_lines="12 13" + description: + name: cylindrical_flow_rate + description: An example workflow to demonstrate CLI substitution with Merlin. This workflow is calculating cylindrical tank flow rate with water as the liquid. + + batch: + type: local + dry_run: false + shell: /bin/bash + + env: + variables: + RADIUS: 14 + HEIGHT: 20 + HEIGHT_DIFFERENCE: 1 + GRAV_ACCELERATION: 9.8 + . + . + . + ``` + + As expected, this run will provide us with different output: + + ```title="flow_rate.out" + Volume of the cylindrical tank: 12315.04 cubic meters + Flow rate of water: 2726.05 cubic meters per second + ``` + +## Iterative Runs + +With the producer/consumer model from [Distributed Runs](#distributed-runs) and [Command Line Substitution](#command-line-substitution), suddenly iterative workflows with Merlin become possible. + +Most iterative workflows can follow the same general template for iterating. These workflows can be created with three easy steps: + +1. Adding a variable to track the current iteration number +2. Adding a variable to set the max number of iterations +3. 
Creating an iteration step in your workflow that starts the next iteration using command line substitution
+
+Below is a demo spec file showcasing these steps:
+
+```yaml title="example_iterative_wf.yaml"
+description:
+    name: example_iterative_wf_iter_$(ITER)
+    description: An example iterative workflow
+
+env:
+    variables:
+        ITER: 1  # (1)
+        MAX_ITER: 10  # (2)
+
+study:
+    - name: iterate  # (3)
+      description: Determine whether to iterate or to stop the workflow
+      run:
+        cmd: |
+            # Check to see if we should stop iterating
+            if [ $(ITER) -ge $(MAX_ITER) ] ; then
+                echo "done"
+            else
+                # Up the iteration count by one
+                next_iter=$(ITER)
+                ((next_iter=next_iter+1))
+                echo "Starting iteration " $next_iter
+
+                # Move back to the SPECROOT so that the output of our next run isn't nested in the current run
+                cd $(SPECROOT)
+
+                # Use command line substitution to pass in the next iteration value
+                merlin run $(SPECROOT)/example_iterative_wf.yaml --vars ITER=$next_iter
+            fi
+```
+
+1. Set the initial iteration number to be 1
+2. Set the max number of iterations to be 10
+3. The iteration step should almost always be the last step in your workflow
+
+!!! note
+
+    For a more detailed example, see the [Iterative Demo](../examples/iterative.md).
+
+With this setup, you'll only need to manually submit the first iteration of your workflow with [`merlin run`](./command_line.md#run-merlin-run):
+
+```bash
+merlin run example_iterative_wf.yaml
+```
+
+The following iterations will be queued by the workers when they process the `iterate` step and execute the nested `merlin run` command.
+
+Additionally, we only have to start our workers with [`merlin run-workers`](./command_line.md#run-workers-merlin-run-workers) one time, before the first iteration:
+
+```bash
+merlin run-workers example_iterative_wf.yaml
+```
+
+Since workers remain alive until we manually stop them with [`merlin stop-workers`](./command_line.md#stop-workers-merlin-stop-workers), they will keep watching the queues on the broker for new tasks as the iterations progress.
+
+Putting everything together, this means the workers will act as both the producer and the consumer for this workflow. They will be producing tasks with the iterative call to `merlin run` and then consuming those tasks when they pull them from the queue(s) on the broker for execution.
+
+Below is a visual representation of the iterative process.
+
+<figure markdown>
+  ![iterative diagram](../assets/images/running_studies/iterative-diagram.png)
+  <figcaption>Processing Diagram for Iterative Workflows</figcaption>
+</figure>
\ No newline at end of file diff --git a/docs/user_guide/specification.md b/docs/user_guide/specification.md new file mode 100644 index 000000000..53c682875 --- /dev/null +++ b/docs/user_guide/specification.md @@ -0,0 +1,725 @@ +# The Specification File + +At the core of Merlin is the specification (spec) file. This file is used to define how workflows should be created and executed by Merlin, and is also utilized as a way for users to keep records of their studies. + +Merlin enables several blocks in the spec file, each with their own purpose: + +| Block Name | Required? | Description | +| ---------- | --------- | ----------- | +| [`description`](#the-description-block) | Yes | General information about the study | +| [`env`](#the-env-block) | No | Fixed constants and other values that are globally set and referenced | +| [`global.parameters`](#the-globalparameters-block) | No | Parameters that are user varied and applied to the workflow | +| [`batch`](#the-batch-block) | No | Settings for submission to batch systems | +| [`study`](#the-study-block) | Yes | Steps that the study is composed of and are executed in a defined order | +| [`merlin`](#the-merlin-block) | No | Worker settings and sample generation handling | +| [`user`](#the-user-block) | No | YAML anchor definitions | + +This module will go into detail on every block and the properties available within each. + +## The `description` Block + +Since Merlin is built as extension of [Maestro](https://maestrowf.readthedocs.io/en/latest/index.html), most of the behavior of the `description` block is inherited directly from Maestro. Therefore, we recommend reading [Maestro's documentation on the `description` Block](https://maestrowf.readthedocs.io/en/latest/Maestro/specification.html#description-description) for the most accurate description of how it should be used. + +There is one difference between Merlin and Maestro when it comes to the `description` block: the use of variables. With Merlin, the `description` block can use variables defined in [the `env` block](#the-env-block). + +!!! example "Using Variables in the `description` Block" + + ```yaml + description: + name: $(STUDY_NAME) + description: An example showcasing how variables can be used in the description block + + env: + variables: + STUDY_NAME: variable_study_name + ``` + +## The `env` Block + +Since Merlin is built as extension of [Maestro](https://maestrowf.readthedocs.io/en/latest/index.html), the behavior of the `env` block is inherited directly from Maestro. Therefore, we recommend reading [Maestro's documentation on the `env` block](https://maestrowf.readthedocs.io/en/latest/Maestro/specification.html#environment-env) for the most accurate description of how it should be used. + +For more information on how variables defined in this block can be used, check out the [Variables](./variables.md) page (specifically the [Token Syntax](./variables.md#token-syntax), [User Variables](./variables.md#user-variables), and [Environment Variables](./variables.md#environment-variables) sections). + +!!! example "A Basic `env` Block" + + ```yaml + env: + variables: + N_SAMPLES: 10 + OUTPUT_PATH: /path/to/study_output/ + ``` + +## The `global.parameters` Block + +Since Merlin is built as extension of [Maestro](https://maestrowf.readthedocs.io/en/latest/index.html), the behavior of the `global.parameters` block is inherited directly from Maestro. 
+
+It would also be a good idea to read through [Specifying Study Parameters](https://maestrowf.readthedocs.io/en/latest/Maestro/parameter_specification.html) from Maestro, which goes into further detail on how to use parameters in your study. There you will also find details on how to programmatically generate parameters using [`pgen`](https://maestrowf.readthedocs.io/en/latest/Maestro/parameter_specification.html#parameter-generator-pgen).
+
+!!! example "A Basic `global.parameters` Block"
+
+    ```yaml
+    global.parameters:
+        RADIUS:
+            values: [2, 5, 10, 20]
+            label: RADIUS.%%
+        HEIGHT:
+            values: [5, 10, 30, 60]
+            label: HEIGHT.%%
+    ```
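+
+If your parameter values are easier to compute than to enumerate by hand, `pgen` lets you build this block programmatically in Python. Below is a minimal sketch of what such a generator could look like; the file name `my_pgen.py` and the parameter values are just for illustration, and Maestro's `pgen` documentation linked above covers the full interface:
+
+```python
+from maestrowf.datastructures.core import ParameterGenerator
+
+
+def get_custom_generator(env, **kwargs):
+    """Programmatically build the same RADIUS/HEIGHT parameters as the example above."""
+    p_gen = ParameterGenerator()
+    params = {
+        "RADIUS": {"values": [2, 5, 10, 20], "label": "RADIUS.%%"},
+        "HEIGHT": {"values": [5, 10, 30, 60], "label": "HEIGHT.%%"},
+    }
+    # Each parameter needs its values and the label used for workspace naming
+    for key, value in params.items():
+        p_gen.add_parameter(key, value["values"], value["label"])
+    return p_gen
+```
+
+A generator like this can then be swapped in at run time with the `--pgen` option (e.g. `merlin run my_spec.yaml --pgen my_pgen.py`) in place of a hardcoded `global.parameters` block.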
+
+## The `batch` Block
+
+!!! warning
+
+    Although the `batch` block exists in both Maestro and Merlin spec files, this block will differ slightly in Merlin.
+
+!!! tip
+
+    This block is frequently used in conjunction with the [`LAUNCHER` and `VLAUNCHER` variables](./variables.md#the-launcher-and-vlauncher-variables).
+
+The `batch` block is an optional block that enables specification of HPC scheduler information, allowing you to write steps that are decoupled from particular machines and are thus more portable/reusable. Below are the base properties for this block.
+
+| Property Name | Required? | Type | Description |
+| ------------- | --------- | ---- | ----------- |
+| `bank` | Yes | str | Account to charge computing time to |
+| `dry_run` | No | bool | Execute a [dry run](./command_line.md#dry-run) of the study |
+| `launch_args` | No | str | Extra arguments for the parallel launch command |
+| `launch_pre` | No | str | Any configuration needed before the scheduler launch command (`srun`, `jsrun`, etc.) |
+| `nodes` | No | int | The number of nodes to use for all workers. This can be overridden in [the `resources` property of the `merlin` block](#resources). If this is unset the number of nodes will be queried from the environment; failing that, the number of nodes will be set to 1. |
+| `queue` | Yes | str | Scheduler queue/partition to submit jobs (study steps) to |
+| `shell` | No | str | Optional specification path to the shell to use for execution. Defaults to `/bin/bash` |
+| `type` | Yes | str | Type of scheduler managing execution. One of: `local`, `flux`, `slurm`, `lsf`, `pbs` |
+| `walltime` | No | str | The total walltime of the batch allocation (hh\:mm\:ss or mm\:ss or ss) |
+| `worker_launch` | No | str | Override the parallel launch defined in Merlin |
+
+If using `flux` as your batch type, there are a couple more properties that you can define here:
+
+| Property Name | Type | Description |
+| ------------- | ---- | ----------- |
+| `flux_exec` | str | Optional flux exec command to launch workers on all nodes if `flux_exec_workers` is True |
+| `flux_exec_workers` | bool | Optional flux argument to launch workers on all nodes |
+| `flux_path` | str | Optional path to flux bin |
+| `flux_start_opts` | str | Optional flux start options |
+
+Below are examples of different scheduler set-ups. The only required keys in each of these examples are `type`, `queue`, and `bank`.
+
+=== "Local"
+
+    ```yaml
+    batch:
+        type: local
+        queue: pbatch
+        bank: baasic
+    ```
+
+=== "Slurm"
+
+    ```yaml
+    batch:
+        type: slurm
+        queue: pbatch
+        bank: baasic
+        walltime: "08:30:00"
+    ```
+
+=== "LSF"
+
+    ```yaml
+    batch:
+        type: lsf
+        queue: pbatch
+        bank: baasic
+        nodes: 2
+    ```
+
+=== "Flux"
+
+    ```yaml
+    batch:
+        type: flux
+        queue: pbatch
+        bank: baasic
+        launch_pre: export FLUX_START_TIME=`date -u +%Y-%m-%dT%H:%M:%S.%6NZ`
+    ```
+
+## The `study` Block
+
+!!! warning
+
+    Although the `study` block exists in both Maestro and Merlin spec files, this block will differ slightly in Merlin.
+
+The `study` block is where the steps to be executed in the Merlin study are defined. The steps that are defined here will ultimately create the [DAG](../faq.md#what-is-a-dag) that's executed by Merlin.
+
+This block represents the unexpanded set of steps that the study is composed of. Here, unexpanded means no parameter or sample substitution has happened yet; the steps only contain references to the parameters and/or samples. Steps are given as a list (`-` prefixed) of properties:
+
+| Property Name | Required? | Type | Description |
+| ------------- | --------- | ---- | ----------- |
+| `name` | Yes | str | Unique name for identifying and referring to a step |
+| `description` | Yes | str | A general description of what this step is intended to do |
+| [`run`](#the-run-property) | Yes | dict | Properties that describe the actual specification of the step |
+
+### The `run` Property
+
+The `run` property contains several subproperties that define what a step does and how it relates to other steps. This is where you define the concrete shell commands the step needs to execute, the step dependencies that dictate the topology of the DAG, and any parameter, `env`, or sample tokens to inject.
+
+| Property Name | Required? | Type | Description |
+| ------------- | --------- | ---- | ----------- |
+| `cmd` | Yes | str | The actual commands to be executed for this step |
+| `depends` | No | List[str] | List of other steps which must successfully execute before this task can be executed |
+| `max_retries` | No | int | The maximum number of retries allowed for this step |
+| `restart` | No | str | Similar to `cmd`, providing optional alternate commands to run upon restarting, e.g. after a scheduler timeout |
+| `retry_delay` | No | int | The time in seconds to delay a retry by |
+| `shell` | No | str | The shell to execute `cmd` in (e.g. `/bin/bash`, `/usr/bin/env`, `python`) (default: `/bin/bash`) |
+| `task_queue` | No | str | The name of the task queue to assign this step to. Workers will watch task queues to find tasks to execute. (default: `merlin`) |
+
+!!! example
+
+    ```yaml
+    study:
+        - name: create_data
+          description: Use a python script to create some data
+          run:
+            cmd: |  # (1)
+                echo "Creating data..."
+                python create_data.py --outfile data.npy
+                echo "Data created"
+            restart: |  # (2)
+                echo "Restarted the data creation..."
+                python create_data.py --outfile data_restart.npy
+                echo "Data created upon restart"
+            max_retries: 3  # (3)
+            retry_delay: 5  # (4)
+            task_queue: create  # (5)
+
+        - name: transpose_data
+          description: Use python to transpose the data
+          run:
+            cmd: |  # (6)
+                import numpy as np
+                import os
+
+                data_file = "$(create_data.workspace)/data.npy"
+                if not os.path.exists(data_file):
+                    data_file = "$(create_data.workspace)/data_restart.npy"
+
+                initial_data = np.load(data_file)
+                transposed_data = np.transpose(initial_data)
+                np.save("$(WORKSPACE)/transposed_data.npy", transposed_data)
+            shell: /usr/bin/env python3  # (7)
+            task_queue: transpose
+            depends: [create_data]  # (8)
+    ```
+
+    1. The `|` character allows the `cmd` to become a multi-line string
+    2. The `restart` command will be run if the initial execution of `cmd` exits with a `$(MERLIN_RESTART)` [Return Code](./variables.md#step-return-variables)
+    3. Only allow this step to retry itself 3 times
+    4. Delay by 5 seconds on each retry
+    5. All tasks created by this step will get sent to the `create` queue. They will live in this queue on the [broker](./configuration/index.md#what-is-a-broker) until a worker picks them up for execution.
+    6. This step uses two variables, `$(create_data.workspace)` and `$(WORKSPACE)`. They point to `create_data`'s output directory and `transpose_data`'s output directory, respectively. Read the section on [Reserved Variables](./variables.md#reserved-variables) for more information on these and other variables.
+    7. Setting our shell to be `python3` allows us to write python in the `cmd` rather than bash scripting
+    8. Since this step depends on the `create_data` step, it will not run until `create_data` finishes processing
+
+There are also a few optional properties for describing resource requirements to pass to the scheduler and the associated [`$(LAUNCHER)`](./variables.md#the-launcher-and-vlauncher-variables) tokens used to execute applications on HPC systems.
+
+| Property Name | Required? | Type | Description |
+| ------------- | --------- | ---- | ----------- |
+| `batch` | No | dict | Override the `batch` block for this step |
+| `nodes` | No | int | Number of nodes to reserve for executing this step; primarily used by `$(LAUNCHER)` expansion |
+| `procs` | No | int | Number of processors needed for step execution; primarily used by `$(LAUNCHER)` expansion |
+| `walltime` | No | str | Specifies maximum amount of time to reserve HPC resources for |
+
+!!! example
+
+    ```yaml
+    batch:
+        type: flux
+        queue: pbatch
+        bank: baasic
+
+    study:
+        - name: create_data
+          description: Use a python script to create some data
+          run:
+            cmd: |
+                echo "Creating data..."
+                $(LAUNCHER) python create_data.py  # (1)
+                echo "Data created"
+            nodes: 2
+            procs: 4
+            walltime: "30:00"
+            task_queue: create
+    ```
+
+    1. The [`$(LAUNCHER)`](./variables.md#the-launcher-and-vlauncher-variables) token here will be expanded to `flux run -N 2 -n 4 -t 1800.0s`
+
+Additionally, there are scheduler-specific properties that can be used. The sections below will highlight these properties.
+
+#### Slurm Specific Properties
+
+Merlin supports the following properties for Slurm:
+
+| Property Name | Equivalent `srun` Option | Type | Description | Default |
+| ------------- | ------------------------ | ---- | ----------- | ------- |
+| `cores per task` | `-c`, `--cpus-per-task` | int | Number of cores to use for each task | 1 |
+| `reservation` | `--reservation` | str | Reservation to schedule this step to; overrides the batch block | None |
+| `slurm` | N/A | str | Verbatim flags only for the srun parallel launch. This will be expanded as follows for steps that use [`LAUNCHER` or `VLAUNCHER`](./variables.md#the-launcher-and-vlauncher-variables): `srun -N <nodes> -n <procs> ... <slurm flags>` | None |
+
+!!! example
+
+    The following example will run `example_slurm_step` with the Slurm specific options `cores per task` and `slurm`. This will tell Merlin that this step needs 2 nodes, 4 cores per task, and to begin this step at noon.
+
+    ```yaml
+    batch:
+        type: slurm
+        queue: pbatch
+        bank: baasic
+
+    study:
+        - name: example_slurm_step
+          description: A step using slurm specific options
+          run:
+            cmd: |
+                $(LAUNCHER) python3 do_something.py
+            nodes: 2
+            cores per task: 4
+            slurm: --begin noon
+    ```
+
+    Here, `$(LAUNCHER)` will become `srun -N 2 -c 4 --begin noon`.
+
+#### Flux Specific Properties
+
+Merlin supports the following Flux properties:
+
+| Property Name | Equivalent `flux run` Option | Type | Description | Default |
+| ------------- | ---------------------------- | ---- | ----------- | ------- |
+| `cores per task` | `-c`, `--cores-per-task` | int | Number of cores to use for each task | 1 |
+| `gpus per task` | `-g`, `--gpus-per-task` | int | Number of gpus to use for each task | 0 |
+| `flux` | N/A | str | Verbatim flags for the flux parallel launch. This will be expanded as follows for steps that use [`LAUNCHER` or `VLAUNCHER`](./variables.md#the-launcher-and-vlauncher-variables): `flux mini run ... <flux flags>` | None |
+
+!!! example
+
+    The following example will run `example_flux_step` with the Flux specific options `cores per task` and `gpus per task`. This will tell Merlin that this step needs 2 nodes, 4 cores per task, and 1 gpu per task.
+
+    ```yaml
+    batch:
+        type: flux
+        queue: pbatch
+        bank: baasic
+
+    study:
+        - name: example_flux_step
+          description: A step using flux specific options
+          run:
+            cmd: |
+                $(LAUNCHER) python3 do_something.py
+            nodes: 2
+            cores per task: 4
+            gpus per task: 1
+    ```
+
+    Here, `$(LAUNCHER)` will become `flux run -N 2 -c 4 -g 1`.
+
+#### LSF Specific Properties
+
+Merlin supports the following properties for LSF:
+
+| Property Name | Equivalent `jsrun` Option | Type | Description | Default |
+| --------------------- | ------------------------- | ---- | ----------- | ------- |
+| `bind` | `-b`, `--bind` | str | Flag for MPI binding of tasks on a node | `rs` |
+| `cores per task` | `-c`, `--cpu_per_rs` | int | Number of cores to use for each task | 1 |
+| `exit_on_error` | `-X`, `--exit_on_error` | int | Flag to exit on error. A value of `1` enables this and `0` disables it. | 1 |
+| `gpus per task` | `-g`, `--gpu_per_rs` | int | Number of gpus to use for each task | 0 |
+| `num resource set` | `-n`, `--nrs` | int | Number of resource sets. The `nodes` property will set this same flag for LSF, so only use one or the other. | 1 |
+| `launch_distribution` | `-d`, `--launch_distribution` | str | The distribution of resources | `plane:{procs/nodes}` |
+| `lsf` | N/A | str | Verbatim flags only for the lsf parallel launch. This will be expanded as follows for steps that use [`LAUNCHER` or `VLAUNCHER`](./variables.md#the-launcher-and-vlauncher-variables): `jsrun ... <lsf flags>` | None |
+
+!!! example
+
+    The following example will run `example_lsf_step` with the LSF specific options `exit_on_error` and `bind`. This will tell Merlin that this step needs 2 nodes, to not exit on error, and to not have any binding.
+
+    ```yaml
+    batch:
+        type: lsf
+        queue: pbatch
+        bank: baasic
+
+    study:
+        - name: example_lsf_step
+          description: A step using lsf specific options
+          run:
+            cmd: |
+                $(LAUNCHER) python3 do_something.py
+            nodes: 2
+            exit_on_error: 0
+            bind: none
+    ```
+
+    Here, `$(LAUNCHER)` will become `jsrun -N 2 -X 0 -b none`.
+
+
+## The `merlin` Block
+
+The `merlin` block is where you can customize Celery workers and generate samples to be used throughout the workflow.
+
+This block is split into two main properties:
+
+| Property Name | Required? | Type | Description |
+| ------------- | --------- | ---- | ----------- |
+| [`resources`](#resources) | No | dict | Define the task server configuration and workers to run the tasks |
+| [`samples`](#samples) | No | dict | Define samples to be referenced in your study steps |
+
+Both of these properties have multiple subproperties, so we'll take a deeper dive into each one below.
+
+### Resources
+
+!!! note
+
+    Currently the only task server that Merlin supports is Celery.
+
+The `resources` property of the `merlin` block allows users to customize task server configuration and create custom workers to run tasks. This property has the following subproperties:
+
+| Property Name | Required? | Type | Description |
+| ------------- | --------- | ---- | ----------- |
+| `task_server` | No | str | The type of task server to use. **Currently "celery" is the only option.** (default: celery) |
+| `overlap` | No | bool | Flag to determine if multiple workers can pull tasks from overlapping queues. (default: False) |
+| `workers` | No | List[dict] | A list of worker definitions. |
+
+The `workers` subproperty is where you can create custom workers to process your workflow. The keys that you provide under this property will become the names of your custom workers.
+
+!!! example
+
+    The following `merlin` block will create two workers named `data_creation_worker` and `data_transpose_worker`.
+
+    ```yaml
+    merlin:
+        resources:
+            workers:
+                data_creation_worker:
+
+                data_transpose_worker:
+
+    ```
+
+Each worker can be customized with the following settings:
+
+| Setting Name | Type | Description |
+| ------------ | ---- | ----------- |
+| `args` | str | Arguments to provide to the worker. Check out [Configuring Celery Workers](./celery.md#configuring-celery-workers) and/or [Celery's worker options](https://docs.celeryq.dev/en/main/reference/cli.html#celery-worker) for more info on what can go here.<br><br>**Tip:** The most common arguments used with `args` are `--concurrency`, `--prefetch-multiplier`, `-O fair`, and `-l`. |
+| `batch` | dict | Override the main `batch` config for this worker.<br><br>**Tip:** This setting is useful if other workers are running flux, but some component of the workflow requires the native scheduler or cannot run under flux. Another possibility is to have the default `batch` type as `local` and then define workers needed for flux or slurm steps. |
+| `machines` | List[str] | A list of machines to run the given steps provided in the `steps` setting here. **A full `OUTPUT_PATH` and the `steps` argument are both _required_ for this setting. Currently all machines in the list must have access to the `OUTPUT_PATH`.**<br><br>**Note:** You'll need an allocation on any machine that you list here. You'll then have to run [`merlin run-workers`](./command_line.md#run-workers-merlin-run-workers) from every machine listed here. The `merlin run` command only has to be run once, from any machine, in order to send the tasks to the broker. |
| +| `nodes` | int | Number of nodes for this worker to run on. (defaults to all nodes on your allocation) | +| `steps` | List[str] | A list of step names for this worker to "watch". The worker will *actually* be watching the `task_queue` associated with the steps listed here. (default: `[all]`) | + +??? example "Custom Worker for Each Step" + + This example showcases how to define custom workers that watch different steps in your workflow. Here, `data_creation_worker` will execute tasks created from the `create_data` step that are sent to the `create` queue, and `data_transpose_worker` will execute tasks created from the `transpose_data` step that are sent to the `transpose` queue. + + We're also showing how to vary worker arguments using some of the most common arguments for workers. + + ```yaml + study: + - name: create_data + description: Use a python script to create some data + run: + cmd: | + echo "Creating data..." + python create_data.py + echo "Data created" + task_queue: create # (1) + + - name: transpose_data + description: Use python to transpose the data + run: + cmd: | + import numpy as np + initial_data = np.load("$(create_data.workspace)/data.npy") + transposed_data = np.transpose(initial_data) + np.save("$(WORKSPACE)/transposed_data.npy", transposed_data) + shell: /usr/bin/env python3 + task_queue: transpose + depends: [create_data] + + merlin: + resources: + workers: + data_creation_worker: + args: -l INFO --concurrency 4 --prefetch-multiplier 1 -O fair # (2) + steps: [create_data] # (3) + + data_transpose_worker: + args: -l INFO --concurrency 1 --prefetch-multiplier 1 + steps: [transpose_data] + ``` + + 1. The name of the queue for this step is important as that is where the tasks required to execute this step will be stored on the [broker](./configuration/index.md#what-is-a-broker) until a worker (in this case `data_creation_worker`) pulls the tasks and executes them. + 2. Arguments here can be broken down as follows: + - `-l` sets the log level + - `--concurrency` sets the number of worker processes to spin up on each node that this worker is running on (Celery's default is to set `--concurrency` to be the number of CPUs on your node). More info on this can be found on [Celery's concurrency documentation](https://docs.celeryq.dev/en/stable/userguide/workers.html#concurrency). + - `--prefetch-multiplier` sets the number of messages to prefetch at a time multiplied by the number of concurrent processes (Celery's default is to set `--prefetch-multiplier` to 4). More info on this can be found on [Celery's prefetch multiplier documentation](https://docs.celeryq.dev/en/stable/userguide/configuration.html#std-setting-worker_prefetch_multiplier). + - `-O fair` sets the scheduling algorithm to be fair. This aims to distribute tasks more evenly based on the current workload of each worker. + 3. Here we tell `data_creation_worker` to watch the `create_data` step. What this *actually* means is that the `data_creation_worker` will go monitor the `task_queue` associated with the `create_data` step, which in this case is `create`. Any tasks sent to the `create` queue will be pulled and executed by the `data_creation_worker`. + +??? example "Custom Workers to Run Across Multiple Machines" + + This example showcases how you can define custom workers to be able to run on multiple machines. Here, we're assuming that both machines `quartz` and `ruby` have access to our `OUTPUT_PATH`. 
+
+    ```yaml
+    env:
+        variables:
+            OUTPUT_PATH: /path/to/shared/filespace/
+            CONCURRENCY: 1
+
+    merlin:
+        resources:
+            workers:
+                cross_machine_worker:
+                    args: -l INFO --concurrency $(CONCURRENCY)  # (1)
+                    machines: [quartz, ruby]  # (2)
+                    nodes: 2  # (3)
+    ```
+
+    1. Variables can be used within worker customization. They can even be used to name workers!
+    2. This worker will be able to start on both `quartz` and `ruby`, so long as you have an allocation on both and execute [`merlin run-workers`](./command_line.md#run-workers-merlin-run-workers) from both machines.
+    3. This worker will only start on 2 nodes of our allocation.
+
+### Samples
+
+The `samples` property of the `merlin` block allows users to generate, store, and create references to samples that can be used throughout a workflow.
+
+This property comes with several subproperties to assist with the handling of samples:
+
+| Property Name | Type | Description |
+| ------------- | ---- | ----------- |
+| `column_labels` | List[str] | The names of the samples stored in `file`. This will be how you reference samples in your workflow using [token syntax](./variables.md#token-syntax). |
+| `file` | str | The name of the samples file where your samples are stored. **Must be either .npy, .csv, or .tab.** |
+| `generate` | dict | Properties that describe how the samples should be generated |
+| `level_max_dirs` | int | The number of sample output directories to generate at each level in the sample hierarchy of a step. See the "Modifying The Hierarchy Structure" example in [The Sample Hierarchy](./interpreting_output.md#the-sample-hierarchy) section for an example of how this is used. |
+
+Currently, within the `generate` property there is only one subproperty:
+
+| Property Name | Type | Description |
+| ------------- | ---- | ----------- |
+| `cmd` | str | The command to execute that will generate samples |
+
+!!! example "Basic Sample Generation & Usage"
+
+    ```yaml
+    study:
+        - name: echo_samples
+          description: Echo the values of our samples
+          run:
+            cmd: echo "var1 - $(VAR_1) ; var2 - $(VAR_2)"  # (1)
+
+    merlin:
+        samples:
+            generate:
+                cmd: spellbook make-samples -n 25 -outfile=$(MERLIN_INFO)/samples.npy  # (2)
+            file: $(MERLIN_INFO)/samples.npy  # (3)
+            column_labels: [VAR_1, VAR_2]  # (4)
+    ```
+
+    1. Samples are referenced in steps using [token syntax](./variables.md#token-syntax)
+    2. Generate 25 samples using [Merlin Spellbook](https://pypi.org/project/merlin-spellbook/)
+    3. Tell Merlin where the sample files are stored
+    4. Label the samples so that we can use them in our study with [token syntax](./variables.md#token-syntax)
+
+## The `user` Block
+
+!!! warning
+
+    Any anchors/aliases you wish to use *must* be defined *before* you use them. For instance, if you want to use an alias in your `study` block then you must put the `user` block containing the anchor definition before the `study` block in your spec file.
+
+!!! tip
+
+    This block is especially useful if you have a large chunk of code that's re-used in multiple steps.
+
+The `user` block allows other variables in the workflow file to be propagated through to the workflow. This block uses [YAML Anchors and Aliases](https://smcleod.net/2022/11/yaml-anchors-and-aliases/); anchors define a chunk of configuration, and their alias is used to refer to that specific chunk of configuration elsewhere.
+
+To define an anchor, utilize the `&` syntax. For example, the following user block will define an anchor `python3_run`.
The `python3_run` anchor creates a shorthand for running a simple print statement in Python 3:
+
+```yaml
+user:
+    python3:
+        run: &python3_run
+            cmd: |
+                print("OMG is this in python3?")
+            shell: /usr/bin/env python3
+```
+
+You can reference an anchor by utilizing the `<<: *` syntax to refer to its alias. Continuing with the example above, the following study block will reference the `python3_run` anchor:
+
+```yaml
+study:
+    - name: python3_hello
+      description: do something in python
+      run:
+        <<: *python3_run
+        task_queue: pyth3_q
+```
+
+Here we're merging the anchor's `run` value with the existing values of `run`. Therefore, this step will be expanded to:
+
+```yaml
+study:
+    - name: python3_hello
+      description: do something in python
+      run:
+        cmd: |
+            print("OMG is this in python3?")
+        shell: /usr/bin/env python3
+        task_queue: pyth3_q
+```
+
+Notice that the existing `task_queue` value was not overridden.
+
+## Full Specification
+
+Below is a full YAML specification file for Merlin. To fully understand what's going on in this example spec file, see the [Feature Demo](../examples/feature_demo.md) page.
+
+```yaml
+description:
+    name: $(NAME)
+    description: Run 10 hello worlds.
+
+batch:
+    type: local
+
+env:
+    variables:
+        OUTPUT_PATH: ./studies
+        N_SAMPLES: 10
+        WORKER_NAME: demo_worker
+        VERIFY_QUEUE: default_verify_queue
+        NAME: feature_demo
+
+        SCRIPTS: $(MERLIN_INFO)/scripts
+        HELLO: $(SCRIPTS)/hello_world.py
+        FEATURES: $(SCRIPTS)/features.json
+
+user:
+    study:
+        run:
+            hello: &hello_run
+                cmd: |
+                    python3 $(HELLO) -outfile hello_world_output_$(MERLIN_SAMPLE_ID).json $(X0) $(X1) $(X2)
+                max_retries: 1
+    python3:
+        run: &python3_run
+            cmd: |
+                print("OMG is this in python?")
+                print("Variable X2 is $(X2)")
+            shell: /usr/bin/env python3
+    python2:
+        run: &python2_run
+            cmd: |
+                print "OMG is this in python2? Change is bad."
+ print "Variable X2 is $(X2)" + shell: /usr/bin/env python2 + +study: + - name: hello + description: | + process a sample with hello world + run: + <<: *hello_run + task_queue: hello_queue + + - name: collect + description: | + process the output of the hello world samples, extracting specific features; + run: + cmd: | + echo $(MERLIN_GLOB_PATH) + echo $(hello.workspace) + ls $(hello.workspace)/X2.$(X2)/$(MERLIN_GLOB_PATH)/hello_world_output_*.json > files_to_collect.txt + spellbook collect -outfile results.json -instring "$(cat files_to_collect.txt)" + depends: [hello_*] + task_queue: collect_queue + + - name: translate + description: | + process the output of the hello world samples some more + run: + cmd: spellbook translate -input $(collect.workspace)/results.json -output results.npz -schema $(FEATURES) + depends: [collect] + task_queue: translate_queue + + - name: learn + description: | + train a learner on the results + run: + cmd: spellbook learn -infile $(translate.workspace)/results.npz + depends: [translate] + task_queue: learn_queue + + - name: make_new_samples + description: | + make a grid of new samples to pass to the predictor + run: + cmd: spellbook make-samples -n $(N_NEW) -sample_type grid -outfile grid_$(N_NEW).npy + task_queue: make_samples_queue + + - name: predict + description: | + make a new prediction from new samples + run: + cmd: spellbook predict -infile $(make_new_samples.workspace)/grid_$(N_NEW).npy -outfile prediction_$(N_NEW).npy -reg $(learn.workspace)/random_forest_reg.pkl + depends: [learn, make_new_samples] + task_queue: predict_queue + + - name: verify + description: | + if learn and predict succeeded, output a dir to signal study completion + run: + cmd: | + if [[ -f $(learn.workspace)/random_forest_reg.pkl && -f $(predict.workspace)/prediction_$(N_NEW).npy ]] + then + touch FINISHED + exit $(MERLIN_SUCCESS) + else + exit $(MERLIN_SOFT_FAIL) + fi + depends: [learn, predict] + task_queue: $(VERIFY_QUEUE) + + - name: python3_hello + description: | + do something in python + run: + <<: *python3_run + task_queue: pyth3_q + + - name: python2_hello + description: | + do something in python2, because change is bad + run: + <<: *python2_run + task_queue: pyth2_hello + +global.parameters: + X2: + values : [0.5] + label : X2.%% + N_NEW: + values : [10] + label : N_NEW.%% + +merlin: + resources: + task_server: celery + overlap: False + workers: + $(WORKER_NAME): + args: -l INFO --concurrency 3 --prefetch-multiplier 1 -Ofair + samples: + generate: + cmd: | + cp -r $(SPECROOT)/scripts $(SCRIPTS) + + spellbook make-samples -n $(N_SAMPLES) -outfile=$(MERLIN_INFO)/samples.npy + # can be a file glob of numpy sample files. + file: $(MERLIN_INFO)/samples.npy + column_labels: [X0, X1] + level_max_dirs: 25 +``` \ No newline at end of file diff --git a/docs/user_guide/variables.md b/docs/user_guide/variables.md new file mode 100644 index 000000000..270d907e3 --- /dev/null +++ b/docs/user_guide/variables.md @@ -0,0 +1,254 @@ +# Variables + +There are a number of variables which can be placed in a Merlin spec file that can control workflow execution, such as via string expansion and control flow. + +!!! note + + Only user variables and `OUTPUT_PATH` may be reassigned or overridden from the command line. + +## Token Syntax + +Before we discuss what variables can be used with Merlin, let's first discuss the syntax for variables. 
+
+Merlin follows [Maestro's minimalist token syntax](https://maestrowf.readthedocs.io/en/latest/Maestro/specification.html#tokens-maestros-minimal-workflow-dsl) for all variables. This includes [Reserved Variables](#reserved-variables), [User Variables](#user-variables), [Step Return Variables](#step-return-variables), [Parameters](./specification.md#the-globalparameters-block), and [Samples](./specification.md#samples). These variables are referenced in a spec using the `$(TOKEN_NAME)` syntax.
+
+!!! example "User Variable Example"
+
+    A user variable can be defined in the `env` block of the spec, as is discussed [below](#user-variables). In this example, we're setting a variable `MY_VARIABLE` to have the value `5`.
+
+    ```yaml
+    env:
+        variables:
+            MY_VARIABLE: 5
+
+    study:
+        - name: my_step
+          description: example showcasing token syntax
+          run:
+            cmd: echo "The value of my variable is $(MY_VARIABLE)"
+    ```
+
+    If we ran this study, `my_step` would produce a `my_step.out` file containing the string "The value of my variable is 5".
+
+## Directory Structure Context
+
+The directory structure of Merlin output looks like this:
+
+```
+SPECROOT
+└── <spec_name>.yaml
+
+...
+
+OUTPUT_PATH
+└── MERLIN_WORKSPACE
+    ├── MERLIN_INFO
+    │   ├── <spec_name>.orig.yaml
+    │   ├── <spec_name>.partial.yaml
+    │   └── <spec_name>.expanded.yaml
+    ├── <step_name>.workspace
+    └── WORKSPACE
+```
+
+## Reserved Variables
+
+Reserved variables are study variables that Merlin uses. They may be referenced within a spec file but typically should not be reassigned or overridden. There are three exceptions to this rule: `$(LAUNCHER)`, `$(VLAUNCHER)`, and `$(OUTPUT_PATH)`. All three of these variables *can* be modified.
+
+| Variable | Description | Example Expansion |
+| -------- | ----------- | ----------------- |
+| `$(LAUNCHER)` | Abstracts HPC scheduler specific job launching wrappers such as srun (Slurm). See [below](#the-launcher-and-vlauncher-variables) for more info. | `srun -N 1 -n 3` |
+| `$(MERLIN_GLOB_PATH)` | All of the directories in a simulation tree as a glob (`*`) string. | `/\*/\*/\*/\*` |
+| `$(MERLIN_INFO)` | Directory within `MERLIN_WORKSPACE` that holds the provenance specs and sample generation results. Commonly used to hold `samples.npy`. | `$(MERLIN_WORKSPACE)/merlin_info/` |
+| `$(MERLIN_PATHS_ALL)` | A space delimited string of all of the paths; can be used as is in a bash for loop, for instance: `for path in $(MERLIN_PATHS_ALL); do ls $path; done` | `0/0/0 0/0/1 0/0/2 0/0/3` |
+| `$(MERLIN_SAMPLE_ID)` | Sample index in an ensemble. | `0 1 2 3` |
+| `$(MERLIN_SAMPLE_NAMES)` | Names of merlin sample values. | `SAMPLE_COLUMN_1 SAMPLE_COLUMN_2 ...` |
+| `$(MERLIN_SAMPLE_PATH)` | Path in the sample directory tree to a sample's directory, i.e. where the task is actually run. | `/0/0/0/ /0/0/1/ /0/0/2/ /0/0/3/` |
+| `$(MERLIN_SAMPLE_VECTOR)` | Vector of merlin sample values. | `$(SAMPLE_COLUMN_1) $(SAMPLE_COLUMN_2) ...` |
+| `$(MERLIN_SPEC_ARCHIVED_COPY)` | Archived version of `MERLIN_SPEC_EXECUTED_RUN` with all variables and paths fully resolved. | `$(MERLIN_INFO)/*.expanded.yaml` |
+| `$(MERLIN_SPEC_EXECUTED_RUN)` | Parsed and processed yaml file with command-line variable substitutions included. | `$(MERLIN_INFO)/*.partial.yaml` |
+| `$(MERLIN_SPEC_ORIGINAL_TEMPLATE)` | Copy of the original yaml file passed to `merlin run`. | `$(MERLIN_INFO)/*.orig.yaml` |
+| `$(MERLIN_TIMESTAMP)` | The time a study began. May be used as a unique identifier. | `"YYYYMMDD-HHMMSS"` |
+| `$(MERLIN_WORKSPACE)` | Output directory generated by a study at `OUTPUT_PATH`. Ends with `MERLIN_TIMESTAMP`. | `$(OUTPUT_PATH)/ensemble_name_$(MERLIN_TIMESTAMP)` |
+| `$(OUTPUT_PATH)` | Directory path that the study output will be written to. If not defined, this will default to the current working directory. This value may be reassigned or overridden. | `./studies` |
+| `$(SPECROOT)` | Directory path of the specification file. | `/globalfs/user/merlin_workflows` |
+| `$(VLAUNCHER)` | The same as `$(LAUNCHER)` but allows for shell variable substitution. See [below](#the-launcher-and-vlauncher-variables) for more info. | `srun -N 1 -n 3` |
+| `$(WORKSPACE)` | The workspace directory for the current step. | `$(OUTPUT_PATH)/ensemble_name_$(MERLIN_TIMESTAMP)/current_step_name/` |
+| `$(<step_name>.workspace)` | Can be used in a step to reference the workspace of a previous step.<br><br>**Note:** `step_name` here is the `name` key of the other study step. | `$(OUTPUT_PATH)/ensemble_name_$(MERLIN_TIMESTAMP)/step_name/` |
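+
+To make these concrete, here is a small hypothetical step (the step name and echoed messages are made up for illustration) that simply prints a few of the reserved variables above into its `.out` file:
+
+```yaml
+study:
+    - name: inspect_reserved_vars
+      description: Print a few reserved variables to this step's .out file
+      run:
+        cmd: |
+            echo "Study started at: $(MERLIN_TIMESTAMP)"
+            echo "This step's workspace: $(WORKSPACE)"
+            echo "Provenance specs live in: $(MERLIN_INFO)"
+            echo "The spec file came from: $(SPECROOT)"
+```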
+
+### The `LAUNCHER` and `VLAUNCHER` Variables
+
+`$(LAUNCHER)` is a special case of a reserved variable since its value *can* be changed. It serves as an abstraction for launching a job with parallel schedulers like [Slurm](../faq.md#what-is-slurm), [LSF](../faq.md#what-is-lsf), and [Flux](../faq.md#what-is-flux), and it can be used within a step command.
+
+The arguments that the `LAUNCHER` variable can use are:
+
+| Argument Name | Description |
+| ------------- | ----------- |
+| `procs` | The total number of MPI tasks |
+| `nodes` | The total number of MPI nodes |
+| `walltime` | The total walltime of the run (hh\:mm\:ss or mm\:ss or ss) **(not available in LSF)** |
+| `cores per task` | The number of hardware threads per MPI task |
+| `gpus per task` | The number of GPUs per MPI task |
+
+!!! example "LAUNCHER Example"
+
+    Let's say we start with this run command inside our step:
+
+    ```yaml
+    study:
+        - name: LAUNCHER example
+          description: An example step showcasing the LAUNCHER variable
+          run:
+            cmd: srun -N 1 -n 3 python script.py
+    ```
+
+    We can modify this to use the `$(LAUNCHER)` variable like so:
+
+    ```yaml
+    batch:
+        type: slurm
+
+    study:
+        - name: LAUNCHER example
+          description: An example step showcasing the LAUNCHER variable
+          run:
+            cmd: $(LAUNCHER) python script.py
+            nodes: 1
+            procs: 3
+    ```
+
+    In other words, the `$(LAUNCHER)` variable here would be expanded to:
+
+    ```bash
+    srun -N 1 -n 3
+    ```
+
+Similarly, the `$(VLAUNCHER)` variable behaves almost the same way as the `$(LAUNCHER)` variable. The key distinction lies in its source of information. Instead of drawing certain configuration options from the `run` section of a step, it retrieves specific shell variables. These shell variables are automatically generated by Merlin when you include the `$(VLAUNCHER)` variable in a step command, but they can also be customized by the user.
+
+Currently, the following shell variables are supported:
+
+| Variable | Description | Default |
+| ----------------- | --------------------------------- | ------- |
+| `${MERLIN_NODES}` | The number of nodes | 1 |
+| `${MERLIN_PROCS}` | The number of tasks/procs | 1 |
+| `${MERLIN_CORES}` | The number of cores per task/proc | 1 |
+| `${MERLIN_GPUS}` | The number of gpus per task/proc | 0 |
+
+!!! example "VLAUNCHER Example"
+
+    Let's say we have the following defined in our yaml file:
+
+    ```yaml
+    batch:
+        type: flux
+
+    study:
+        - name: VLAUNCHER example
+          description: An example step showcasing the VLAUNCHER variable
+          run:
+            cmd: |
+                MERLIN_NODES=4
+                MERLIN_PROCS=2
+                MERLIN_CORES=8
+                MERLIN_GPUS=2
+                $(VLAUNCHER) python script.py
+    ```
+
+    The `$(VLAUNCHER)` variable here would be expanded to:
+
+    ```bash
+    flux run -N 4 -n 2 -c 8 -g 2
+    ```
+
+## User Variables
+
+User variables are variables defined in the `env` section of a spec file, as in this example:
+
+```yaml
+env:
+    variables:
+        ID: 42
+        EXAMPLE_VAR: hello
+```
+
+As long as they're defined in order, you can nest user variables like so:
+
+```yaml
+env:
+    variables:
+        EXAMPLE_VAR: hello
+        WORKER_NAME: $(EXAMPLE_VAR)_worker
+```
+
+Like all other Merlin variables, user variables may be used anywhere (as a yaml key or value) within a specification, as below:
+
+```yaml hl_lines="5 10"
+study:
+    - name: user_variable_example
+      description: An example step showcasing user variables
+      run:
+        cmd: echo "$(EXAMPLE_VAR), world!"
+ +merlin: + resources: + workers: + $(WORKER_NAME): + args: -l INFO + steps: [all] +``` + +If you want to programmatically define the study name, you can include variables in the `description.name` field as long as it makes a valid filename: + +```yaml +description: + name: my_$(EXAMPLE_VAR)_study_$(ID) + description: example of programmatic study name +``` + +The above would produce a study called `my_hello_study_42`. + +## Environment Variables + +Merlin expands Unix environment variables for you. The values of the user variables below would be expanded: + +```yaml +env: + variables: + MY_HOME: ~/ + MY_PATH: $PATH + USERNAME: ${USER} +``` + +However, Merlin leaves environment variables found in shell scripts (think `cmd` and `restart`) alone. So this step: + +```yaml +study: + - name: step1 + description: an example + run: + cmd: echo $PATH ; echo $(MY_PATH) +``` + +...would be expanded as: + +```yaml +study: + - name: step1 + description: an example + run: + cmd: echo $PATH ; echo /an/example/:/path/string/ +``` + +## Step Return Variables + +When a Merlin step finishes executing, a return code is provided by Merlin behind the scenes. This return code is used to determine what to do upon step completion. + +If necessary, users can raise their own return codes within steps. The table below lists all Merlin return codes and an example of how to raise each one. + +| Variable | Description | Example Usage | +| -------- | ----------- | ------------- | +|
## Step Return Variables

When a Merlin step finishes executing, a return code is provided to Merlin behind the scenes. This return code is used to determine what to do upon step completion.

If necessary, users can raise their own return codes within steps. The table below lists all Merlin return codes and an example of how to raise each one.

| Variable | Description | Example Usage |
| -------- | ----------- | ------------- |
| `$(MERLIN_SUCCESS)` | This step was successful. Keep going to the next task. **Default step behavior if no exit code is given.** | `echo "hello, world!"`<br>`exit $(MERLIN_SUCCESS)` |
| `$(MERLIN_RESTART)` | Run this step's `restart` command, or re-run `cmd` if `restart` is absent. The default maximum number of retries and restarts for any given step is 30; you can override this by adding a `max_retries` field under the `run` field in the specification. A warning is issued, and by default the restart happens after 1 second; to override the delay time, specify `retry_delay`. | `run:`<br>`    cmd: \|`<br>`        touch my_file.txt`<br>`        echo "hi mom!" >> my_file.txt`<br>`        exit $(MERLIN_RESTART)`<br>`    restart: \|`<br>`        echo "bye, mom!" >> my_file.txt`<br>`    max_retries: 23`<br>`    retry_delay: 10` |
| `$(MERLIN_RETRY)` | Retry this step's `cmd` command. The default maximum number of retries for any given step is 30; you can override this by adding a `max_retries` field under the `run` field in the specification. A warning is issued, and by default the retry happens after 1 second; to override the delay time, specify `retry_delay`. | `run:`<br>`    cmd: \|`<br>`        touch my_file.txt`<br>`        echo "hi mom!" >> my_file.txt`<br>`        exit $(MERLIN_RETRY)`<br>`    max_retries: 23`<br>`    retry_delay: 10` |
| `$(MERLIN_SOFT_FAIL)` | Mark this step as a failure and note it in the warning log, but keep executing the workflow. Unknown return codes are translated to soft fails so that they can be logged. | `echo "Uh-oh, this sample didn't work"`<br>`exit $(MERLIN_SOFT_FAIL)` |
| `$(MERLIN_HARD_FAIL)` | Something went terribly wrong and we need to stop the whole workflow. Raises a `HardFailException` and stops all workers connected to that step. Workers will stop after a 60 second delay so that the step can be acknowledged by the server. **Note:** workers in isolated parts of the workflow that aren't consuming from the failed step will continue; you can stop all workers with `$(MERLIN_STOP_WORKERS)`. | `echo "Oh no, we've created skynet! Abort!"`<br>`exit $(MERLIN_HARD_FAIL)` |
| `$(MERLIN_STOP_WORKERS)` | Launch a task to stop all active workers. The stop happens after a 60 second delay to allow the current task to finish and acknowledge the results to the server. | `# send a signal to all workers to stop`<br>`exit $(MERLIN_STOP_WORKERS)` |
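To see how a return code fits into a full step, here's a minimal sketch (with a hypothetical `flaky_sim.py` standing in for any command that fails intermittently) of a step that retries up to 5 times, waiting 30 seconds between attempts:

```yaml
study:
    - name: flaky_step
      description: Retry a flaky command instead of failing outright
      run:
          cmd: |
              # retry the step if the (hypothetical) script exits non-zero
              if ! python flaky_sim.py; then
                  exit $(MERLIN_RETRY)
              fi
          max_retries: 5
          retry_delay: 30
```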
diff --git a/merlin/main.py b/merlin/main.py
index 0ee2e36ce..442092924 100644
--- a/merlin/main.py
+++ b/merlin/main.py
@@ -486,7 +486,7 @@ def setup_argparse() -> None:  # pylint: disable=R0915
     )
     restart.set_defaults(func=process_restart)
     restart.add_argument("restart_dir", type=str, help="Path to an existing Merlin workspace directory")
-    restart.add_argument(
+    restart.add_argument(  # TODO should this just be boolean instead of store_const?
         "--local",
         action="store_const",
         dest="run_mode",
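The TODO above asks whether `--local` could be a plain boolean instead of a `store_const` argument. A sketch of that alternative (hypothetical, not what the code currently does; the help text is made up for illustration):

```python
# Hypothetical boolean form of the --local flag. Downstream code would then
# branch on args.local instead of comparing args.run_mode against a string.
restart.add_argument(
    "--local",
    action="store_true",
    dest="local",
    default=False,
    help="run the restart locally instead of distributed",
)
```

One reason to keep `store_const`: several flags can share a single `run_mode` destination, so the rest of the code only ever inspects one attribute.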
diff --git a/mkdocs.yml b/mkdocs.yml
new file mode 100644
index 000000000..5843f9d0a
--- /dev/null
+++ b/mkdocs.yml
@@ -0,0 +1,126 @@
site_name: Merlin

nav:
  - Merlin: "index.md"
  - Tutorial:
    - Overview: "tutorial/index.md"
    - 0. Prerequisites: "tutorial/0_prerequisites.md"
    - 1. Introduction: "tutorial/1_introduction.md"
    - 2. Installation: "tutorial/2_installation.md"
    - 3. Hello, World!: "tutorial/3_hello_world.md"
    - 4. Run a Real Simulation: "tutorial/4_run_simulation.md"
    - 5. Advanced Topics: "tutorial/5_advanced_topics.md"
    - 6. Contribute to Merlin: "tutorial/6_contribute.md"
    - 7. Port Your Own Application: "tutorial/7_port_application.md"
  - User Guide:
    - Overview: "user_guide/index.md"
    - Installation: "user_guide/installation.md"
    - Configuration:
      - Overview: "user_guide/configuration/index.md"
      - External Server: "user_guide/configuration/external_server.md"
      - Merlin Server: "user_guide/configuration/merlin_server.md"
    - Command Line Interface: "user_guide/command_line.md"
    - Specification: "user_guide/specification.md"
    - Variables: "user_guide/variables.md"
    - Running Studies: "user_guide/running_studies.md"
    - Interpreting Output: "user_guide/interpreting_output.md"
    - Celery: "user_guide/celery.md"
    - Docker: "user_guide/docker.md"
    - Contributing: "user_guide/contributing.md"
  - Examples:
    - Overview: "examples/index.md"
    - Hello World Examples: "examples/hello.md"
    - Feature Demo: "examples/feature_demo.md"
    - Iterative Demo: "examples/iterative.md"
    - Restart Examples: "examples/restart.md"
    - HPC Examples: "examples/hpc.md"
    - Flux Examples: "examples/flux.md"
    - Slurm Examples: "examples/slurm.md"
    - LSF Examples: "examples/lsf.md"
  - FAQ: "faq.md"
  - Reference Guide:
    - Merlin Reference: "api_reference/index.md"
    - API Reference: "api_reference/"
  - Contact Us: "contact.md"

theme:
  name: material
  language: en
  logo: assets/images/merlin_icon.png
  features:
    - header.autohide
    - navigation.tabs
    - navigation.tabs.sticky
    - navigation.top
    - navigation.instant
    - navigation.indexes
    - search.suggest
    - search.highlight
    - content.code.annotate
    - content.code.copy
  palette:
    - media: "(prefers-color-scheme: dark)"
      scheme: slate
      toggle:
        icon: material/toggle-switch-off-outline
        name: Switch to light mode
      primary: black
      accent: deep orange
    - media: "(prefers-color-scheme: light)"
      scheme: default
      toggle:
        icon: material/toggle-switch
        name: Switch to dark mode
      primary: black
      accent: deep orange

markdown_extensions:
  - admonition
  - attr_list
  - md_in_html
  - footnotes
  - pymdownx.emoji:
      emoji_index: !!python/name:material.extensions.emoji.twemoji
      emoji_generator: !!python/name:material.extensions.emoji.to_svg
  - pymdownx.highlight:
      anchor_linenums: true
      line_spans: __span
      pygments_lang_class: true
  - pymdownx.inlinehilite
  - pymdownx.snippets
  - pymdownx.superfences
  - pymdownx.details
  - pymdownx.tabbed:
      alternate_style: true
  - markdown_grid_tables

plugins:
  - glightbox
  - search
  - codeinclude:
      title_mode: pymdownx.tabbed
  # - gen-files:
  #     scripts:
  #       - docs/gen_ref_pages.py
  # - mkdocstrings:
  #     handlers:
  #       python:
  #         paths: [merlin]
  #         options:
  #           docstring_style: sphinx
  # - literate-nav:
  #     nav_file: SUMMARY.md

extra:
  social:
    - icon: fontawesome/brands/github
      link: https://github.com/LLNL/merlin
      name: Merlin on GitHub

extra_css:
  - assets/stylesheets/extra.css

extra_javascript:
  - assets/javascripts/swap_lp_image.js

copyright: Copyright © 2024 | Lawrence Livermore National Security