mir-dataset-loaders · stefansjs · Dec 7, 2024 · Dec 7, 2024 · Dec 7, 2024 · Dec 11, 2024
diff --git a/README.md b/README.md
@@ -41,6 +41,12 @@ orchset.validate()  # validate that all the expected files are there
 example_track = orchset.choice_track()  # choose a random example track
 print(example_track)  # see the available data
 ```
+
+Or using the CLI:
+```bash
+python -m mirdata orchset  # download and validate the dataset
+```
+
 See the [documentation](https://mirdata.readthedocs.io/) for more examples and the API reference.
 
 

diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst
@@ -9,21 +9,81 @@ Installation
 
 To install Mirdata:
 
-    .. code-block:: console
+.. code-block:: console
 
-        pip install mirdata
+    pip install mirdata
 
 We recommend to do this inside a conda or virtual environment for reproducibility.
 
+Command-Line Interface
+----------------------
+
+To get started with mirdata, execute the following in your console:
+
+.. code-block:: console
+
+    python -m mirdata --help
+
+Print a list of all available dataset loaders by calling:
+
+.. code-block:: console
+
+    python -m mirdata --list
+    python -m mirdata -l
+    python -m mirdata  # If you don't specify a dataset, it defaults to listing datasets
+
+Download one or more datasets by specifying their names as arguments:
+
+.. code-block:: console
+
+    python -m mirdata orchset maestro
+
+You can specify which version of a dataset you'd like to download with ``--version``:
+
+.. code-block:: console
+
+    python -m mirdata maestro --version=2.0.0
+    python -m mirdata maestro -v 2.0.0
+
+mirdata will choose a default location for all MIR datasets. If you'd like to redirect the download destination,
+pass a target directory with ``--output``:
+
+.. code-block:: console
+
+    python -m mirdata maestro --output /opt/data/mir/maestro
+
+By default mirdata will validate any downloaded dataset(s). If you want to skip validation, you can use ``--no-validate``:
+
+.. code-block:: console
+
+    python -m mirdata maestro --no-validate
+
+Or you can print just the citation or the license with ``--citation`` and ``--license``:
+
+.. code-block:: console
+
+    python -m mirdata maestro --citation --license
+    python -m mirdata maestro -c -L  # equivalent to above
+
+When you ask for either a license or a citation, mirdata will not download/validate the dataset. If you want to
+download *and* print a license or citation, you can add the ``--download`` flag:
+
+.. code-block:: console
+
+    python -m mirdata maestro --citation --download
+    python -m mirdata maestro -cd
+
+
+Initializing a dataset
+^^^^^^^^^^^^^^^^^^^^^^
+
 Mirdata is easily imported into your Python code by:
 
 .. code-block:: python
 
     import mirdata
 
 
-Initializing a dataset
-^^^^^^^^^^^^^^^^^^^^^^
 
 Print a list of all available dataset loaders by calling:
 
@@ -37,7 +97,7 @@ To use a loader, (for example, ``orchset`) you need to initialize it by calling:
 .. code-block:: python
 
     import mirdata
-    orchset = mirdata.initialize('orchset', data_home='/choose/where/data/live')
+    orchset = mirdata.initialize('orchset', data_home='/choose/where/data/lives')
 
 Now ``orchset`` is a ``Dataset`` object containing common methods, described below.
 
@@ -50,7 +110,7 @@ Use ``version`` parameter if you wish to use a version other than the default on
 .. code-block:: python
 
     import mirdata
-    dataset = mirdata.initialize('orchset', data_home='/choose/where/data/live', version="1.0")
+    dataset = mirdata.initialize('orchset', data_home='/choose/where/data/lives', version="1.0")
 
 
 Downloading a dataset

diff --git a/mirdata/__main__.py b/mirdata/__main__.py
@@ -0,0 +1,172 @@
+#!/usr/bin/env python
+
+"""
+Command-line interface for mirdata
+
+Examples:
+
+    list available datasets
+    $ python -m mirdata --list
+
+    download one dataset, but skip validation
+    $ python -m mirdata 'orchset' --no-validate
+
+    download one or more datasets
+    $ python -m mirdata orchset gtzan_genre
+"""
+import argparse
+import logging
+import sys
+from pathlib import Path
+
+from mirdata.core import Dataset
+from . import list_datasets, initialize
+
+logger = logging.getLogger('mirdata')
+
+
+def build_cli(parser=None):
+    """Build the command-line interface by appending arguments to an argparse parser.
+
+    Args:
+        parser (argparse.ArgumentParser or None): parser to extend. A new one is created when None.
+
+    Returns:
+        argparse.ArgumentParser: the parser with all mirdata CLI arguments added
+
+    """
+    if parser is None:
+        parser = argparse.ArgumentParser()
+
+    parser.add_argument('dataset', nargs='*',
+                        help="name of the dataset to download/validate")
+    # dest must match main()'s `data_home` parameter, otherwise the chosen directory is silently ignored
+    parser.add_argument('--output', '-o', dest='data_home', default=None, type=Path,
+                        help=f"target directory where datasets will be downloaded to (default: {Dataset._default_dir()})")
+    parser.add_argument('--list', '-l', action='store_true',
+                        help="list all available datasets")
+    # default=None (not True) lets _download_one skip validation when only a citation/license is requested
+    parser.add_argument('--no-validate', dest='validate', action='store_false', default=None,
+                        help='skip dataset validation')
+    # store_true makes this a plain flag; without it argparse would demand a value after --force
+    parser.add_argument('--force', '-f', action='store_true',
+                        help="overwrite dataset if it exists")
+    parser.add_argument('--version', '-v', default='default',
+                        help="dataset version")
+    parser.add_argument('--citation', '-c', action='store_true',
+                        help="Only print the citation, don't download. Can be combined with --license and --download")
+    parser.add_argument('--license', '-L', action='store_true',
+                        help="Only print the license, don't download. Can be combined with --citation and --download")
+    parser.add_argument("--download", "-d", action='store_true', default=None,
+                        help="download the dataset even when --citation or --license is given")
+
+    return parser
+
+
+def main(dataset, list=False, data_home=None, force=False, version='default', **kwargs):
+    """List the available datasets, or download/validate the requested ones.
+
+    Parameter names mirror the argparse destinations so this can be called as ``main(**vars(args))``.
+
+    Args:
+        dataset (list): names of the datasets to process
+        list (bool): if True, only list datasets (shadows the builtin; the name is fixed by ``--list``)
+        data_home (Path or None): directory to download to, or None for mirdata's default
+        force (bool): overwrite existing files when downloading
+        version (str): dataset version to use
+        **kwargs: remaining options, forwarded to ``_download_one``
+
+    Returns:
+        int: number of datasets that failed, usable as a process exit code
+
+    """
+    if list or not dataset:
+        _list_datasets_to_console(data_home)
+        return 0
+
+    if len(dataset) > 1:
+        print(f"Preparing download of {dataset}")
+
+    failed = []
+    for d in dataset:
+        try:
+            _download_one(d, force=force, data_home=data_home, version=version, **kwargs)
+        except Exception:
+            # keep going: one broken dataset should not prevent the others from being processed
+            logger.exception("Failed to download dataset: %s", d)
+            failed.append(d)
+
+    if failed:
+        print("Failed to download datasets:")
+        print(", ".join(failed))
+
+    return len(failed)
+
+
+def _download_one(dataset, force=False, data_home=None, version=None,
+                  download=None, validate=None, citation=False, license=False,
+                  **kwargs):
+    """Initialize one dataset, then download, validate and/or print its license and citation.
+
+    Args:
+        dataset (str): name of the dataset
+        force (bool): overwrite existing files when downloading
+        data_home (Path or None): directory to download to, or None for mirdata's default
+        version (str or None): dataset version to use
+        download (bool or None): whether to download. None means "download unless only a
+            citation or license was requested"
+        validate (bool or None): whether to validate. None means "validate if downloading"
+        citation (bool): print the dataset's citation
+        license (bool): print the dataset's license
+        **kwargs: ignored; accepted so that extra CLI options do not break the call
+
+    """
+    print(f"Preparing {dataset} version={version or 'default'}")
+    dataset = initialize(dataset, data_home=data_home, version=version)
+
+    # only fall back to the defaults when the caller did not decide explicitly
+    if download is None:
+        download = not (citation or license)
+    if validate is None:
+        validate = download
+
+    if download:
+        dataset.download(force_overwrite=force)
+    if validate:
+        dataset.validate()
+    if license:
+        dataset.license()
+    if citation:
+        dataset.cite()
+
+
+def _list_datasets_to_console(downloaded: Path = None):
+    """Print the names of all available datasets, preceded by the dataset folders already on disk.
+
+    Args:
+        downloaded (Path or None): directory whose subdirectories are listed as downloaded
+            datasets. Skipped when None or when it is not an existing directory.
+
+    """
+    # is_dir() rather than exists(): iterdir() on a regular file would raise NotADirectoryError
+    if downloaded is not None and Path(downloaded).is_dir():
+        print("Downloaded datasets")
+        print("-------------------")
+        # sorted, because iterdir() yields entries in arbitrary order
+        subdirectories = sorted(p for p in Path(downloaded).iterdir() if p.is_dir())
+        print("\n".join(map(str, subdirectories)))
+        print("\n\nAvailable datasets")
+        print("--------------------")
+
+    print("\n".join(list_datasets()))
+
+
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
+
+    parser = build_cli()
+    args = parser.parse_args()
+
+    # the exit status is the number of datasets that failed (0 on success)
+    return_code = main(**vars(args))
+    sys.exit(return_code)
diff --git a/mirdata/core.py b/mirdata/core.py
@@ -192,8 +192,11 @@ def default_path(self):
             str: Local path to the dataset
 
         """
-        mir_datasets_dir = os.path.join(os.getenv("HOME", "/tmp"), "mir_datasets")
-        return os.path.join(mir_datasets_dir, self.name)
+        return os.path.join(self._default_dir(), self.name)
+
+    @classmethod
+    def _default_dir(cls):  # root directory that default_path() joins the dataset name onto
+        return os.path.join(os.getenv("HOME", tempfile.gettempdir()), "mir_datasets")  # temp dir when HOME is unset; NOTE(review): confirm `tempfile` is imported in this module
 
     def _track(self, track_id):
         """Load a track by track_id.

diff --git a/mirdata/download_utils.py b/mirdata/download_utils.py
@@ -212,7 +212,7 @@ def download_from_remote(remote, save_dir, force_overwrite, allow_invalid_checks
         str: Full path of the created file.
 
     """
-    file_uri = parse_uri(save_dir)
+    file_uri = parse_uri(str(save_dir))
     if file_uri.scheme != "file":
         raise NotImplementedError(
             "mirdata only supports downloading to a local filesystem. "

diff --git a/mirdata/version.py b/mirdata/version.py
@@ -2,4 +2,4 @@
 """Version info"""
 
 short_version = "0.3"
-version = "0.3.9"
+version = "0.3.10a"
diff --git a/pyproject.toml b/pyproject.toml
@@ -26,11 +26,11 @@ dependencies = [
     "chardet>=5.0.0",
     "Deprecated>=1.2.14",
     "h5py>=3.7.0",
-    "jams>=0.3.4",
+    "jams>0.3.4",
     "librosa>=0.10.1",
     "numpy>=1.21.6,<2.0",
     "pandas>=1.3.5",
-    "pretty_midi>=0.2.10",
+    "pretty_midi>0.2.10",
     "pyyaml>=6.0",
     "openpyxl>=3.0.10",
     "requests>=2.31.0",
@@ -77,9 +77,13 @@ cipi = ["music21==6.7.1"]
 gcs = ["smart_open[gcs]"]
 s3 = ["smart_open[s3]"]
 http = ["smart_open[http]"]
+all = ["openpyxl==3.0.10", "dali-dataset==1.1", "music21==6.7.1", "smart_open[gcs,s3,http]"]
 
 [project.urls]
 Homepage = "https://github.com/mir-dataset-loaders/mirdata"
 Documentation = "https://mirdata.readthedocs.io/en/stable/"
 Issues = "https://github.com/mir-dataset-loaders/mirdata/issues"
-Releases = "https://github.com/mir-dataset-loaders/mirdata/releases"
+Releases = "https://github.com/mir-dataset-loaders/mirdata/releases"
+
+[tool.pytest.ini_options]
+testpaths = ['tests']