Commit

#66 notebook to merge datasets
funkchaser committed Feb 13, 2025
1 parent a13ac1e commit cb3470d
Showing 1 changed file with 220 additions and 0 deletions.
220 changes: 220 additions & 0 deletions scripts/merge_datasets.ipynb
@@ -0,0 +1,220 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup\n",
"1. Place this file into a folder that contains the dataset folders you want to merge (and no other folders).\n",
"2. Specify the name fo the new dataset (`NEW_DATASET_NAME`). This will create a new folder for the merged dataset. This folder will be overwritten on every run.\n",
"3. Specify how to shard the new merged dataset (how many samples to save per file: `NEW_SAMPLES_PER_FILE`).\n",
"\n",
"\n",
"> NOTE:\n",
"> all variable definitions (names, dimensions, domains etc) need to match across all datasets!\n",
"\n",
"> NOTE:\n",
"> domains of the performance attributes will be updated to match the merged data!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"NEW_DATASET_NAME = \"merged\"\n",
"NEW_SAMPLES_PER_FILE = 100"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# -----------------------------------------------------------------------------\n",
"# RUN!\n",
"No need to modify anything else in this file, but you may want to inspect the outputs and printouts to check everything is as expected."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from aixd.data.encoders import json_load\n",
"import os\n",
"import pandas as pd\n",
"from aixd.data import Dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"root_folder = os.getcwd()\n",
"dataset_names = [name for name in os.listdir(root_folder) if os.path.isdir(os.path.join(root_folder, name)) and name != NEW_DATASET_NAME]\n",
"print(f\"Found following subfolders: {dataset_names}\")"
]
},
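{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Added guard (a small sketch, not in the original notebook): stop early\n",
"# with a clear message if no dataset subfolders were found next to this file.\n",
"assert dataset_names, \"No dataset folders found next to this notebook.\""
]
},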
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Merge data\n",
"Reads all data and combines them into one dataframe \n",
"(all datasets, design parameters+performance attributes)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def load_df(root_folder, dataset_name):\n",
" # Load old sharded data from pickled dataframes\n",
"\n",
" # DPs\n",
" directory = os.path.join(root_folder, dataset_name, \"design_parameters\")\n",
" df_dp_all = []\n",
"\n",
" for filename in os.listdir(directory):\n",
" if filename.endswith(\".pkl\"):\n",
" filepath = os.path.join(directory, filename)\n",
" df = pd.read_pickle(filepath)\n",
" df_dp_all.append(df)\n",
"\n",
" df_dp_all = pd.concat(df_dp_all, axis=0)\n",
"\n",
" # PAs\n",
" directory = os.path.join(root_folder, dataset_name, \"performance_attributes\")\n",
" df_pa_all = []\n",
"\n",
" for filename in os.listdir(directory):\n",
" if filename.endswith(\".pkl\"):\n",
" filepath = os.path.join(directory, filename)\n",
" df = pd.read_pickle(filepath)\n",
" df_pa_all.append(df)\n",
"\n",
" df_pa_all = pd.concat(df_pa_all, axis=0)\n",
" df_all = pd.merge(df_dp_all, df_pa_all, how=\"inner\", on=[\"uid\"])\n",
" df_all = df_all.drop(columns=[\"uid\"])\n",
" return df_all"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dfs = []\n",
"for dataset_name in dataset_names:\n",
" df = load_df(root_folder, dataset_name)\n",
" dfs.append(df)\n",
"\n",
"df_all = pd.concat(dfs)\n",
" \n"
]
},
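{
"cell_type": "markdown",
"metadata": {},
"source": [
"Sanity check (an added sketch, not part of the original workflow): the setup notes require variable definitions to match across all datasets, so this cell verifies that every loaded dataframe exposes the same set of columns before the merged data is used."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Added sanity check: all datasets must define the same variables\n",
"# (see the NOTE in the setup section above).\n",
"reference_columns = set(dfs[0].columns)\n",
"for name, df in zip(dataset_names, dfs):\n",
"    assert set(df.columns) == reference_columns, f\"Column mismatch in dataset '{name}'\"\n",
"print(f\"All {len(dfs)} datasets share the same {len(reference_columns)} columns.\")"
]
},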
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Preview the dataframe containing the merged data. \n",
"The last column \"error\" is for internal purposes, not part of the variables."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_all"
]
},
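{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional inspection (added): tally the internal \"error\" column described\n",
"# above, so unexpected error rows are easy to spot before the import.\n",
"df_all[\"error\"].value_counts(dropna=False)"
]
},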
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Dataset object\n",
"\n",
"Restores a Dataset object from one of the datasets' json files\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"picked_dataset_object_path = os.path.join(root_folder, dataset_names[0], \"dataset_object.json\")\n",
"dataset_temp = Dataset.from_dataset_object(picked_dataset_object_path)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dataset_new = Dataset( name=NEW_DATASET_NAME,\n",
" root_path=root_folder,\n",
" file_format=\"json\",\n",
" design_par=dataset_temp.design_par,\n",
" perf_attributes=dataset_temp.perf_attributes,\n",
" overwrite= True)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dataset_temp.summary_dataobjects()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Import data from dataframe"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dataset_new.import_data_from_df(df_all, samples_perfile=NEW_SAMPLES_PER_FILE, flag_fromscratch=True)"
]
}
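,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional check (added; not part of the original notebook): count the shard files written for the merged dataset. This assumes the new dataset folder mirrors the source layout, with `design_parameters` and `performance_attributes` subfolders."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Added verification sketch: count files written for the merged dataset.\n",
"# Assumes the new folder mirrors the source datasets' subfolder layout.\n",
"for sub in (\"design_parameters\", \"performance_attributes\"):\n",
"    folder = os.path.join(root_folder, NEW_DATASET_NAME, sub)\n",
"    if os.path.isdir(folder):\n",
"        n_files = len(os.listdir(folder))\n",
"        print(f\"{sub}: {n_files} file(s) in {folder}\")\n",
"    else:\n",
"        print(f\"{sub}: folder not found at {folder}\")"
]
}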
],
"metadata": {
"kernelspec": {
"display_name": "ara-adml2",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.20"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
