-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
a13ac1e
commit cb3470d
Showing
1 changed file
with
220 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,220 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"## Setup\n", | ||
"1. Place this file into a folder that contains the dataset folders you want to merge (and no other folders).\n", | ||
"2. Specify the name fo the new dataset (`NEW_DATASET_NAME`). This will create a new folder for the merged dataset. This folder will be overwritten on every run.\n", | ||
"3. Specify how to shard the new merged dataset (how many samples to save per file: `NEW_SAMPLES_PER_FILE`).\n", | ||
"\n", | ||
"\n", | ||
"> NOTE:\n", | ||
"> all variable definitions (names, dimensions, domains etc) need to match across all datasets!\n", | ||
"\n", | ||
"> NOTE:\n", | ||
"> domains of the performance attributes will be updated to match the merged data!" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"NEW_DATASET_NAME = \"merged\"\n", | ||
"NEW_SAMPLES_PER_FILE = 100" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"# -----------------------------------------------------------------------------\n", | ||
"# RUN!\n", | ||
"No need to modify anything else in this file, but you may want to inspect the outputs and printouts to check everything is as expected." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from aixd.data.encoders import json_load\n", | ||
"import os\n", | ||
"import pandas as pd\n", | ||
"from aixd.data import Dataset" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"root_folder = os.getcwd()\n", | ||
"dataset_names = [name for name in os.listdir(root_folder) if os.path.isdir(os.path.join(root_folder, name)) and name != NEW_DATASET_NAME]\n", | ||
"print(f\"Found following subfolders: {dataset_names}\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"#### Merge data\n", | ||
"Reads all data and combines them into one dataframe \n", | ||
"(all datasets, design parameters+performance attributes)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def load_df(root_folder, dataset_name):\n", | ||
" # Load old sharded data from pickled dataframes\n", | ||
"\n", | ||
" # DPs\n", | ||
" directory = os.path.join(root_folder, dataset_name, \"design_parameters\")\n", | ||
" df_dp_all = []\n", | ||
"\n", | ||
" for filename in os.listdir(directory):\n", | ||
" if filename.endswith(\".pkl\"):\n", | ||
" filepath = os.path.join(directory, filename)\n", | ||
" df = pd.read_pickle(filepath)\n", | ||
" df_dp_all.append(df)\n", | ||
"\n", | ||
" df_dp_all = pd.concat(df_dp_all, axis=0)\n", | ||
"\n", | ||
" # PAs\n", | ||
" directory = os.path.join(root_folder, dataset_name, \"performance_attributes\")\n", | ||
" df_pa_all = []\n", | ||
"\n", | ||
" for filename in os.listdir(directory):\n", | ||
" if filename.endswith(\".pkl\"):\n", | ||
" filepath = os.path.join(directory, filename)\n", | ||
" df = pd.read_pickle(filepath)\n", | ||
" df_pa_all.append(df)\n", | ||
"\n", | ||
" df_pa_all = pd.concat(df_pa_all, axis=0)\n", | ||
" df_all = pd.merge(df_dp_all, df_pa_all, how=\"inner\", on=[\"uid\"])\n", | ||
" df_all = df_all.drop(columns=[\"uid\"])\n", | ||
" return df_all" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"dfs = []\n", | ||
"for dataset_name in dataset_names:\n", | ||
" df = load_df(root_folder, dataset_name)\n", | ||
" dfs.append(df)\n", | ||
"\n", | ||
"df_all = pd.concat(dfs)\n", | ||
" \n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"Preview the dataframe containing the merged data. \n", | ||
"The last column \"error\" is for internal purposes, not part of the variables." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"df_all" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"#### Dataset object\n", | ||
"\n", | ||
"Restores a Dataset object from one of the datasets' json files\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"picked_dataset_object_path = os.path.join(root_folder, dataset_names[0], \"dataset_object.json\")\n", | ||
"dataset_temp = Dataset.from_dataset_object(picked_dataset_object_path)\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"dataset_new = Dataset( name=NEW_DATASET_NAME,\n", | ||
" root_path=root_folder,\n", | ||
" file_format=\"json\",\n", | ||
" design_par=dataset_temp.design_par,\n", | ||
" perf_attributes=dataset_temp.perf_attributes,\n", | ||
" overwrite= True)\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"dataset_temp.summary_dataobjects()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"Import data from dataframe" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"dataset_new.import_data_from_df(df_all, samples_perfile=NEW_SAMPLES_PER_FILE, flag_fromscratch=True)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "ara-adml2", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.9.20" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |