Skip to content

Commit

Permalink
Merge pull request #72 from hydroshare/71-csv-content-type-support
Browse files Browse the repository at this point in the history
csv content type support
  • Loading branch information
pkdash authored Nov 12, 2024
2 parents 7b7ad2f + 58a4149 commit 8c65a61
Show file tree
Hide file tree
Showing 30 changed files with 3,859 additions and 105 deletions.
1 change: 1 addition & 0 deletions docs/api/csv.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
::: hsclient.hydroshare.CSVAggregation
168 changes: 166 additions & 2 deletions docs/examples/Aggregation_Data_Object_Operations.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
" * Time series\n",
" * Geographic feature\n",
" * Geographic raster\n",
" * Multidimensional NetCDF"
" * Multidimensional NetCDF\n",
" * CSV"
],
"metadata": {
"collapsed": false
Expand Down Expand Up @@ -84,8 +85,9 @@
"* Geographic feature : fiona.Collection\n",
"* Geographic raster : rasterio.DatasetReader\n",
"* Multidimensional NetCDF : xarray.Dataset\n",
"* CSV : pandas.DataFrame\n",
"\n",
"In the following code examples, we are assuming that we have a resource in HydroShare that contains the above four aggregation types. All these aggregations are at the root of the resource. The resource id used in the following code examples is \"a0e0c2e2e5e84e1e9b6b2b2b2b2b2b2b\". You will need to change this resource id to the id of your resource in HydroShare.\n"
"In the following code examples, we are assuming that we have a resource in HydroShare that contains the above five aggregation types. All these aggregations are at the root of the resource. The resource id used in the following code examples is \"a0e0c2e2e5e84e1e9b6b2b2b2b2b2b2b\". You will need to change this resource id to the id of your resource in HydroShare.\n"
],
"metadata": {
"collapsed": false
Expand Down Expand Up @@ -936,6 +938,168 @@
},
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "markdown",
"source": [
"### Loading CSV Data to pandas.DataFrame\n",
"Here we are assuming the CSV aggregation contains a CSV file with the name \"sample.csv\"."
]
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# retrieve the CSV aggregation\n",
"file_path = \"sample.csv\"\n",
"csv_aggr = resource.aggregation(file__path=file_path)"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# show the aggregation type\n",
"print(f\"Aggregation Type:{csv_aggr.metadata.type}\")"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# download the CSV aggregation - these directory paths must exist for hsclient to download and unzip the aggregation zip file\n",
"# Note: These directory paths need to be changed based on where you want to download the aggregation\n",
"download_to = os.path.join(base_working_dir, \"csv_testing\")\n",
"unzip_to = os.path.join(download_to, \"aggr_unzipped\")\n",
"aggr_path = resource.aggregation_download(aggregation=csv_aggr, save_path=download_to, unzip_to=unzip_to)\n",
"print(f\"Downloaded aggregation to:{aggr_path}\")"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# load the CSV aggregation as pandas.DataFrame\n",
"csv_df = csv_aggr.as_data_object(agg_path=aggr_path)"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# show number of rows and columns\n",
"print(f\"Number of data rows:{len(csv_df)}\")\n",
"print(f\"Number of data columns:{len(csv_df.columns)}\")"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# show the first 5 data rows\n",
"print(csv_df.head(5))"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# show the extracted CSV aggregation metadata (table schema)\n",
"table_schema = csv_aggr.metadata.tableSchema\n",
"table = table_schema.table\n",
"print(f\"Number of data rows:{table_schema.rows}\")\n",
"print(f\"Number of data columns:{len(table.columns)}\")\n",
"print(f\"Delimiter:{table_schema.delimiter}\")\n",
"\n",
"# show data column properties\n",
"for col in table.columns:\n",
" print(f\"Column number:{col.column_number}\")\n",
" print(f\"Column title:{col.title}\")\n",
" print(f\"Column description:{col.description}\")\n",
" print(f\"Column data type:{col.datatype}\")\n",
" print(\"-\"*50) "
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "markdown",
"source": "***Editing CSV aggregation using pandas.DataFrame***"
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# drop the last data column - note all editing needs to be in 'inplace' mode\n",
"csv_df.drop(csv_df.columns[-1], axis=1, inplace=True)\n",
"# show the number of data columns after the edit\n",
"print(f\"Number of data columns after edit:{len(csv_df.columns)}\")"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# save the updated CSV aggregation in HydroShare\n",
"# Note this will overwrite the original aggregation - this operation may take a while\n",
"csv_aggr = csv_aggr.save_data_object(resource=resource, agg_path=aggr_path, as_new_aggr=False)\n",
"print(\"Aggregation updated ...\")"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# we can also create a new CSV aggregation in HydroShare using the updated pandas.DataFrame object\n",
"# we first create a new folder in which the new aggregation will be created\n",
"aggr_folder = \"csv_folder\"\n",
"resource.folder_create(folder=aggr_folder)\n",
"\n",
"# this operation may take a while\n",
"csv_aggr = csv_aggr.save_data_object(resource=resource, agg_path=aggr_path, as_new_aggr=True, destination_path=aggr_folder)\n",
"print(\"New CSV aggregation was created ...\")"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# retrieve the updated CSV aggregation to verify the data got updated\n",
"download_to = os.path.join(base_working_dir, \"csv_testing\")\n",
"\n",
"# note the unzip_to directory must exist and be empty\n",
"unzip_to = os.path.join(download_to, \"aggr_unzipped\")\n",
"aggr_path = resource.aggregation_download(aggregation=csv_aggr, save_path=download_to, unzip_to=unzip_to)\n",
"csv_df = csv_aggr.as_data_object(agg_path=aggr_path)\n",
"\n",
"# show the number of data rows and columns\n",
"print(f\"Number of data rows:{len(csv_df)}\")\n",
"print(f\"Number of data columns:{len(csv_df.columns)}\")\n",
"# show the first 5 data rows\n",
"print(csv_df.head(5))"
],
"outputs": [],
"execution_count": null
}
],
"metadata": {
Expand Down
Loading

0 comments on commit 8c65a61

Please sign in to comment.