Skip to content

Commit

Permalink
Merge pull request #72 from hydroshare/71-csv-content-type-support
Browse files Browse the repository at this point in the history
csv content type support
  • Loading branch information
pkdash authored Nov 12, 2024
2 parents 7b7ad2f + 58a4149 commit 8c65a61
Show file tree
Hide file tree
Showing 30 changed files with 3,859 additions and 105 deletions.
1 change: 1 addition & 0 deletions docs/api/csv.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
::: hsclient.hydroshare.CSVAggregation
168 changes: 166 additions & 2 deletions docs/examples/Aggregation_Data_Object_Operations.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
" * Time series\n",
" * Geographic feature\n",
" * Geographic raster\n",
" * Multidimensional NetCDF"
" * Multidimensional NetCDF\n",
" * CSV"
],
"metadata": {
"collapsed": false
Expand Down Expand Up @@ -84,8 +85,9 @@
"* Geographic feature : fiona.Collection\n",
"* Geographic raster : rasterio.DatasetReader\n",
"* Multidimensional NetCDF : xarray.Dataset\n",
"* CSV : pandas.DataFrame\n",
"\n",
"In the following code examples, we are assuming that we have a resource in HydroShare that contains the above four aggregation types. All these aggregations are at the root of the resource. The resource id used in the following code examples is \"a0e0c2e2e5e84e1e9b6b2b2b2b2b2b2b\". You will need to change this resource id to the id of your resource in HydroShare.\n"
"In the following code examples, we are assuming that we have a resource in HydroShare that contains the above five aggregation types. All these aggregations are at the root of the resource. The resource id used in the following code examples is \"a0e0c2e2e5e84e1e9b6b2b2b2b2b2b2b\". You will need to change this resource id to the id of your resource in HydroShare.\n"
],
"metadata": {
"collapsed": false
Expand Down Expand Up @@ -936,6 +938,168 @@
},
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "markdown",
"source": [
"### Loading CSV Data to pandas.DataFrame\n",
"Here we are assuming the CSV aggregation contains a CSV file with the name \"sample.csv\"."
]
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# retrieve the CSV aggregation\n",
"file_path = \"sample.csv\"\n",
"csv_aggr = resource.aggregation(file__path=file_path)"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# show the aggregation type\n",
"print(f\"Aggregation Type:{csv_aggr.metadata.type}\")"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# download the CSV aggregation - these directory paths must exist for hsclient to download and unzip the aggregation zip file\n",
"# Note: These directory paths need to be changed based on where you want to download the aggregation\n",
"download_to = os.path.join(base_working_dir, \"csv_testing\")\n",
"unzip_to = os.path.join(download_to, \"aggr_unzipped\")\n",
"aggr_path = resource.aggregation_download(aggregation=csv_aggr, save_path=download_to, unzip_to=unzip_to)\n",
"print(f\"Downloaded aggregation to:{aggr_path}\")"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# load the CSV aggregation as pandas.DataFrame\n",
"csv_df = csv_aggr.as_data_object(agg_path=aggr_path)"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# show number of rows and columns\n",
"print(f\"Number of data rows:{len(csv_df)}\")\n",
"print(f\"Number of data columns:{len(csv_df.columns)}\")"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# show the first 5 data rows\n",
"print(csv_df.head(5))"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# show the extracted CSV aggregation metadata (table schema)\n",
"table_schema = csv_aggr.metadata.tableSchema\n",
"table = table_schema.table\n",
"print(f\"Number of data rows:{table_schema.rows}\")\n",
"print(f\"Number of data columns:{len(table.columns)}\")\n",
"print(f\"Delimiter:{table_schema.delimiter}\")\n",
"\n",
"# show data column properties\n",
"for col in table.columns:\n",
" print(f\"Column number:{col.column_number}\")\n",
" print(f\"Column title:{col.title}\")\n",
" print(f\"Column description:{col.description}\")\n",
" print(f\"Column data type:{col.datatype}\")\n",
" print(\"-\"*50) "
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "markdown",
"source": "***Editing CSV aggregation using pandas.DataFrame***"
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# drop the last data column - note all editing needs to be in 'inplace' mode\n",
"csv_df.drop(csv_df.columns[-1], axis=1, inplace=True)\n",
"# show the number of data columns after the edit\n",
"print(f\"Number of data columns after edit:{len(csv_df.columns)}\")"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# save the updated CSV aggregation in HydroShare\n",
"# Note this will overwrite the original aggregation - this operation may take a while\n",
"csv_aggr = csv_aggr.save_data_object(resource=resource, agg_path=aggr_path, as_new_aggr=False)\n",
"print(\"Aggregation updated ...\")"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# we can also create a new CSV aggregation in HydroShare using the updated pandas.DataFrame object\n",
"# we first create a new folder in which the new aggregation will be created\n",
"aggr_folder = \"csv_folder\"\n",
"resource.folder_create(folder=aggr_folder)\n",
"\n",
"# this operation may take a while\n",
"csv_aggr = csv_aggr.save_data_object(resource=resource, agg_path=aggr_path, as_new_aggr=True, destination_path=aggr_folder)\n",
"print(\"New CSV aggregation was created ...\")"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# retrieve the updated CSV aggregation to verify the data got updated\n",
"download_to = os.path.join(base_working_dir, \"csv_testing\")\n",
"\n",
"# note the unzip_to directory must exist and be empty\n",
"unzip_to = os.path.join(download_to, \"aggr_unzipped\")\n",
"aggr_path = resource.aggregation_download(aggregation=csv_aggr, save_path=download_to, unzip_to=unzip_to)\n",
"csv_df = csv_aggr.as_data_object(agg_path=aggr_path)\n",
"\n",
"# show the number of data rows and columns\n",
"print(f\"Number of data rows:{len(csv_df)}\")\n",
"print(f\"Number of data columns:{len(csv_df.columns)}\")\n",
"# show the first 5 data rows\n",
"print(csv_df.head(5))"
],
"outputs": [],
"execution_count": null
}
],
"metadata": {
Expand Down
Loading

0 comments on commit 8c65a61

Please sign in to comment.