Switch to Netflix data set (GoogleCloudPlatform#385)

1. Use Netflix data set 2. Pinned versions 3. Add time counter Tested-by: zlq
yiyinglovecoding · Mar 19, 2024 · 8e2fec9 · 8e2fec9
1 parent 1283506
commit 8e2fec9
Showing 1 changed file with 33 additions and 13 deletions.
diff --git a/applications/rag/example_notebooks/rag-kaggle-ray-sql-latest.ipynb b/applications/rag/example_notebooks/rag-kaggle-ray-sql-latest.ipynb
@@ -36,14 +36,14 @@
     "os.environ['KAGGLE_KEY'] = KAGGLE_KEY\n",
     "\n",
     "# Download the zip file to local storage and then extract the desired contents directly to the GKE GCS CSI mounted bucket. The bucket is mounted at the \"/persist-data\" path in the jupyter pod.\n",
-    "!kaggle datasets download -d denizbilginn/google-maps-restaurant-reviews -p ~/data --force\n",
-    "!mkdir /persist-data/google-maps-restaurant-reviews -p\n",
-    "!unzip -o ~/data/google-maps-restaurant-reviews.zip -x \"dataset/*\" \"sepetcioglu_restaurant/*\" -d /persist-data/google-maps-restaurant-reviews"
+    "!kaggle datasets download -d shivamb/netflix-shows -p ~/data --force\n",
+    "!mkdir /persist-data/netflix-shows -p\n",
+    "!unzip -o ~/data/netflix-shows.zip -d /persist-data/netflix-shows"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "id": "050f2c66-b92e-4ca6-a3b7-b7448d066f8e",
    "metadata": {},
    "outputs": [],
@@ -54,7 +54,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "id": "c82cdcad-c74c-4196-9aa0-2e6bb49f4b58",
    "metadata": {},
    "outputs": [],
@@ -115,8 +115,8 @@
     "SENTENCE_TRANSFORMER_MODEL_PATH = SHARED_DATA_BASEPATH + '/' + SENTENCE_TRANSFORMER_MODEL_PATH_NAME + '/snapshots/' + SENTENCE_TRANSFORMER_MODEL_SNAPSHOT # the path where the model is downloaded one time\n",
     "\n",
     "# the dataset has been pre-downloaded to the GCS bucket as part of the notebook in the cell above. Ray workers will find the dataset readily mounted.\n",
-    "SHARED_DATASET_BASE_PATH=\"/data/google-maps-restaurant-reviews/\"\n",
-    "REVIEWS_FILE_NAME=\"reviews.csv\"\n",
+    "SHARED_DATASET_BASE_PATH=\"/data/netflix-shows/\"\n",
+    "REVIEWS_FILE_NAME=\"netflix_titles.csv\"\n",
     "\n",
     "BATCH_SIZE = 100\n",
     "CHUNK_SIZE = 1000 # text chunk sizes which will be converted to vector embeddings\n",
@@ -165,7 +165,13 @@
     "print(ray_ds.schema)\n",
     "\n",
     "# Distributed flat map to extract the raw text fields.\n",
-    "ds_batch = ray_ds.flat_map(lambda row: [{'item': row[\"author_name\"] + \" posted a review with rating: \" + str(row[\"rating\"]) + \" with text: \" + row[\"text\"].replace(\"\\n\", \" \")}])\n",
+    "ds_batch = ray_ds.flat_map(lambda row: [{\n",
+    "    'item': \"This is a \" + str(row[\"type\"]) + \" in \" + str(row[\"country\"]) + \" called \" + str(row[\"title\"]) + \n",
+    "    \" added at \" + str(row[\"date_added\"]) + \" whose director is \" + str(row[\"director\"]) + \n",
+    "    \" and with cast: \" + str(row[\"cast\"]) + \" released at \" + str(row[\"release_year\"]) + \n",
+    "    \". Its rating is: \" + str(row['rating']) + \". Its duration is \" + str(row[\"duration\"]) + \n",
+    "    \". Its description is \" + str(row['description']) + \".\"\n",
+    "}])\n",
     "print(ds_batch.schema)\n",
     "\n",
     "# Distributed map batches to create chunks out of each row, and fetch the vector embeddings by running inference on the sentence transformer\n",
@@ -248,7 +254,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "id": "aeeb7b7a-23d8-4c6a-8165-7ce5516d2a41",
    "metadata": {},
    "outputs": [],
@@ -260,26 +266,29 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": null,
    "id": "7ba6c3ff-a25a-4f4d-b58e-68f7fe7d33df",
    "metadata": {},
    "outputs": [],
    "source": [
+    "import time\n",
+    "\n",
+    "start_time = time.time()\n",
     "job_id = client.submit_job(\n",
     "    entrypoint=\"python test.py\",\n",
     "    # Path to the local directory that contains the entrypoint file.\n",
     "    runtime_env={\n",
     "        \"working_dir\": \"/home/jovyan/test\", # upload the local working directory to ray workers\n",
     "        \"pip\": [\n",
     "                \"langchain==0.1.9\",\n",
-    "                \"transformers\",\n",
+    "                \"transformers==4.38.1\",\n",
     "                \"sentence-transformers==2.5.1\",\n",
     "                \"pyarrow\",\n",
     "                \"datasets==2.18.0\",\n",
     "                \"torch==2.0.1\",\n",
     "                \"cloud-sql-python-connector[pg8000]==1.7.0\",\n",
     "                \"SQLAlchemy==2.0.7\",\n",
-    "                \"huggingface_hub\",\n",
+    "                \"huggingface_hub==0.21.3\",\n",
     "                ],\n",
     "    }\n",
     ")\n",
@@ -294,8 +303,19 @@
     "        prev_status = status\n",
     "    if status.is_terminal():\n",
     "        break\n",
-    "    time.sleep(5)\n"
+    "    time.sleep(1)\n",
+    "end_time = time.time()\n",
+    "job_duration = end_time - start_time\n",
+    "print(f\"Job  completed in {job_duration} seconds.\")"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "98ec6c2d-3295-4f67-9fa0-af6d5708955a",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {