Skip to content

Commit

Permalink
Switch to Netflix data set (GoogleCloudPlatform#385)
Browse files Browse the repository at this point in the history
1. Use Netflix data set
2. Pinned version
3. Add time counter

Tested-by: zlq
  • Loading branch information
blackzlq authored Mar 19, 2024
1 parent 1283506 commit 8e2fec9
Showing 1 changed file with 33 additions and 13 deletions.
46 changes: 33 additions & 13 deletions applications/rag/example_notebooks/rag-kaggle-ray-sql-latest.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,14 @@
"os.environ['KAGGLE_KEY'] = KAGGLE_KEY\n",
"\n",
"# Download the zip file to local storage and then extract the desired contents directly to the GKE GCS CSI mounted bucket. The bucket is mounted at the \"/persist-data\" path in the jupyter pod.\n",
"!kaggle datasets download -d denizbilginn/google-maps-restaurant-reviews -p ~/data --force\n",
"!mkdir /persist-data/google-maps-restaurant-reviews -p\n",
"!unzip -o ~/data/google-maps-restaurant-reviews.zip -x \"dataset/*\" \"sepetcioglu_restaurant/*\" -d /persist-data/google-maps-restaurant-reviews"
"!kaggle datasets download -d shivamb/netflix-shows -p ~/data --force\n",
"!mkdir /persist-data/netflix-shows -p\n",
"!unzip -o ~/data/netflix-shows.zip -d /persist-data/netflix-shows"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"id": "050f2c66-b92e-4ca6-a3b7-b7448d066f8e",
"metadata": {},
"outputs": [],
Expand All @@ -54,7 +54,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"id": "c82cdcad-c74c-4196-9aa0-2e6bb49f4b58",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -115,8 +115,8 @@
"SENTENCE_TRANSFORMER_MODEL_PATH = SHARED_DATA_BASEPATH + '/' + SENTENCE_TRANSFORMER_MODEL_PATH_NAME + '/snapshots/' + SENTENCE_TRANSFORMER_MODEL_SNAPSHOT # the path where the model is downloaded one time\n",
"\n",
"# the dataset has been pre-downloaded to the GCS bucket as part of the notebook in the cell above. Ray workers will find the dataset readily mounted.\n",
"SHARED_DATASET_BASE_PATH=\"/data/google-maps-restaurant-reviews/\"\n",
"REVIEWS_FILE_NAME=\"reviews.csv\"\n",
"SHARED_DATASET_BASE_PATH=\"/data/netflix-shows/\"\n",
"REVIEWS_FILE_NAME=\"netflix_titles.csv\"\n",
"\n",
"BATCH_SIZE = 100\n",
"CHUNK_SIZE = 1000 # text chunk sizes which will be converted to vector embeddings\n",
Expand Down Expand Up @@ -165,7 +165,13 @@
"print(ray_ds.schema)\n",
"\n",
"# Distributed flat map to extract the raw text fields.\n",
"ds_batch = ray_ds.flat_map(lambda row: [{'item': row[\"author_name\"] + \" posted a review with rating: \" + str(row[\"rating\"]) + \" with text: \" + row[\"text\"].replace(\"\\n\", \" \")}])\n",
"ds_batch = ray_ds.flat_map(lambda row: [{\n",
" 'item': \"This is a \" + str(row[\"type\"]) + \" in \" + str(row[\"country\"]) + \" called \" + str(row[\"title\"]) + \n",
" \" added at \" + str(row[\"date_added\"]) + \" whose director is \" + str(row[\"director\"]) + \n",
" \" and with cast: \" + str(row[\"cast\"]) + \" released at \" + str(row[\"release_year\"]) + \n",
" \". Its rating is: \" + str(row['rating']) + \". Its duration is \" + str(row[\"duration\"]) + \n",
" \". Its description is \" + str(row['description']) + \".\"\n",
"}])\n",
"print(ds_batch.schema)\n",
"\n",
"# Distributed map batches to create chunks out of each row, and fetch the vector embeddings by running inference on the sentence transformer\n",
Expand Down Expand Up @@ -248,7 +254,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"id": "aeeb7b7a-23d8-4c6a-8165-7ce5516d2a41",
"metadata": {},
"outputs": [],
Expand All @@ -260,26 +266,29 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"id": "7ba6c3ff-a25a-4f4d-b58e-68f7fe7d33df",
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"\n",
"start_time = time.time()\n",
"job_id = client.submit_job(\n",
" entrypoint=\"python test.py\",\n",
" # Path to the local directory that contains the entrypoint file.\n",
" runtime_env={\n",
" \"working_dir\": \"/home/jovyan/test\", # upload the local working directory to ray workers\n",
" \"pip\": [\n",
" \"langchain==0.1.9\",\n",
" \"transformers\",\n",
" \"transformers==4.38.1\",\n",
" \"sentence-transformers==2.5.1\",\n",
" \"pyarrow\",\n",
" \"datasets==2.18.0\",\n",
" \"torch==2.0.1\",\n",
" \"cloud-sql-python-connector[pg8000]==1.7.0\",\n",
" \"SQLAlchemy==2.0.7\",\n",
" \"huggingface_hub\",\n",
" \"huggingface_hub==0.21.3\",\n",
" ],\n",
" }\n",
")\n",
Expand All @@ -294,8 +303,19 @@
" prev_status = status\n",
" if status.is_terminal():\n",
" break\n",
" time.sleep(5)\n"
" time.sleep(1)\n",
"end_time = time.time()\n",
"job_duration = end_time - start_time\n",
"print(f\"Job completed in {job_duration} seconds.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "98ec6c2d-3295-4f67-9fa0-af6d5708955a",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down

0 comments on commit 8e2fec9

Please sign in to comment.