{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"id": "7f0195c2-4a20-488e-8782-ca5a83488d0d",
"metadata": {},
"outputs": [],
"source": [
"from llama_hub.tools.waii import WaiiToolSpec\n",
"\n",
"waii_tool = WaiiToolSpec(\n",
"    url=\"https://tweakit.waii.ai/api/\",\n",
"    # Waii API key (not an OpenAI API key)\n",
"    api_key=\"3a44......\",\n",
"    # Which database to use; add the database connection to Waii first\n",
"    database_key=\"snowflake://...\"\n",
")"
]
},
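{
"cell_type": "markdown",
"id": "1c2d3e4f-0001-4a01-8a01-aaaaaaaaaaa1",
"metadata": {},
"source": [
"Hardcoding credentials in a notebook is easy to leak. Below is a minimal sketch of pulling them from the environment instead; `WAII_API_KEY` and `WAII_DATABASE_KEY` are hypothetical variable names chosen for this example, not names the library requires."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1c2d3e4f-0001-4a01-8a01-aaaaaaaaaaa2",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# WAII_API_KEY / WAII_DATABASE_KEY are hypothetical env var names for this sketch\n",
"waii_tool = WaiiToolSpec(\n",
"    url=\"https://tweakit.waii.ai/api/\",\n",
"    api_key=os.environ[\"WAII_API_KEY\"],\n",
"    database_key=os.environ[\"WAII_DATABASE_KEY\"]\n",
")"
]
},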
{
"cell_type": "code",
"execution_count": 8,
"id": "0a79a9fa-e5ff-4242-99a2-08cc85e158a9",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\"The table 'COLUMNS' contains the most columns. The top 5 tables with the number of columns are 'COLUMNS' with 43 columns, 'TABLES' with 25 columns, and the remaining tables have fewer than 25 columns.\""
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from llama_index import VectorStoreIndex\n",
"\n",
"# Use as a data loader: load data into an index, then query it\n",
"documents = waii_tool.load_data('Get all tables with their number of columns')\n",
"query_engine = VectorStoreIndex.from_documents(documents).as_query_engine()\n",
"\n",
"query_engine.query('Which table contains the most columns? Tell me the top 5 tables with their number of columns.').response"
]
},
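{
"cell_type": "markdown",
"id": "1c2d3e4f-0002-4a02-8a02-aaaaaaaaaaa1",
"metadata": {},
"source": [
"The loader returns standard llama_index `Document` objects, so they can be inspected (or routed into any other index) before querying. A quick sketch of peeking at what came back, assuming the usual `Document.text` attribute:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1c2d3e4f-0002-4a02-8a02-aaaaaaaaaaa2",
"metadata": {},
"outputs": [],
"source": [
"# Peek at the loaded documents before indexing them\n",
"print(len(documents), \"documents loaded\")\n",
"print(documents[0].text[:200])"
]
},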
{
"cell_type": "code",
"execution_count": 23,
"id": "b259d9cd-bbb8-4fff-a4ce-80fb0f3a1a10",
"metadata": {},
"outputs": [],
"source": [
"# Use as a tool: initialize an OpenAI agent with the Waii tool list\n",
"from llama_index.agent import OpenAIAgent\n",
"from llama_index.llms import OpenAI\n",
"\n",
"agent = OpenAIAgent.from_tools(waii_tool.to_tool_list(), llm=OpenAI(model='gpt-4-1106-preview'), verbose=False)"
]
},
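{
"cell_type": "markdown",
"id": "1c2d3e4f-0003-4a03-8a03-aaaaaaaaaaa1",
"metadata": {},
"source": [
"Before chatting, it can help to see exactly which tools the agent received. A small sketch that prints each tool's name and description via the standard llama_index tool metadata:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1c2d3e4f-0003-4a03-8a03-aaaaaaaaaaa2",
"metadata": {},
"outputs": [],
"source": [
"# List the tools the Waii spec exposes to the agent\n",
"for tool in waii_tool.to_tool_list():\n",
"    print(tool.metadata.name, \"-\", tool.metadata.description)"
]
},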
{
"cell_type": "code",
"execution_count": 24,
"id": "094b7878-59d6-4f12-b357-4f0d254953da",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The top 3 countries with the highest number of car factories are Japan, Germany, and the USA.\n"
]
}
],
"source": [
"# Ask simple questions\n",
"print(agent.chat(\"Give me the top 3 countries with the most car factories\"))"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "233deb3d-547b-49a2-89a4-28fa1abea9dc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Here are the car factories in the top 3 countries with the most number of car factories:\n",
"\n",
"- **Japan**: Nissan, Honda, Mazda, Subaru, and Toyota.\n",
"- **Germany**: Volkswagen, BMW, Daimler Benz, and Opel.\n",
"- **USA**: AMC, GM, Ford, and Chrysler.\n"
]
}
],
"source": [
"print(agent.chat(\"What are the car factories in these countries?\"))"
]
},
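{
"cell_type": "markdown",
"id": "1c2d3e4f-0004-4a04-8a04-aaaaaaaaaaa1",
"metadata": {},
"source": [
"The follow-up above works because `OpenAIAgent` keeps conversation state, so \"these countries\" resolves against the previous answer. A sketch of inspecting that state, assuming the agent exposes its memory via the `chat_history` property (messages carry `role` and `content`):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1c2d3e4f-0004-4a04-8a04-aaaaaaaaaaa2",
"metadata": {},
"outputs": [],
"source": [
"# Inspect the conversation state the agent has accumulated so far\n",
"for message in agent.chat_history:\n",
"    print(message.role, \"->\", str(message.content)[:80])"
]
},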
{
"cell_type": "code",
"execution_count": 26,
"id": "90c2ba4d-6ac4-4cbb-93b0-e03a8d015042",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The top 3 longest running queries and their durations are:\n",
"\n",
"1. Query ID: 01b0683d-0001-e41d-0022-ba8700baf82a, with a duration of 61,034 milliseconds.\n",
"2. Query ID: 01b07576-0001-e5c4-0022-ba8700bc1d62, with a duration of 33,450 milliseconds.\n",
"3. Query ID: 01b06494-0001-e50e-0022-ba8700bad9a2, with a duration of 25,301 milliseconds.\n"
]
}
],
"source": [
"# Do performance analysis\n",
"print(agent.chat(\"Give me the top 3 longest-running queries and their durations.\"))"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "0706d166-d94f-40fc-adae-15b1379f501b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The analysis of the second-longest running query reveals the following:\n",
"\n",
"**Summary**:\n",
"- The most time-consuming part of the query is the table scan operator on the 'store_sales' table, which accounts for approximately 79% of the execution time. It scans a total of 3,064,958,976 bytes and emits 54,704,849 output rows. The columns scanned are 'ss_sold_date_sk', 'ss_customer_sk', 'ss_store_sk', 'ss_quantity', 'ss_sales_price', and 'ss_ext_discount_amt'.\n",
"\n",
"**Recommendations**:\n",
"1. It is suggested to join the 'store_sales' table with the 'date_dim' table on 'ss_sold_date_sk = d_date_sk' and apply a filter for 'd_year = 2001' during the initial table scan. This would reduce the number of rows processed and avoid scanning the entire 'store_sales' table.\n",
"2. The common table expressions (CTEs) 'sales_2001', 'monthly_spend_increase', and 'spend_increase_percentage' should be merged into a single query block using conditional aggregation and window functions. This would reduce the number of scans and intermediate result sets.\n",
"3. The 'ss_quantity' column should be removed from the GROUP BY clause in the 'sales_2001' CTE, as it is not necessary for the final result set and may be causing unnecessary row expansion.\n",
"\n",
"**Execution Details**:\n",
"- The execution time of the query was 33,068 milliseconds, with a compilation time of 404 milliseconds.\n",
"\n",
"The query text is quite complex, involving multiple CTEs and window functions. The recommendations provided aim to optimize the query by reducing the amount of data scanned and processed, and by simplifying the query structure.\n"
]
}
],
"source": [
"print(agent.chat(\"analyze the 2nd-longest running query\"))"
]
},
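{
"cell_type": "markdown",
"id": "1c2d3e4f-0005-4a05-8a05-aaaaaaaaaaa1",
"metadata": {},
"source": [
"To make the first recommendation concrete, here is a minimal sketch of pushing the `d_year = 2001` filter into the `store_sales` scan, assuming the TPC-DS-style column names mentioned in the analysis (this SQL is illustrative, not produced by Waii):\n",
"\n",
"```sql\n",
"SELECT\n",
"    ss.ss_customer_sk,\n",
"    ss.ss_store_sk,\n",
"    ss.ss_quantity,\n",
"    ss.ss_sales_price,\n",
"    ss.ss_ext_discount_amt\n",
"FROM store_sales AS ss\n",
"JOIN date_dim AS d\n",
"    ON ss.ss_sold_date_sk = d.d_date_sk\n",
"WHERE d.d_year = 2001\n",
"```"
]
},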
{
"cell_type": "code",
"execution_count": 28,
"id": "47530eba-24be-42d9-b1a0-fa1af28934f7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The difference between the two queries is as follows:\n",
"\n",
"1. The first query calculates the average salary for each department and the difference between each employee's salary and the department's average salary. It does not have a row limit.\n",
"\n",
"2. The second query calculates the maximum salary for each department instead of the average salary. It also includes the difference between each employee's salary and the department's average salary, similar to the first query. Additionally, this query has a LIMIT clause, restricting the results to the first 100 rows.\n",
"\n",
"In summary, the key differences are the use of MAX(salary) instead of AVG(salary) for the department salary comparison and the inclusion of a LIMIT clause in the second query.\n"
]
}
],
"source": [
"# Diff two queries\n",
"previous_query = \"\"\"\n",
"SELECT\n",
"    employee_id,\n",
"    department,\n",
"    salary,\n",
"    AVG(salary) OVER (PARTITION BY department) AS department_avg_salary,\n",
"    salary - AVG(salary) OVER (PARTITION BY department) AS diff_from_avg\n",
"FROM\n",
"    employees;\n",
"\"\"\"\n",
"current_query = \"\"\"\n",
"SELECT\n",
"    employee_id,\n",
"    department,\n",
"    salary,\n",
"    MAX(salary) OVER (PARTITION BY department) AS department_max_salary,\n",
"    salary - AVG(salary) OVER (PARTITION BY department) AS diff_from_avg\n",
"FROM\n",
"    employees\n",
"LIMIT 100;\n",
"\"\"\"\n",
"print(agent.chat(f\"tell me the difference between {previous_query} and {current_query}\"))"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "7a222df4-d00e-4fe8-be8e-9efdb43f1462",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The database is a comprehensive collection of schemas covering various domains:\n",
"\n",
"- **FLIGHT**: Contains information about airlines, airports, and flight details.\n",
"- **STUDENT_TRANSCRIPTS_TRACKING**: Manages student records, enrollment, and academic information, including courses, degree programs, departments, and semesters.\n",
"- **WORLD**: Provides data on cities, countries, populations, and languages spoken.\n",
"- **INFORMATION_SCHEMA**: Offers metadata about the database objects within the WAII database, including roles, classes, columns, databases, functions, load history, privileges, procedures, constraints, schemata, sequences, and more.\n",
"- **EMPLOYEE_HIRE_EVALUATION**: Holds data on employee information, evaluations, hiring evaluations, and shop performance.\n",
"- **PETS**: Tracks the relationship between students and their pets, along with managing student records.\n",
"\n",
"Each schema is designed to cater to specific data management needs, ranging from flight operations to student and employee records, as well as pet ownership details.\n"
]
}
],
"source": [
"# Describe dataset\n",
"print(agent.chat(\"Summarize the dataset\"))"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "aa425c72-e8b5-46b9-8e53-c648e9b00ee9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Here are some example questions you can ask about the different schemas within the dataset:\n",
"\n",
"**For the FLIGHT schema:**\n",
"- What are the busiest airports in terms of flight departures?\n",
"- Which airline operates the most international routes?\n",
"- How many flights are there between two specific cities?\n",
"\n",
"**For the STUDENT_TRANSCRIPTS_TRACKING schema:**\n",
"- Which students have the highest GPA in their respective departments?\n",
"- How many students are enrolled in each degree program?\n",
"- What is the average number of courses taken by students each semester?\n",
"\n",
"**For the WORLD schema:**\n",
"- Which country has the highest population density?\n",
"- What are the official languages spoken in a specific country?\n",
"- How many countries have a population greater than 100 million?\n",
"\n",
"**For the INFORMATION_SCHEMA schema:**\n",
"- What tables exist in a specific schema?\n",
"- Which columns in a table have constraints?\n",
"- How many stored procedures are defined in the database?\n",
"\n",
"**For the EMPLOYEE_HIRE_EVALUATION schema:**\n",
"- Which employees received the highest evaluation scores?\n",
"- How many shops have more than 10 employees?\n",
"- What is the average duration of employment across all employees?\n",
"\n",
"**For the PETS schema:**\n",
"- How many students own more than one pet?\n",
"- What is the most common type of pet among students?\n",
"- Are there any students who own exotic pets?\n",
"\n",
"These questions can help you explore and analyze the data within each schema of the database.\n"
]
}
],
"source": [
"print(agent.chat(\"Give me questions which I can ask about this dataset\"))"
]
},
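{
"cell_type": "markdown",
"id": "1c2d3e4f-0006-4a06-8a06-aaaaaaaaaaa1",
"metadata": {},
"source": [
"Any of the suggested questions can be fed straight back to the agent; for example, one from the WORLD schema list (output not captured here):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1c2d3e4f-0006-4a06-8a06-aaaaaaaaaaa2",
"metadata": {},
"outputs": [],
"source": [
"# Try one of the suggested questions from the list above\n",
"print(agent.chat(\"Which country has the highest population density?\"))"
]
},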
{
"cell_type": "code",
"execution_count": 31,
"id": "d6bdf837-241a-4637-a07e-a73fafd52a07",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The translated Snowflake SQL query from the given PySpark query is as follows:\n",
"\n",
"```sql\n",
"WITH yearly_avg_hp AS (\n",
"    SELECT\n",
"        year,\n",
"        AVG(horsepower) AS avg_horsepower\n",
"    FROM waii.car.cars_data\n",
"    GROUP BY\n",
"        year\n",
"),\n",
"\n",
"yearly_comparisons AS (\n",
"    SELECT\n",
"        year,\n",
"        avg_horsepower,\n",
"        LAG(avg_horsepower) OVER (ORDER BY year) AS prev_year_hp,\n",
"        LEAD(avg_horsepower) OVER (ORDER BY year) AS next_year_hp\n",
"    FROM yearly_avg_hp\n",
")\n",
"\n",
"SELECT\n",
"    year,\n",
"    avg_horsepower,\n",
"    ROUND((\n",
"        (\n",
"            avg_horsepower - prev_year_hp\n",
"        ) / NULLIF(prev_year_hp, 0) * 100\n",
"    ), 2) AS percentage_diff_prev_year,\n",
"    ROUND((\n",
"        (\n",
"            next_year_hp - avg_horsepower\n",
"        ) / NULLIF(avg_horsepower, 0) * 100\n",
"    ), 2) AS percentage_diff_next_year\n",
"FROM yearly_comparisons\n",
"ORDER BY\n",
"    year\n",
"```\n",
"\n",
"This query calculates the average horsepower for each year, the previous year's average horsepower, and the next year's average horsepower. It then computes the percentage difference from the previous year and the next year for the average horsepower. The results are ordered by year.\n"
]
}
],
"source": [
"q = \"\"\"\n",
"from pyspark.sql import SparkSession\n",
"from pyspark.sql.functions import avg, lag, lead, round\n",
"from pyspark.sql.window import Window\n",
"\n",
"spark = SparkSession.builder.appName(\"yearly_car_analysis\").getOrCreate()\n",
"\n",
"yearly_avg_hp = cars_data.groupBy(\"year\").agg(avg(\"horsepower\").alias(\"avg_horsepower\"))\n",
"\n",
"windowSpec = Window.orderBy(\"year\")\n",
"\n",
"yearly_comparisons = yearly_avg_hp.select(\n",
"    \"year\",\n",
"    \"avg_horsepower\",\n",
"    lag(\"avg_horsepower\").over(windowSpec).alias(\"prev_year_hp\"),\n",
"    lead(\"avg_horsepower\").over(windowSpec).alias(\"next_year_hp\")\n",
")\n",
"\n",
"final_result = yearly_comparisons.select(\n",
"    \"year\",\n",
"    \"avg_horsepower\",\n",
"    round(\n",
"        (yearly_comparisons.avg_horsepower - yearly_comparisons.prev_year_hp) /\n",
"        yearly_comparisons.prev_year_hp * 100, 2\n",
"    ).alias(\"percentage_diff_prev_year\"),\n",
"    round(\n",
"        (yearly_comparisons.next_year_hp - yearly_comparisons.avg_horsepower) /\n",
"        yearly_comparisons.avg_horsepower * 100, 2\n",
"    ).alias(\"percentage_diff_next_year\")\n",
").orderBy(\"year\")\n",
"\n",
"final_result.show()\n",
"\"\"\"\n",
"print(agent.chat(f\"translate this PySpark query to Snowflake: {q}\"))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "myenv",
"language": "python",
"name": "myenv"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}