diff --git a/llama_hub/tools/notebooks/waii.ipynb b/llama_hub/tools/notebooks/waii.ipynb
index 3f2c382cdb..60e4006755 100644
--- a/llama_hub/tools/notebooks/waii.ipynb
+++ b/llama_hub/tools/notebooks/waii.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 1,
    "id": "7f0195c2-4a20-488e-8782-ca5a83488d0d",
    "metadata": {},
    "outputs": [],
@@ -12,7 +12,7 @@
     "waii_tool = WaiiToolSpec(\n",
     "    url=\"https://tweakit.waii.ai/api/\",\n",
     "    # API Key of Waii (not OpenAI API key)\n",
-    "    api_key=\"3a44......\",\n",
+    "    api_key=\"3a...\",\n",
     "    # Which database you want to use, you need add the db connection to Waii first\n",
     "    database_key=\"snowflake://...\"\n",
     ")"
@@ -20,17 +20,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 2,
    "id": "0a79a9fa-e5ff-4242-99a2-08cc85e158a9",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "\"The table 'COLUMNS' contains the most columns. The top 5 tables with the number of columns are 'COLUMNS' with 43 columns, 'TABLES' with 25 columns, and the remaining tables have fewer than 25 columns.\""
+       "\"The table 'TABLES' contains the most columns. The top 5 tables with the number of columns are 'TABLES' with 25 columns, followed by 'SHOW' with 5 columns.\""
       ]
      },
-     "execution_count": 8,
+     "execution_count": 2,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -47,7 +47,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 3,
    "id": "b259d9cd-bbb8-4fff-a4ce-80fb0f3a1a10",
    "metadata": {},
    "outputs": [],
@@ -56,12 +56,12 @@
     "from llama_index.agent import OpenAIAgent\n",
     "from llama_index.llms import OpenAI\n",
     "\n",
-    "agent = OpenAIAgent.from_tools(waii_tool.to_tool_list(), llm=OpenAI(model='gpt-4-1106-preview'), verbose=False)"
+    "agent = OpenAIAgent.from_tools(waii_tool.to_tool_list(), llm=OpenAI(model='gpt-4-1106-preview'), verbose=True)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 4,
    "id": "094b7878-59d6-4f12-b357-4f0d254953da",
    "metadata": {},
    "outputs": [
@@ -69,6 +69,17 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "STARTING TURN 1\n",
+      "---------------\n",
+      "\n",
+      "=== Calling Function ===\n",
+      "Calling function: get_answer with args: {\"ask\":\"What are the top 3 countries with the highest number of car factories?\"}\n",
+      "Got output: Japan, Germany, and the USA.\n",
+      "========================\n",
+      "\n",
+      "STARTING TURN 2\n",
+      "---------------\n",
+      "\n",
       "The top 3 countries with the highest number of car factories are Japan, Germany, and the USA.\n"
      ]
     }
@@ -80,7 +91,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 5,
    "id": "233deb3d-547b-49a2-89a4-28fa1abea9dc",
    "metadata": {},
    "outputs": [
@@ -88,11 +99,32 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Here are the car factories in the top 3 countries with the most number of car factories:\n",
+      "STARTING TURN 1\n",
+      "---------------\n",
+      "\n",
+      "=== Calling Function ===\n",
+      "Calling function: get_answer with args: {\"ask\": \"What are the car factories in Japan?\"}\n",
+      "Got output: Nissan Motors, Honda, Mazda, Subaru, and Toyota are car factories in Japan.\n",
+      "========================\n",
+      "\n",
+      "=== Calling Function ===\n",
+      "Calling function: get_answer with args: {\"ask\": \"What are the car factories in Germany?\"}\n",
+      "Got output: Volkswagen, BMW, Daimler Benz, and Opel are car factories in Germany.\n",
+      "========================\n",
+      "\n",
+      "=== Calling Function ===\n",
+      "Calling function: get_answer with args: {\"ask\": \"What are the car factories in the USA?\"}\n",
+      "Got output: amc, gm, ford, and chrysler are car factories in the USA.\n",
+      "========================\n",
+      "\n",
+      "STARTING TURN 2\n",
+      "---------------\n",
+      "\n",
+      "Here are some of the car factories in the top 3 countries with the most number of car factories:\n",
       "\n",
-      "- **Japan**: Nissan, Honda, Mazda, Subaru, and Toyota.\n",
-      "- **Germany**: Volkswagen, BMW, Daimler Benz, and Opel.\n",
-      "- **USA**: AMC, GM, Ford, and Chrysler.\n"
+      "- In Japan: Nissan Motors, Honda, Mazda, Subaru, and Toyota.\n",
+      "- In Germany: Volkswagen, BMW, Daimler Benz, and Opel.\n",
+      "- In the USA: AMC, GM, Ford, and Chrysler.\n"
      ]
     }
    ],
@@ -102,7 +134,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 6,
    "id": "90c2ba4d-6ac4-4cbb-93b0-e03a8d015042",
    "metadata": {},
    "outputs": [
@@ -110,11 +142,25 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "The top 3 longest running queries and their durations are:\n",
+      "STARTING TURN 1\n",
+      "---------------\n",
+      "\n",
+      "=== Calling Function ===\n",
+      "Calling function: get_answer with args: {\"ask\":\"What are the top 3 longest running queries and their durations?\"}\n",
+      "Got output: The top 3 longest running queries and their durations are as follows:\n",
+      "1. Query ID: 01b08971-0001-e7a0-0022-ba8700c28122, Duration: 365143 milliseconds\n",
+      "2. Query ID: 01b08aca-0001-e830-0022-ba8700c2a09e, Duration: 190413 milliseconds\n",
+      "3. Query ID: 01b08ac4-0001-e7ed-0022-ba8700c2951a, Duration: 170837 milliseconds\n",
+      "========================\n",
+      "\n",
+      "STARTING TURN 2\n",
+      "---------------\n",
+      "\n",
+      "The top 3 longest running queries and their respective durations are:\n",
       "\n",
-      "1. Query ID: 01b0683d-0001-e41d-0022-ba8700baf82a, with a duration of 61,034 milliseconds.\n",
-      "2. Query ID: 01b07576-0001-e5c4-0022-ba8700bc1d62, with a duration of 33,450 milliseconds.\n",
-      "3. Query ID: 01b06494-0001-e50e-0022-ba8700bad9a2, with a duration of 25,301 milliseconds.\n"
+      "1. Query ID: 01b08971-0001-e7a0-0022-ba8700c28122, with a duration of 365,143 milliseconds.\n",
+      "2. Query ID: 01b08aca-0001-e830-0022-ba8700c2a09e, with a duration of 190,413 milliseconds.\n",
+      "3. Query ID: 01b08ac4-0001-e7ed-0022-ba8700c2951a, with a duration of 170,837 milliseconds.\n"
      ]
     }
    ],
@@ -125,7 +171,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 7,
    "id": "0706d166-d94f-40fc-adae-15b1379f501b",
    "metadata": {},
    "outputs": [
@@ -133,20 +179,65 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "The analysis of the second-longest running query reveals the following:\n",
+      "STARTING TURN 1\n",
+      "---------------\n",
+      "\n",
+      "=== Calling Function ===\n",
+      "Calling function: performance_analyze with args: {\"query_uuid\":\"01b08aca-0001-e830-0022-ba8700c2a09e\"}\n",
+      "Got output: {\n",
+      "  \"summary\": [\n",
+      "    \"The 'aggregate' operator with functions sum(sum(sum(sum_internal(ss.ss_sales_price, count(*))))) and grouping keys i.i_category, i.i_class, i.i_brand, i.i_product_name, d.d_year, d.d_qoy, d.d_moy, s.s_store_id is the most time-consuming, with an overall percentage of 0.31 of execution time and a significant amount of bytes spilled to local storage (4774297600 bytes).\",\n",
+      "    \"The 'windowfunction' operator with function rank() over (partition by i.i_category order by sum(sum(sum(sum_internal(ss.ss_sales_price, count(*))))) desc nulls first) also contributes to the high execution time with an overall percentage of 0.2 and a large amount of bytes spilled to local storage (15166328832 bytes).\",\n",
+      "    \"The 'tablescan' operator on the store_sales table is scanning a large amount of data (3514269696 bytes) without utilizing cache, which indicates a potential area for optimization.\"\n",
+      "  ],\n",
+      "  \"recommendations\": [\n",
+      "    \"Consider pushing down relevant filters to the 'tablescan' operator on the store_sales table to reduce the bytes scanned. This could be done by analyzing the join conditions and where clause to identify any filters that can be applied earlier in the query execution.\",\n",
+      "    \"Optimize the 'aggregate' operator by reviewing the necessity of all grouping keys. If some keys are not essential for the final result, removing them could reduce the complexity of aggregation and the amount of data spilled to local storage.\",\n",
+      "    \"For the 'windowfunction' operator, ensure that the input size is minimized by optimizing preceding operations, such as 'aggregate', to reduce the data volume before ranking. This could involve pushing down filters or optimizing the grouping strategy.\"\n",
+      "  ],\n",
+      "  \"query_text\": \"WITH sales_data AS ( SELECT i.i_category, i.i_class, i.i_brand, i.i_product_name, d.d_year, d.d_qoy, d.d_moy, s.s_store_id, SUM(ss.ss_sales_price) AS total_sales FROM tweakit_perf_db.sample_tpcds.store_sales AS ss INNER JOIN tweakit_perf_db.sample_tpcds.item AS i ON ss.ss_item_sk = i.i_item_sk INNER JOIN tweakit_perf_db.sample_tpcds.date_dim AS d ON ss.ss_sold_date_sk = d.d_date_sk INNER JOIN tweakit_perf_db.sample_tpcds.store AS s ON ss.ss_store_sk = s.s_store_sk WHERE d.d_year = 2000 GROUP BY i.i_category, i.i_class, i.i_brand, i.i_product_name, d.d_year, d.d_qoy, d.d_moy, s.s_store_id ), ranked_data AS ( SELECT *, RANK() OVER (PARTITION BY i_category ORDER BY total_sales DESC) AS rank FROM sales_data ) SELECT * FROM ranked_data WHERE rank <= 100 ORDER BY i_category, i_class, i_brand, i_product_name, d_year, d_qoy, d_moy, s_store_id, total_sales, rank limit 100\",\n",
+      "  \"execution_time_ms\": 190094,\n",
+      "  \"compilation_time_ms\": 341\n",
+      "}\n",
+      "========================\n",
+      "\n",
+      "STARTING TURN 2\n",
+      "---------------\n",
+      "\n",
+      "The analysis of the 2nd-longest running query reveals the following insights:\n",
       "\n",
-      "**Summary**:\n",
-      "- The most time-consuming part of the query is the table scan operator on the 'store_sales' table, which accounts for approximately 79% of the execution time. It scans a total of 3,064,958,976 bytes and emits 54,704,849 output rows. The columns scanned are 'ss_sold_date_sk', 'ss_customer_sk', 'ss_store_sk', 'ss_quantity', 'ss_sales_price', and 'ss_ext_discount_amt'.\n",
+      "- The most time-consuming operator is the 'aggregate' operator, which uses functions like `sum()` and grouping keys such as `i.i_category`, `i.i_class`, `i.i_brand`, `i.i_product_name`, `d.d_year`, `d.d_qoy`, `d.d_moy`, and `s.s_store_id`. It accounts for 0.31 of the execution time and has spilled a significant amount of bytes (4,774,297,600 bytes) to local storage.\n",
+      "- The 'windowfunction' operator with the `rank()` function also contributes to the high execution time, with an overall percentage of 0.2 and a large amount of bytes spilled to local storage (15,166,328,832 bytes).\n",
+      "- The 'tablescan' operator on the `store_sales` table is scanning a large amount of data (3,514,269,696 bytes) without utilizing cache, indicating a potential area for optimization.\n",
       "\n",
-      "**Recommendations**:\n",
-      "1. It is suggested to join the 'store_sales' table with the 'date_dim' table on 'ss_sold_date_sk = d_date_sk' and apply a filter for 'd_year = 2001' during the initial table scan. This would reduce the number of rows processed and avoid scanning the entire 'store_sales' table.\n",
-      "2. The common table expressions (CTEs) 'sales_2001', 'monthly_spend_increase', and 'spend_increase_percentage' should be merged into a single query block using conditional aggregation and window functions. This would reduce the number of scans and intermediate result sets.\n",
-      "3. The 'ss_quantity' column should be removed from the GROUP BY clause in the 'sales_2001' CTE, as it is not necessary for the final result set and may be causing unnecessary row expansion.\n",
+      "Based on these findings, the recommendations for optimization are:\n",
       "\n",
-      "**Execution Details**:\n",
-      "- The execution time of the query was 33,068 milliseconds, with a compilation time of 404 milliseconds.\n",
+      "1. Push down relevant filters to the 'tablescan' operator on the `store_sales` table to reduce the bytes scanned. This could be achieved by analyzing the join conditions and where clause to identify any filters that can be applied earlier in the query execution.\n",
+      "2. Optimize the 'aggregate' operator by reviewing the necessity of all grouping keys. Removing non-essential keys could reduce the complexity of aggregation and the amount of data spilled to local storage.\n",
+      "3. Minimize the input size for the 'windowfunction' operator by optimizing preceding operations, such as 'aggregate', to reduce the data volume before ranking. This could involve pushing down filters or optimizing the grouping strategy.\n",
       "\n",
-      "The query text is quite complex, involving multiple CTEs and window functions. The recommendations provided aim to optimize the query by reducing the amount of data scanned and processed, and by simplifying the query structure.\n"
+      "The query text is as follows:\n",
+      "```sql\n",
+      "WITH sales_data AS (\n",
+      "  SELECT i.i_category, i.i_class, i.i_brand, i.i_product_name, d.d_year, d.d_qoy, d.d_moy, s.s_store_id, SUM(ss.ss_sales_price) AS total_sales\n",
+      "  FROM tweakit_perf_db.sample_tpcds.store_sales AS ss\n",
+      "  INNER JOIN tweakit_perf_db.sample_tpcds.item AS i ON ss.ss_item_sk = i.i_item_sk\n",
+      "  INNER JOIN tweakit_perf_db.sample_tpcds.date_dim AS d ON ss.ss_sold_date_sk = d.d_date_sk\n",
+      "  INNER JOIN tweakit_perf_db.sample_tpcds.store AS s ON ss.ss_store_sk = s.s_store_sk\n",
+      "  WHERE d.d_year = 2000\n",
+      "  GROUP BY i.i_category, i.i_class, i.i_brand, i.i_product_name, d.d_year, d.d_qoy, d.d_moy, s.s_store_id\n",
+      "), ranked_data AS (\n",
+      "  SELECT *, RANK() OVER (PARTITION BY i_category ORDER BY total_sales DESC) AS rank\n",
+      "  FROM sales_data\n",
+      ")\n",
+      "SELECT *\n",
+      "FROM ranked_data\n",
+      "WHERE rank <= 100\n",
+      "ORDER BY i_category, i_class, i_brand, i_product_name, d_year, d_qoy, d_moy, s_store_id, total_sales, rank\n",
+      "LIMIT 100\n",
+      "```\n",
+      "\n",
+      "The execution time of the query was 190,094 milliseconds, and the compilation time was 341 milliseconds.\n"
      ]
     }
    ],
@@ -156,7 +247,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 8,
    "id": "47530eba-24be-42d9-b1a0-fa1af28934f7",
    "metadata": {},
    "outputs": [
@@ -164,13 +255,28 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "STARTING TURN 1\n",
+      "---------------\n",
+      "\n",
+      "=== Calling Function ===\n",
+      "Calling function: diff_query with args: {\"previous_query\":\"SELECT\\n    employee_id,\\n    department,\\n    salary,\\n    AVG(salary) OVER (PARTITION BY department) AS department_avg_salary,\\n    salary - AVG(salary) OVER (PARTITION BY department) AS diff_from_avg\\nFROM\\n    employees;\",\"current_query\":\"SELECT\\n    employee_id,\\n    department,\\n    salary,\\n    MAX(salary) OVER (PARTITION BY department) AS department_max_salary,\\n    salary - AVG(salary) OVER (PARTITION BY department) AS diff_from_avg\\nFROM\\n    employees;\\nLIMIT 100;\"}\n",
+      "Got output: The query retrieves the employee ID, department, salary, maximum salary within the department, and the difference between the employee's salary and the average salary within the department. The result set is limited to 100 rows.\n",
+      "========================\n",
+      "\n",
+      "STARTING TURN 2\n",
+      "---------------\n",
+      "\n",
       "The difference between the two queries is as follows:\n",
       "\n",
-      "1. The first query calculates the average salary for each department and the difference between each employee's salary and the department's average salary. It does not have a row limit.\n",
+      "1. In the first query, the fourth column calculates the average salary for each department (`department_avg_salary`) using the `AVG()` window function partitioned by the department.\n",
       "\n",
-      "2. The second query calculates the maximum salary for each department instead of the average salary. It also includes the difference between each employee's salary and the department's average salary, similar to the first query. Additionally, this query has a LIMIT clause, restricting the results to the first 100 rows.\n",
+      "2. In the second query, the fourth column calculates the maximum salary for each department (`department_max_salary`) using the `MAX()` window function partitioned by the department.\n",
       "\n",
-      "In summary, the key differences are the use of MAX(salary) instead of AVG(salary) for the department salary comparison and the inclusion of a LIMIT clause in the second query.\n"
+      "Additionally, the second query has a `LIMIT 100` clause at the end, which restricts the result set to the first 100 rows.\n",
+      "\n",
+      "Here's a summary of the differences:\n",
+      "- The first query calculates the average salary per department, while the second query calculates the maximum salary per department.\n",
+      "- The second query limits the result set to 100 rows, whereas the first query does not impose any limit on the number of rows returned.\n"
      ]
     }
    ],
@@ -202,7 +308,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 9,
    "id": "7a222df4-d00e-4fe8-be8e-9efdb43f1462",
    "metadata": {},
    "outputs": [
@@ -210,16 +316,31 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "The database is a comprehensive collection of schemas covering various domains:\n",
+      "STARTING TURN 1\n",
+      "---------------\n",
+      "\n",
+      "=== Calling Function ===\n",
+      "Calling function: describe_dataset with args: {\"ask\":\"Can you summarize the dataset for me?\"}\n",
+      "Got output: The dataset consists of multiple schemas, each containing tables related to different domains. The schemas include \"FLIGHT\" which contains tables related to airlines, airports, and flights, \"STUDENT_TRANSCRIPTS_TRACKING\" which contains tables related to student information, courses, degree programs, departments, semesters, and transcripts, \"WORLD\" which contains tables related to cities, countries, and languages, \"INFORMATION_SCHEMA\" which provides information about the database objects and metadata in the WAII database, \"EMPLOYEE_HIRE_EVALUATION\" which contains tables related to employee information, evaluations, hiring evaluations, and shop details, and \"PETS\" which contains tables related to pet ownership and student information.\n",
+      "\n",
+      "Each schema has a summary describing its purpose and the type of information it contains. Additionally, there are common questions associated with each schema, which provide insights into the types of queries that can be answered using the dataset. The common tables within each schema are also described, providing information about the specific tables and their purpose within the schema.\n",
+      "\n",
+      "Overall, the dataset covers a wide range of domains including airlines, airports, flights, student records, courses, degrees, departments, cities, countries, languages, database metadata, employee information, evaluations, hiring evaluations, shop details, pet ownership, and student information. It can be used to analyze and generate reports on various aspects of these domains.\n",
+      "========================\n",
+      "\n",
+      "STARTING TURN 2\n",
+      "---------------\n",
+      "\n",
+      "The dataset is a comprehensive collection of information spread across multiple schemas, each dedicated to a specific domain:\n",
       "\n",
-      "- **FLIGHT**: Contains information about airlines, airports, and flight details.\n",
-      "- **STUDENT_TRANSCRIPTS_TRACKING**: Manages student records, enrollment, and academic information, including courses, degree programs, departments, and semesters.\n",
-      "- **WORLD**: Provides data on cities, countries, populations, and languages spoken.\n",
-      "- **INFORMATION_SCHEMA**: Offers metadata about the database objects within the WAII database, including roles, classes, columns, databases, functions, load history, privileges, procedures, constraints, schemata, sequences, and more.\n",
-      "- **EMPLOYEE_HIRE_EVALUATION**: Holds data on employee information, evaluations, hiring evaluations, and shop performance.\n",
-      "- **PETS**: Tracks the relationship between students and their pets, along with managing student records.\n",
+      "- **FLIGHT**: This schema includes tables related to airlines, airports, and flights, providing data that could be used for analyzing air travel and operations.\n",
+      "- **STUDENT_TRANSCRIPTS_TRACKING**: This schema contains tables with student information, courses, degree programs, departments, semesters, and transcripts, which can be used for educational administration and tracking academic progress.\n",
+      "- **WORLD**: This schema features tables related to cities, countries, and languages, offering a dataset for geographical and linguistic analysis.\n",
+      "- **INFORMATION_SCHEMA**: This schema provides metadata about the database objects within the WAII database, useful for database management and schema exploration.\n",
+      "- **EMPLOYEE_HIRE_EVALUATION**: This schema includes tables related to employee information, evaluations, hiring evaluations, and shop details, which can be used for human resources management and performance analysis.\n",
+      "- **PETS**: This schema contains tables related to pet ownership and student information, which could be used for studies on pet ownership demographics and related student activities.\n",
       "\n",
-      "Each schema is designed to cater to specific data management needs, ranging from flight operations to student and employee records, as well as pet ownership details.\n"
+      "Each schema is accompanied by a summary that outlines its purpose and the nature of the data it holds. There are also common questions associated with each schema to guide users in understanding the types of queries that can be performed. The dataset is versatile and can be utilized to conduct analyses and generate reports across various fields such as air travel, education, geography, database management, human resources, and pet ownership studies.\n"
      ]
     }
    ],
@@ -230,57 +351,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
-   "id": "aa425c72-e8b5-46b9-8e53-c648e9b00ee9",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Here are some example questions you can ask about the different schemas within the dataset:\n",
-      "\n",
-      "**For the FLIGHT schema:**\n",
-      "- What are the busiest airports in terms of flight departures?\n",
-      "- Which airline operates the most international routes?\n",
-      "- How many flights are there between two specific cities?\n",
-      "\n",
-      "**For the STUDENT_TRANSCRIPTS_TRACKING schema:**\n",
-      "- Which students have the highest GPA in their respective departments?\n",
-      "- How many students are enrolled in each degree program?\n",
-      "- What is the average number of courses taken by students each semester?\n",
-      "\n",
-      "**For the WORLD schema:**\n",
-      "- Which country has the highest population density?\n",
-      "- What are the official languages spoken in a specific country?\n",
-      "- How many countries have a population greater than 100 million?\n",
-      "\n",
-      "**For the INFORMATION_SCHEMA schema:**\n",
-      "- What tables exist in a specific schema?\n",
-      "- Which columns in a table have constraints?\n",
-      "- How many stored procedures are defined in the database?\n",
-      "\n",
-      "**For the EMPLOYEE_HIRE_EVALUATION schema:**\n",
-      "- Which employees received the highest evaluation scores?\n",
-      "- How many shops have more than 10 employees?\n",
-      "- What is the average duration of employment across all employees?\n",
-      "\n",
-      "**For the PETS schema:**\n",
-      "- How many students own more than one pet?\n",
-      "- What is the most common type of pet among students?\n",
-      "- Are there any students who own exotic pets?\n",
-      "\n",
-      "These questions can help you explore and analyze the data within each schema of the database.\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(agent.chat(\"Give me questions which I can ask about this dataset\"))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 11,
    "id": "d6bdf837-241a-4637-a07e-a73fafd52a07",
    "metadata": {},
    "outputs": [
@@ -288,7 +359,52 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "The translated Snowflake SQL query from the given PySpark query is as follows:\n",
+      "STARTING TURN 1\n",
+      "---------------\n",
+      "\n",
+      "=== Calling Function ===\n",
+      "Calling function: transcode with args: {\"instruction\":\"Translate this PySpark query to Snowflake SQL.\",\"source_dialect\":\"pyspark\",\"source_query\":\"from pyspark.sql import SparkSession\\nfrom pyspark.sql.functions import avg, lag, lead, round\\nfrom pyspark.sql.window import Window\\n\\nspark = SparkSession.builder.appName(\\\"yearly_car_analysis\\\").getOrCreate()\\n\\nyearly_avg_hp = cars_data.groupBy(\\\"year\\\").agg(avg(\\\"horsepower\\\").alias(\\\"avg_horsepower\\\"))\\n\\nwindowSpec = Window.orderBy(\\\"year\\\")\\n\\nyearly_comparisons = yearly_avg_hp.select(\\n    \\\"year\\\",\\n    \\\"avg_horsepower\\\",\\n    lag(\\\"avg_horsepower\\\").over(windowSpec).alias(\\\"prev_year_hp\\\"),\\n    lead(\\\"avg_horsepower\\\").over(windowSpec).alias(\\\"next_year_hp\\\")\\n)\\n\\nfinal_result = yearly_comparisons.select(\\n    \\\"year\\\",\\n    \\\"avg_horsepower\\\",\\n    round(\\n        (yearly_comparisons.avg_horsepower - yearly_comparisons.prev_year_hp) / \\n        yearly_comparisons.prev_year_hp * 100, 2\\n    ).alias(\\\"percentage_diff_prev_year\\\"),\\n    round(\\n        (yearly_comparisons.next_year_hp - yearly_comparisons.avg_horsepower) / \\n        yearly_comparisons.avg_horsepower * 100, 2\\n    ).alias(\\\"percentage_diff_next_year\\\")\\n).orderBy(\\\"year\\\")\\n\\nfinal_result.show()\",\"target_dialect\":\"snowflake\"}\n",
+      "Got output: WITH yearly_avg_hp AS (\n",
+      "  SELECT\n",
+      "    year,\n",
+      "    AVG(horsepower) AS avg_horsepower\n",
+      "  FROM waii.car.cars_data\n",
+      "  GROUP BY\n",
+      "    year\n",
+      "),\n",
+      "\n",
+      "yearly_comparisons AS (\n",
+      "  SELECT\n",
+      "    year,\n",
+      "    avg_horsepower,\n",
+      "    LAG(avg_horsepower) OVER (ORDER BY year) AS prev_year_hp,\n",
+      "    LEAD(avg_horsepower) OVER (ORDER BY year) AS next_year_hp\n",
+      "  FROM yearly_avg_hp\n",
+      ")\n",
+      "\n",
+      "SELECT\n",
+      "  year,\n",
+      "  avg_horsepower,\n",
+      "  ROUND((\n",
+      "    (\n",
+      "      avg_horsepower - prev_year_hp\n",
+      "    ) / NULLIF(prev_year_hp, 0) * 100\n",
+      "  ), 2) AS percentage_diff_prev_year,\n",
+      "  ROUND((\n",
+      "    (\n",
+      "      next_year_hp - avg_horsepower\n",
+      "    ) / NULLIF(avg_horsepower, 0) * 100\n",
+      "  ), 2) AS percentage_diff_next_year\n",
+      "FROM yearly_comparisons\n",
+      "ORDER BY\n",
+      "  year\n",
+      "\n",
+      "========================\n",
+      "\n",
+      "STARTING TURN 2\n",
+      "---------------\n",
+      "\n",
+      "The translated Snowflake SQL query from the provided PySpark code is as follows:\n",
       "\n",
       "```sql\n",
       "WITH yearly_avg_hp AS (\n",
       "  SELECT\n",
       "    year,\n",
       "    AVG(horsepower) AS avg_horsepower\n",
       "  FROM waii.car.cars_data\n",
       "  GROUP BY\n",
       "    year\n",
       "),\n",
       "\n",
       "yearly_comparisons AS (\n",
       "  SELECT\n",
       "    year,\n",
       "    avg_horsepower,\n",
       "    LAG(avg_horsepower) OVER (ORDER BY year) AS prev_year_hp,\n",
       "    LEAD(avg_horsepower) OVER (ORDER BY year) AS next_year_hp\n",
       "  FROM yearly_avg_hp\n",
       ")\n",
       "\n",
       "SELECT\n",
       "  year,\n",
       "  avg_horsepower,\n",
       "  ROUND((\n",
       "    (\n",
       "      avg_horsepower - prev_year_hp\n",
       "    ) / NULLIF(prev_year_hp, 0) * 100\n",
       "  ), 2) AS percentage_diff_prev_year,\n",
       "  ROUND((\n",
       "    (\n",
       "      next_year_hp - avg_horsepower\n",
       "    ) / NULLIF(avg_horsepower, 0) * 100\n",
       "  ), 2) AS percentage_diff_next_year\n",
       "FROM yearly_comparisons\n",
       "ORDER BY\n",
@@ -327,7 +443,7 @@
       "  year\n",
       "```\n",
       "\n",
-      "This query calculates the average horsepower for each year, the previous year's average horsepower, and the next year's average horsepower. It then computes the percentage difference from the previous year and the next year for the average horsepower. The results are ordered by year.\n"
+      "This query calculates the average horsepower (`avg_horsepower`) for each year from the `cars_data` table, then uses window functions to get the previous and next year's average horsepower. It finally computes the percentage difference from the previous and next year's average horsepower and orders the result by year.\n"
      ]
     }
    ],
@@ -367,6 +483,14 @@
     "\"\"\"\n",
     "print(agent.chat(f\"translate this pyspark query {q}, to Snowflake\"))"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e94a9d08-1562-47c1-8983-77270bcca21e",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
 "metadata": {