From cdb52fc85537805a34a1fd75dabb96bfb952afa2 Mon Sep 17 00:00:00 2001 From: CI User Date: Thu, 4 Jan 2024 20:05:56 +0000 Subject: [PATCH] Update documentation --- docusaurus/404.html | 4 +- docusaurus/CLA/index.html | 4 +- docusaurus/assets/js/8439a13f.c361c36e.js | 1 + docusaurus/assets/js/8439a13f.c6e535b4.js | 1 - docusaurus/assets/js/beadeff4.2e3dfba8.js | 1 + docusaurus/assets/js/beadeff4.63d91510.js | 1 - docusaurus/assets/js/e506623f.83c9d7bd.js | 1 - docusaurus/assets/js/e506623f.9ae97683.js | 1 + ...n.4f37edbd.js => runtime~main.68f5e22c.js} | 2 +- docusaurus/blog/archive/index.html | 4 +- docusaurus/blog/atom.xml | 31 ++++++++------- docusaurus/blog/factorization/index.html | 4 +- docusaurus/blog/iamgraphviz/index.html | 4 +- docusaurus/blog/index.html | 37 +++++++++--------- .../blog/kuzu-0.0.10-release/index.html | 4 +- .../blog/kuzu-0.0.11-release/index.html | 4 +- .../blog/kuzu-0.0.12-release/index.html | 4 +- docusaurus/blog/kuzu-0.0.2-release/index.html | 4 +- docusaurus/blog/kuzu-0.0.3-release/index.html | 4 +- docusaurus/blog/kuzu-0.0.4-release/index.html | 4 +- docusaurus/blog/kuzu-0.0.5-release/index.html | 4 +- docusaurus/blog/kuzu-0.0.6-release/index.html | 4 +- docusaurus/blog/kuzu-0.0.7-release/index.html | 4 +- docusaurus/blog/kuzu-0.0.8-release/index.html | 4 +- docusaurus/blog/kuzu-0.0.9-release/index.html | 4 +- docusaurus/blog/kuzu-0.1.0-release/index.html | 4 +- .../blog/kuzu-pyg-remote-backend/index.html | 4 +- docusaurus/blog/kuzuexplorer/index.html | 4 +- docusaurus/blog/llms-graphs-part-1/index.html | 39 ++++++++++--------- docusaurus/blog/meet-kuzu/index.html | 4 +- docusaurus/blog/page/2/index.html | 4 +- docusaurus/blog/page/3/index.html | 4 +- docusaurus/blog/page/4/index.html | 4 +- docusaurus/blog/rss.xml | 31 ++++++++------- docusaurus/blog/tags/index.html | 4 +- docusaurus/blog/tags/internals/index.html | 4 +- docusaurus/blog/tags/release/index.html | 4 +- .../blog/tags/release/page/2/index.html | 4 +- .../blog/tags/release/page/3/index.html | 4 +- docusaurus/blog/tags/use-case/index.html | 37 +++++++++--------- docusaurus/blog/tags/vision/index.html | 4 +- docusaurus/blog/wcoj/index.html | 4 +- .../index.html | 4 +- docusaurus/category/query-clauses/index.html | 4 +- docusaurus/client-apis/c/index.html | 4 +- docusaurus/client-apis/cli/index.html | 4 +- docusaurus/client-apis/cpp-api/index.html | 4 +- docusaurus/client-apis/cpp-api/udf/index.html | 4 +- .../client-apis/external/NET/index.html | 4 +- docusaurus/client-apis/index.html | 4 +- docusaurus/client-apis/java/index.html | 4 +- docusaurus/client-apis/nodejs/index.html | 4 +- docusaurus/client-apis/python/index.html | 4 +- docusaurus/client-apis/rust/index.html | 4 +- docusaurus/cypher/configuration/index.html | 4 +- docusaurus/cypher/copy/index.html | 4 +- .../cypher/data-definition/alter/index.html | 4 +- .../data-definition/create-table/index.html | 4 +- .../cypher/data-definition/drop/index.html | 4 +- docusaurus/cypher/data-definition/index.html | 4 +- .../create/index.html | 4 +- .../delete/index.html | 4 +- .../data-manipulation-clauses/index.html | 4 +- .../merge/index.html | 4 +- .../read-after-update/index.html | 4 +- .../data-manipulation-clauses/set/index.html | 4 +- docusaurus/cypher/data-types/blob/index.html | 4 +- docusaurus/cypher/data-types/date/index.html | 4 +- docusaurus/cypher/data-types/index.html | 4 +- .../cypher/data-types/interval/index.html | 4 +- docusaurus/cypher/data-types/list/index.html | 4 +- docusaurus/cypher/data-types/map/index.html | 4 +- 
docusaurus/cypher/data-types/node/index.html | 4 +- docusaurus/cypher/data-types/null/index.html | 4 +- docusaurus/cypher/data-types/path/index.html | 4 +- docusaurus/cypher/data-types/rel/index.html | 4 +- .../cypher/data-types/serial/index.html | 4 +- .../cypher/data-types/string/index.html | 4 +- .../cypher/data-types/struct/index.html | 4 +- .../cypher/data-types/timestamp/index.html | 4 +- docusaurus/cypher/data-types/union/index.html | 4 +- .../aggregate-functions/index.html | 4 +- .../expressions/blob-functions/index.html | 4 +- .../expressions/case-expression/index.html | 4 +- .../cypher/expressions/casting/index.html | 4 +- .../comparison-operators/index.html | 4 +- .../expressions/date-functions/index.html | 4 +- docusaurus/cypher/expressions/index.html | 4 +- .../expressions/interval-functions/index.html | 4 +- .../expressions/list-functions/index.html | 4 +- .../expressions/logical-operators/index.html | 4 +- .../expressions/map-functions/index.html | 4 +- .../expressions/node-rel-functions/index.html | 4 +- .../expressions/null-operators/index.html | 4 +- .../expressions/numeric-functions/index.html | 4 +- .../expressions/path_functions/index.html | 4 +- .../expressions/pattern-matching/index.html | 4 +- .../expressions/struct-functions/index.html | 4 +- .../expressions/text-functions/index.html | 4 +- .../timestamp-functions/index.html | 4 +- .../expressions/union-functions/index.html | 4 +- docusaurus/cypher/index.html | 4 +- docusaurus/cypher/macro/index.html | 4 +- .../cypher/query-clauses/call/index.html | 4 +- .../query-clauses/example-database/index.html | 4 +- .../cypher/query-clauses/limit/index.html | 4 +- .../cypher/query-clauses/load_from/index.html | 4 +- .../cypher/query-clauses/match/index.html | 4 +- .../query-clauses/optional-match/index.html | 4 +- .../cypher/query-clauses/order-by/index.html | 4 +- .../cypher/query-clauses/return/index.html | 4 +- .../cypher/query-clauses/skip/index.html | 4 +- .../cypher/query-clauses/union/index.html | 4 +- .../cypher/query-clauses/unwind/index.html | 4 +- .../cypher/query-clauses/where/index.html | 4 +- .../cypher/query-clauses/with/index.html | 4 +- docusaurus/cypher/subquery/index.html | 4 +- docusaurus/cypher/transaction/index.html | 4 +- docusaurus/data-export/csv-export/index.html | 4 +- docusaurus/data-export/index.html | 4 +- .../data-export/parquet-export/index.html | 4 +- docusaurus/data-import/csv-import/index.html | 4 +- docusaurus/data-import/index.html | 4 +- docusaurus/data-import/npy-import/index.html | 4 +- .../data-import/parquet-import/index.html | 4 +- .../development/building-kuzu/index.html | 4 +- .../database-internal/datatype/index.html | 4 +- .../database-internal/execution/index.html | 4 +- .../development/database-internal/index.html | 4 +- .../database-internal/vector/index.html | 4 +- .../performance-debugging/index.html | 4 +- .../development/testing-framework/index.html | 4 +- docusaurus/getting-started/c/index.html | 4 +- docusaurus/getting-started/cli/index.html | 4 +- docusaurus/getting-started/cpp/index.html | 4 +- docusaurus/getting-started/index.html | 4 +- docusaurus/getting-started/java/index.html | 4 +- docusaurus/getting-started/nodejs/index.html | 4 +- docusaurus/getting-started/os/index.html | 4 +- docusaurus/getting-started/python/index.html | 4 +- docusaurus/getting-started/rust/index.html | 4 +- docusaurus/index.html | 4 +- docusaurus/installation/index.html | 4 +- docusaurus/kuzuexplorer/index.html | 4 +- .../kuzuexplorer/schema-panel/index.html | 4 +- 
.../kuzuexplorer/settings-panel/index.html | 4 +- .../kuzuexplorer/shell-panel/index.html | 4 +- docusaurus/markdown-page/index.html | 4 +- docusaurus/search/index.html | 4 +- 149 files changed, 368 insertions(+), 363 deletions(-) create mode 100644 docusaurus/assets/js/8439a13f.c361c36e.js delete mode 100644 docusaurus/assets/js/8439a13f.c6e535b4.js create mode 100644 docusaurus/assets/js/beadeff4.2e3dfba8.js delete mode 100644 docusaurus/assets/js/beadeff4.63d91510.js delete mode 100644 docusaurus/assets/js/e506623f.83c9d7bd.js create mode 100644 docusaurus/assets/js/e506623f.9ae97683.js rename docusaurus/assets/js/{runtime~main.4f37edbd.js => runtime~main.68f5e22c.js} (98%) diff --git a/docusaurus/404.html b/docusaurus/404.html index 7caa1e847..b6733613f 100644 --- a/docusaurus/404.html +++ b/docusaurus/404.html @@ -11,7 +11,7 @@ - + @@ -33,7 +33,7 @@

Page Not Found

We could not find what you were looking for.

Please contact the owner of the site that linked you to the original URL and let them know their link is broken.

- + \ No newline at end of file diff --git a/docusaurus/CLA/index.html b/docusaurus/CLA/index.html index dd90ab061..ebc7fc7b6 100644 --- a/docusaurus/CLA/index.html +++ b/docusaurus/CLA/index.html @@ -11,7 +11,7 @@ - + @@ -33,7 +33,7 @@

Contributor Agreement

Thank you for your interest in contributing to Xiyang Feng's, Guodong Jin's, Chang Liu's, Ziyi Chen's, and Semih Salihoğlu's Kùzu ("We" or "Us").

The purpose of this contributor agreement ("Agreement") is to clarify and document the rights granted by contributors to Us.

How to use this Contributor Agreement

If You are an employee and have created the Contribution as part of Your employment, You need to have Your employer approve this Agreement. If You do not own the Copyright in the entire work of authorship, any other author of the Contribution should also sign this – in any event, please contact Us at contact@kuzudb.com.

1. Definitions

"You" means the individual Copyright owner who Submits a Contribution to Us.

"Legal Entity" means an entity that is not a natural person.

"Affiliate" means any other Legal Entity that controls, is controlled by, or under common control with that Legal Entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such Legal Entity, whether by contract or otherwise, (ii) ownership of fifty percent (50%) or more of the outstanding shares or securities that vote to elect the management or other persons who direct such Legal Entity or (iii) beneficial ownership of such entity.

"Contribution" means any original work of authorship, including any original modifications or additions to an existing work of authorship, Submitted by You to Us, in which You own the Copyright.

"Copyright" means all rights protecting works of authorship, including copyright, moral and neighboring rights, as appropriate, for the full term of their existence.

"Material" means the software or documentation made available by Us to third parties. When this Agreement covers more than one software project, the Material means the software or documentation to which the Contribution was Submitted. After You Submit the Contribution, it may be included in the Material.

"Submit" means any act by which a Contribution is transferred to Us by You by means of tangible or intangible media, including but not limited to electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, Us, but excluding any transfer that is conspicuously marked or otherwise designated in writing by You as "Not a Contribution."

"Documentation" means any non-software portion of a Contribution.

2. License grant

Subject to the terms and conditions of this Agreement, You hereby grant to Us a worldwide, royalty-free, Exclusive, perpetual and irrevocable (except as stated in Section 8.1) license, with the right to transfer an unlimited number of non-exclusive licenses or to grant sublicenses to third parties, under the Copyright covering the Contribution to use the Contribution by all means, including, but not limited to:

  • publish the Contribution,
  • modify the Contribution,
  • prepare derivative works based upon or containing the Contribution and/or to combine the Contribution with other Materials,
  • reproduce the Contribution in original or modified form,
  • distribute, to make the Contribution available to the public, display and publicly perform the Contribution in original or modified form.

2.2 Moral rights

Moral Rights remain unaffected to the extent they are recognized and not waivable by applicable law. Notwithstanding, You may add your name to the attribution mechanism customarily used in the Materials you Contribute to, such as the header of the source code files of Your Contribution, and We will respect this attribution when using Your Contribution.

Upon such grant of rights to Us, We immediately grant to You a worldwide, royalty-free, non-exclusive, perpetual and irrevocable license, with the right to transfer an unlimited number of non-exclusive licenses or to grant sublicenses to third parties, under the Copyright covering the Contribution to use the Contribution by all means, including, but not limited to:

  • publish the Contribution,
  • modify the Contribution,
  • prepare derivative works based upon or containing the Contribution and/or to combine the Contribution with other Materials,
  • reproduce the Contribution in original or modified form,
  • distribute, to make the Contribution available to the public, display and publicly perform the Contribution in original or modified form.

This license back is limited to the Contribution and does not provide any rights to the Material.

3. Patents

3.1 Patent license

Subject to the terms and conditions of this Agreement You hereby grant to Us and to recipients of Materials distributed by Us a worldwide, royalty-free, non-exclusive, perpetual and irrevocable (except as stated in Section 3.2) patent license, with the right to transfer an unlimited number of non-exclusive licenses or to grant sublicenses to third parties, to make, have made, use, sell, offer for sale, import and otherwise transfer the Contribution and the Contribution in combination with any Material (and portions of such combination). This license applies to all patents owned or controlled by You, whether already acquired or hereafter acquired, that would be infringed by making, having made, using, selling, offering for sale, importing or otherwise transferring of Your Contribution(s) alone or by combination of Your Contribution(s) with any Material.

3.2 Revocation of patent license

You reserve the right to revoke the patent license stated in section 3.1 if We make any infringement claim that is targeted at your Contribution and not asserted for a Defensive Purpose. An assertion of claims of the Patents shall be considered for a "Defensive Purpose" if the claims are asserted against an entity that has filed, maintained, threatened, or voluntarily participated in a patent infringement lawsuit against Us or any of Our licensees.

4. Disclaimer

THE CONTRIBUTION IS PROVIDED "AS IS". MORE PARTICULARLY, ALL EXPRESS OR IMPLIED WARRANTIES INCLUDING, WITHOUT LIMITATION, ANY IMPLIED WARRANTY OF SATISFACTORY QUALITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT ARE EXPRESSLY DISCLAIMED BY YOU TO US AND BY US TO YOU. TO THE EXTENT THAT ANY SUCH WARRANTIES CANNOT BE DISCLAIMED, SUCH WARRANTY IS LIMITED IN DURATION AND EXTENT TO THE MINIMUM PERIOD AND EXTENT PERMITTED BY LAW.

5. Consequential damage waiver

TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT WILL YOU OR WE BE LIABLE FOR ANY LOSS OF PROFITS, LOSS OF ANTICIPATED SAVINGS, LOSS OF DATA, INDIRECT, SPECIAL, INCIDENTAL, CONSEQUENTIAL AND EXEMPLARY DAMAGES ARISING OUT OF THIS AGREEMENT REGARDLESS OF THE LEGAL OR EQUITABLE THEORY (CONTRACT, TORT OR OTHERWISE) UPON WHICH THE CLAIM IS BASED.

6. Approximation of disclaimer and damage waiver

IF THE DISCLAIMER AND DAMAGE WAIVER MENTIONED IN SECTION 4. AND SECTION 5. CANNOT BE GIVEN LEGAL EFFECT UNDER APPLICABLE LOCAL LAW, REVIEWING COURTS SHALL APPLY LOCAL LAW THAT MOST CLOSELY APPROXIMATES AN ABSOLUTE WAIVER OF ALL CIVIL OR CONTRACTUAL LIABILITY IN CONNECTION WITH THE CONTRIBUTION.

7. Term

7.1 This Agreement shall come into effect upon Your acceptance of the terms and conditions.

7.3 In the event of a termination of this Agreement, Sections 4, 5, 6, 7 and 8 shall survive such termination and shall remain in full force thereafter. For the avoidance of doubt, Free and Open Source Software (sub)licenses that have already been granted for Contributions at the date of the termination shall remain in full force after the termination of this Agreement.

8. Miscellaneous

8.1 This Agreement sets out the entire agreement between You and Us for Your Contributions to Us and overrides all other agreements or understandings.

8.2 In case of Your death, this Agreement shall continue with Your heirs. In case of more than one heir, all heirs must exercise their rights through a commonly authorized person.

8.3 If any provision of this Agreement is found void and unenforceable, such provision will be replaced to the extent possible with a provision that comes closest to the meaning of the original provision and that is enforceable. The terms and conditions set forth in this Agreement shall apply notwithstanding any failure of essential purpose of this Agreement or any limited remedy to the maximum extent possible under law.

8.4 You agree to notify Us of any facts or circumstances of which You become aware that would make this Agreement inaccurate in any respect.

- + \ No newline at end of file diff --git a/docusaurus/assets/js/8439a13f.c361c36e.js b/docusaurus/assets/js/8439a13f.c361c36e.js new file mode 100644 index 000000000..1151e9aa8 --- /dev/null +++ b/docusaurus/assets/js/8439a13f.c361c36e.js @@ -0,0 +1 @@

# RAG Using Structured Data: Overview & Important Questions

*Semih Salihoğlu (CEO of Kùzu Inc & Associate Prof. at UWaterloo), January 4, 2024. Tags: use-case.*

During the holiday season, I did some reading on LLMs and specifically on the techniques that use LLMs together with graph databases and knowledge graphs. If you are new to the area like me, the amount of activity on this topic on social media as well as in research publications may have intimidated you. If so, you're exactly my target audience for this new blog post series I am starting. My goals are two-fold:

1. *Overview the area*: I want to present what I learned with a simple and consistent terminology, and at a more technical depth than you might find in other blog posts, similar to the depth I aim for when preparing a lecture. I will link to many high-quality and technically satisfying pieces of content (mainly papers, since the area is very researchy).
2. *Overview important future work*: I want to cover several important future works in the space. I don't necessarily mean work for research contributions; I also mean simple approaches to experiment with if you are building question answering (Q&A) applications using LLMs and graph technology.

This post covers the topic of retrieval augmented generation (RAG) using structured data. Then, in a follow-up post, I will cover RAG using unstructured data, where I will also mention a few ways people are building RAG-based Q&A systems that use both structured and unstructured data.

:::tip TL;DR: The key takeaways from this post are:
- **RAG overview**: RAG is a technique to fill the knowledge gap of LLMs using private data. RAG systems use private structured records stored in a database and/or unstructured data in text files.
- **Impressive simplicity and effectiveness of developing a natural language interface over your database using LLMs**: In the pre-LLM era, the amount of engineering effort to develop a pipeline that delivered a natural language interface over your database was *immense*. The hard problem was to teach a model to *speak* SQL, Cypher, or SPARQL. This contrasts sharply with the simplicity of developing similar pipelines now, because LLMs already "speak" these languages. The hard task now is for *developers to learn how to prompt LLMs* to get correct database queries. Furthermore, there is evidence that LLMs, if prompted correctly, will generate a decent proportion of queries with impressive accuracy.
- **Lack of work that studies LLMs' ability to generate Cypher or SPARQL**: Most technically deep work on understanding LLMs' ability to generate accurate high-level queries is on SQL. We need more work understanding the behavior of LLMs on the query languages of GDBMSs (like Cypher or SPARQL), specifically on recursive and union-of-join queries.
- **Studying the effects of data modeling (normalization, views, graph modeling) on the accuracy of LLM-generated queries is important**: Many people are studying heuristics for prompting LLMs, focusing on the syntax and structure of providing the schema and on the selection of examples in the prompt. An important and under-studied problem is the effect of data modeling choices on the accuracy of the queries generated by LLMs. I point to [one interesting paper](https://arxiv.org/pdf/2311.07509.pdf) in this space and raise several questions related to normalization and the use of views in relational modeling, and comparisons with graph modeling approaches.
:::

## Killer App: Retrieval Augmented Generation

Let's review the killer application of LLMs in enterprises. The application is ultimately Q&A over private enterprise data.
Think of a chatbot to which you can ask natural language questions ($Q_{NL}$), such as: "Who is our top paying customer from Waterloo?" or "What are the data privacy regulations in Canada we need to comply with?", and get back natural language answers ($A_{NL}$). LLMs, out of the box, cannot answer these questions because they have a *knowledge gap*. For example, LLMs never had any access to your sales records when they were trained. Therefore, they need to retrieve or be provided with extra information from private data sources of the enterprise.

### A note on the term RAG

There seems to be tremendous interest in building systems that combine a traditional information retrieval component, e.g., one that looks up some documents from an index, with a natural language generator component, such as an LLM. The term for such systems is *Retrieval Augmented Generation* (RAG). The term was coined in [this paper](https://arxiv.org/pdf/2005.11401.pdf) to refer to the method of fine-tuning an LLM with additional information, i.e., using this additional data to train a new variant of the LLM. The original usage form in the paper is "RAG models". Nowadays the term is used in a variety of ways, such as "RAG system", "RAG-based system", "RAG does X", or "Building RAG with Y". RAG often does not refer to fine-tuning LLMs any more. Instead, it refers to providing LLMs with private data along with the question to fix the knowledge gap. Even systems that simply use an LLM to convert a $Q_{NL}$ to a SQL or Cypher query and simply return the results of the query are called "RAG systems" in some documentation. I will use the term in this broader sense.

You can build RAG-based Q&A systems by using structured and/or unstructured data.
The high-level views of these systems look like this:

*[Figure: high-level views of RAG systems that use structured and unstructured data]*

## RAG Using Structured Data: Text-to-High-level-Query

*Note: If you are familiar with how to develop RAG systems with LangChain and LlamaIndex, you can directly skip to the "[How Good Are LLMs in Generating High-Level Queries?](#how-good-are-llms-in-generating-high-level-queries)" part, which reflects on the reading I did on RAG using structured data.*

### Overview

Many blog posts and several papers concern Q&A systems that simply convert $Q_{NL}$ to a high-level query language, such as SQL, Cypher, or SPARQL, using an LLM. The figure below describes the overall approach:

*[Figure: the text-to-high-level-query pipeline]*

$Q_{NL}$, the schema of a database, and optionally some example (natural language question, high-level query) pairs, are given to the LLM as a prompt. The terms "no shot", "one shot", or "few shot" refer to the number of examples provided in the prompt. Depending on the underlying database, the schema may contain the columns of relational tables and their descriptions, or the labels of nodes and edges of a graph database. Using $Q_{NL}$, the database schema, and optionally some examples, the LLM generates a database query, such as SQL or Cypher. The system runs this query against the DBMS and returns the query result, or, using the LLM again, converts the query result back into a natural language answer $A_{NL}$.

**Let us pause here to appreciate one thing:** For many decades, the database community has studied the problem of converting $Q_{NL}$ to SQL (aka "text-to-SQL"). Here is a good recent [survey paper](https://link.springer.com/article/10.1007/s00778-022-00776-8) that covers only the deep network-based approaches, and [a more extensive survey/book](https://www.nowpublishers.com/article/Details/DBS-078) on the broader topic of natural language interfaces to databases. Neither of these surveys covers any work that directly uses LLMs such as GPT models, which are quite recent developments.
Take any of the work covered in these surveys and you'll find an approach that requires significant engineering to build the pipeline shown in the above figure. There exist several pre-LLM text-to-SQL systems (e.g., [ATHENA](https://www.vldb.org/pvldb/vol9/p1209-saha.pdf) or [BELA](https://download.hrz.tu-darmstadt.de/pub/FB20/Dekanat/Publikationen/UKP/76500354.pdf)). For example, most of the pre-LLM approaches that use deep learning require hard work *to teach a model how to "speak" SQL* using large corpora of tables and (question, query) examples, such as [WikiSQL](https://arxiv.org/abs/1709.00103) or [Spider](https://github.com/taoyds/spider). People had to solve and glue together solutions to many technical problems, such as parsing the question, entity detection, synonym finding, and string similarity, among others. Post-LLM approaches require *none* of these efforts because LLMs, such as GPT-4, already speak SQL, Cypher, and SPARQL out of the box, having been exposed to them in their pretraining. The hard problem nowadays is for developers *to learn how to prompt LLMs* so that LLMs generate correct queries; I'll say more about this problem below. In contrast, building the above pipeline requires much less effort, as I'll show next.

### Simplicity of Developing RAG Systems: LangChain and LlamaIndex

If you have been following the developments in the LLM space, you will not be surprised to hear that nowadays people build Q&A systems that convert $Q_{NL}$ to a high-level query language using two common tools: (i) [LangChain](https://www.langchain.com/); and (ii) [LlamaIndex](https://www.llamaindex.ai/). The same tools also integrate with the underlying storage system to load and retrieve your data. To make this more concrete, let me review the [Kùzu-LangChain integration](https://python.langchain.com/docs/use_cases/graph/graph_kuzu_qa), which is similar to the integrations found for other GDBMSs. You as a programmer have very little to do: you prepare your Kùzu database `db`, load your data into it, wrap it in `KuzuGraph` and `KuzuQAChain` objects in Python, and you have a text-to-Cypher pipeline:

```python
import kuzu
from langchain.chains import KuzuQAChain
from langchain_community.chat_models import ChatOpenAI
from langchain_community.graphs import KuzuGraph

db = kuzu.Database("test_db")
# ... create your node/relationship tables and load data if needed
graph = KuzuGraph(db)
chain = KuzuQAChain.from_llm(ChatOpenAI(temperature=0), graph=graph, verbose=True)
chain.run("Who played in The Godfather: Part II?")
```

I am following the example application in this [documentation](https://python.langchain.com/docs/use_cases/graph/graph_kuzu_qa), which uses a database of movies, actors, and directors.
"),(0,s.kt)("pre",null,(0,s.kt)("code",{parentName:"pre",className:"language-bash"},"Output:\n> Entering new chain...\nGenerated Cypher:\nMATCH (p:Person)-[:ActedIn]->(m:Movie {name: 'The Godfather: Part II'}) RETURN p.name\nFull Context:\n[{'p.name': 'Al Pacino'}, {'p.name': 'Robert De Niro'}]\n\n> Finished chain.\n\n'Al Pacino and Robert De Niro both played in The Godfather: Part II.'\n")),(0,s.kt)("p",null,'The "chain" first generated a Cypher query using ',(0,s.kt)("span",{parentName:"p",className:"math math-inline"},(0,s.kt)("span",{parentName:"span",className:"katex"},(0,s.kt)("span",{parentName:"span",className:"katex-mathml"},(0,s.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,s.kt)("semantics",{parentName:"math"},(0,s.kt)("mrow",{parentName:"semantics"},(0,s.kt)("msub",{parentName:"mrow"},(0,s.kt)("mi",{parentName:"msub"},"Q"),(0,s.kt)("mrow",{parentName:"msub"},(0,s.kt)("mi",{parentName:"mrow"},"N"),(0,s.kt)("mi",{parentName:"mrow"},"L")))),(0,s.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"Q_{NL}")))),(0,s.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,s.kt)("span",{parentName:"span",className:"base"},(0,s.kt)("span",{parentName:"span",className:"strut",style:{height:"0.8778em",verticalAlign:"-0.1944em"}}),(0,s.kt)("span",{parentName:"span",className:"mord"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal"},"Q"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.10903em"}},"N"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,s.kt)("span",{parentName:"span"})))))))))),".\nBehind the curtain, i.e., inside the KuzuQAChain code,\na GPT model was given the following prompt:"),(0,s.kt)("pre",null,(0,s.kt)("code",{parentName:"pre",className:"language-bash"},"Generate Cypher statement to query a graph database.\nInstructions:\nUse only the provided relationship types and properties in the schema.\nDo not use any other relationship types or properties that are not provided.\n\nSchema:\nNode properties: [{'properties': [('name', 'STRING')], 'label': 'Movie'}, {'properties': [('name', 'STRING'), ('birthDate', 'STRING')], 'label': 'Person'}]\nRelationships properties: [{'properties': [], 'label': 'ActedIn'}]\nRelationships: ['(:Person)-[:ActedIn]->(:Movie)']\n\nNote: Do not include any explanations or apologies in your responses.\nDo not respond to any questions that might ask anything else than for you to construct a Cypher statement.\nDo not include any text except the generated Cypher statement.\n\nThe question is:\nWho played in The Godfather: Part II?\n")),(0,s.kt)("p",null,"Indeed, if you copy this prompt and paste it in 
",(0,s.kt)("a",{parentName:"p",href:"https://chat.openai.com/"},"chatGPT's browser interface"),",\nyou will get the same or very similar Cypher query. The important point is: that's all\nthe coding you have to do to build a natural language interface that can query your database.\nYou ultimately construct a string prompt that contains ",(0,s.kt)("span",{parentName:"p",className:"math math-inline"},(0,s.kt)("span",{parentName:"span",className:"katex"},(0,s.kt)("span",{parentName:"span",className:"katex-mathml"},(0,s.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,s.kt)("semantics",{parentName:"math"},(0,s.kt)("mrow",{parentName:"semantics"},(0,s.kt)("msub",{parentName:"mrow"},(0,s.kt)("mi",{parentName:"msub"},"Q"),(0,s.kt)("mrow",{parentName:"msub"},(0,s.kt)("mi",{parentName:"mrow"},"N"),(0,s.kt)("mi",{parentName:"mrow"},"L")))),(0,s.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"Q_{NL}")))),(0,s.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,s.kt)("span",{parentName:"span",className:"base"},(0,s.kt)("span",{parentName:"span",className:"strut",style:{height:"0.8778em",verticalAlign:"-0.1944em"}}),(0,s.kt)("span",{parentName:"span",className:"mord"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal"},"Q"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.10903em"}},"N"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,s.kt)("span",{parentName:"span"})))))))))),", some\ninstructions, and schema of the database, and the LLM will generate a query for you.\nThe ",(0,s.kt)("inlineCode",{parentName:"p"},"KuzuGraph")," and ",(0,s.kt)("inlineCode",{parentName:"p"},"KuzuQAChain")," are simple wrappers to do just that.\nIf you want to play around with how well this works on other datasets,\nwe have this pipeline implemented in K\xf9zu's browser frontend ",(0,s.kt)("a",{parentName:"p",href:"https://kuzudb.com/docusaurus/kuzuexplorer/"},"K\xf9zuExplorer"),". 
"),(0,s.kt)("p",null,'That is, for any database you have in K\xf9zu, you get a natural language interface over it in\nK\xf9zuExplorer (just click the "robot icon" on the left panel).\nYou can develop similar pipelines with other GDBMSs using similar interfaces (',(0,s.kt)("em",{parentName:"p"},"though I recommend using K\xf9zu as it will be the\nsimplest to get started")," \ud83d\ude09: ",(0,s.kt)("em",{parentName:"p"},"Unlike other GDBMSs, K\xf9zu is embeddable and requires no server set up"),").\nIf you instead want to build Q&A systems over your RDBMSs, you can use\nLangChain's ",(0,s.kt)("a",{parentName:"p",href:"https://python.langchain.com/docs/use_cases/qa_structured/sql#case-2-text-to-sql-query-and-execution"},"SQLDatabaseChain")," and\n",(0,s.kt)("a",{parentName:"p",href:"https://python.langchain.com/docs/use_cases/qa_structured/sql#case-3-sql-agents"},"SQLAgent")," or\nLlamaIndex's ",(0,s.kt)("a",{parentName:"p",href:"https://docs.llamaindex.ai/en/stable/examples/index_structs/struct_indices/SQLIndexDemo.html#part-1-text-to-sql-query-engine"},"NLSQLTableQueryEngine"),'. The level of simplicity is similar to the example I presented. In practice, it is unlikely that your chatbot or search engine will be as simple\nas the above example where the application interacts with the LLM only once. If you want\nto interact with the LLM multiple times and conditionally take one action over another action etc.,\nLangChain and LlamaIndex also provide ways to do that through their "Agents" (see ',(0,s.kt)("a",{parentName:"p",href:"https://python.langchain.com/docs/modules/agents/"},"LangChain Agents")," and ",(0,s.kt)("a",{parentName:"p",href:"https://docs.llamaindex.ai/en/stable/use_cases/agents.html"},"Llama Index Agents"),")."),(0,s.kt)("h3",{id:"how-good-are-llms-in-generating-high-level-queries"},"How Good Are LLMs in Generating High-Level Queries?"),(0,s.kt)("p",null,"Although building a text-to-high-level-query-language pipeline is now very simple with LLMs,\nsimplicity ",(0,s.kt)("strong",{parentName:"p"},"does not")," mean quality. Indeed, people building these systems are now faced with the following two important questions: "),(0,s.kt)("ol",null,(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},"How accurate are the high-level queries that LLMs generate?")),(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},"How, e.g., through what types of prompts or data modeling, can we increase the accuracy of the\nqueries generated by LLMs?"))),(0,s.kt)("p",null,"Here are several papers on this that I suggest reading:"),(0,s.kt)("ol",null,(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},(0,s.kt)("a",{parentName:"em",href:"https://arxiv.org/pdf/2303.13547.pdf"},"A comprehensive evaluation of ChatGPT\u2019s zero-shot Text-to-SQL capability"))," from Tsinghua University and University of Illinois at Chicago. 
"),(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},(0,s.kt)("a",{parentName:"em",href:"https://arxiv.org/pdf/2204.00498.pdf"},"Evaluating the Text-to-SQL Capabilities of Large Language Models"))," from researchers from Cambridge and universities and institutes from Montr\xe9al."),(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},(0,s.kt)("a",{parentName:"em",href:"https://arxiv.org/pdf/2308.15363.pdf"},"Text-to-SQL Empowered by Large Language Models: A Benchmark Evaluation"))," from Alibaba Group."),(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},(0,s.kt)("a",{parentName:"em",href:"https://arxiv.org/pdf/2305.12586.pdf"},"Enhancing Few-shot Text-to-SQL Capabilities of Large Language Models: A Study on Prompt Design Strategies"))," from Yale, Columbia, and Allen Institute for AI."),(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},(0,s.kt)("a",{parentName:"em",href:"https://arxiv.org/pdf/2305.11853.pdf"},"How to Prompt LLMs for Text-to-SQL: A Study in Zero-shot, Single-domain, and Cross-domain Settings"))," from Ohio State"),(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},(0,s.kt)("a",{parentName:"em",href:"https://arxiv.org/pdf/2311.07509.pdf"},"A Benchmark to Understand the Role of Knowledge Graphs on LLM's Accuracy for Q&A on Enterprise SQL Databases"))," from data.world.")),(0,s.kt)("p",null,"These papers are either entirely or ",(0,s.kt)("em",{parentName:"p"},"almost")," entirely evaluation-only papers that experiment with very detailed approaches of prompting LLMs\nto generate SQL queries. First, let me say that the general message these\npapers give (maybe except the last one) is that LLMs are pretty good. With right prompting (or even with basic prompting)\nthey do very well on these benchmarks. I see accuracy rates over 85% on the Spider benchmark in several papers. These are clearly\nbetter numbers than what pre-LLM state-of-the-art systems achieved. This should be impressive to many."),(0,s.kt)("p",null,'Second, the set of techniques are too detailed to cover here but some example heuristics\nthese papers experiment with include the following: (i) the syntax used for providing the schema\n(apparently putting "the pound sign ',(0,s.kt)("inlineCode",{parentName:"p"},"#")," to differentiate prompt from response in examples yields impressive performance gains\" \ud83d\ude00 go figure); (ii)\nthe number and selection of example (question, SQL) pairs, e.g., apparently there is a sweet spot in the number\nof examples to provide; or (iii) the effects of standardizing the text in the prompt, e.g., indenting and using all lower case letters consistently\n(apparently has minor but some effect). 
Yes, as interesting and important as it is to learn how to use LLMs better, I still can't escape the following thought before going to bed: somewhere out there, some advisor might be torturing some graduate student to check if the magical box produces better SQL with a pound sign vs. double slashes!

Most of the work I found is on generating SQL. In contrast, I found no papers that do a similar prompting study for the query languages of GDBMSs, though I ran into two papers that provide benchmarks for them: (i) [SPARQL](https://arxiv.org/abs/2309.16248); and (ii) [Cypher](https://dl.acm.org/doi/pdf/10.1145/3511808.3557703). So low-hanging fruit for future work is the following:

*Important Future Work 1: Similar prompting studies for the query languages of graph DBMSs, with a focus on recursive queries and union-of-join queries.* In contrast to SQL queries, here one should study the various recursive queries that the query languages of GDBMSs are particularly good at, and union-of-join queries, which are asked by omitting labels in the query languages of GDBMSs. For example, if you want to ask for all connections between your `User` nodes, and a User can have many relationship types, such as `Follows`, `SentMoneyTo`, or `SameFamily`, you would have to write 3 possible join queries in SQL and union them. Instead, you can write this query with a very simple syntax in Cypher as `MATCH (a:User)-[e]->(b:User)`, where the omission of the label on the relationship `e` indicates searching over all possible joins (a sketch of both versions follows below).[^1]
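To make this concrete, here is a minimal sketch of the two versions, assuming hypothetical `Follows`, `SentMoneyTo`, and `SameFamily` join tables with made-up `from_id`/`to_id` columns (only the relationship names come from the example above):

```sql
-- Union-of-joins in SQL: one join block per relationship table.
SELECT a.name AS from_user, b.name AS to_user
FROM User a JOIN Follows r ON a.id = r.from_id JOIN User b ON r.to_id = b.id
UNION ALL
SELECT a.name, b.name
FROM User a JOIN SentMoneyTo r ON a.id = r.from_id JOIN User b ON r.to_id = b.id
UNION ALL
SELECT a.name, b.name
FROM User a JOIN SameFamily r ON a.id = r.from_id JOIN User b ON r.to_id = b.id;
```

```cypher
// The same question in Cypher: omitting the relationship label on e
// searches over all three relationship types at once.
MATCH (a:User)-[e]->(b:User)
RETURN a.name, b.name;
```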
"),(0,s.kt)("p",null,"Specifically, this paper is an evaluation of the performance of GPT-4 in generating SQL using no examples, i.e., zero-shot,\nwith basic prompting over a standardized insurance database schema\ncalled The ",(0,s.kt)("a",{parentName:"p",href:"https://www.omg.org/spec/PC/1.0/About-PC"},"OMG Property and Casualty Data Model"),".\nSee Figure 1 in the paper (omitted here) for the conceptual schema, which consists of classes such as\nPolicy, Account, Claims, Insurable Object, among others, and their relationships.\nThe paper has a benchmark of 43 natural language questions and compares 2 approaches to generate the SQL query.\nThe below figure shows an overview of these approaches for reference:"),(0,s.kt)("div",{class:"img-center"},(0,s.kt)("img",{src:p.Z,width:"600"})),(0,s.kt)("ol",null,(0,s.kt)("li",{parentName:"ol"},"Direct SQL Generation: In this approach, ",(0,s.kt)("span",{parentName:"li",className:"math math-inline"},(0,s.kt)("span",{parentName:"span",className:"katex"},(0,s.kt)("span",{parentName:"span",className:"katex-mathml"},(0,s.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,s.kt)("semantics",{parentName:"math"},(0,s.kt)("mrow",{parentName:"semantics"},(0,s.kt)("msub",{parentName:"mrow"},(0,s.kt)("mi",{parentName:"msub"},"Q"),(0,s.kt)("mrow",{parentName:"msub"},(0,s.kt)("mi",{parentName:"mrow"},"N"),(0,s.kt)("mi",{parentName:"mrow"},"L")))),(0,s.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"Q_{NL}")))),(0,s.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,s.kt)("span",{parentName:"span",className:"base"},(0,s.kt)("span",{parentName:"span",className:"strut",style:{height:"0.8778em",verticalAlign:"-0.1944em"}}),(0,s.kt)("span",{parentName:"span",className:"mord"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal"},"Q"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.10903em"}},"N"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,s.kt)("span",{parentName:"span"}))))))))))," and the relational schema of the OMG database is given\nto GPT-4. 
   The schema is given in terms of `CREATE TABLE` statements, such as:

   ```sql
   CREATE TABLE Claim(
     Claim_Identifier int NOT NULL,
     Catastrophe_Identifier int NULL,
     ...
     Claim_Open_Date datetime NULL,
     ...
     PRIMARY KEY (Claim_Identifier ASC),
     FOREIGN KEY (Catastrophe_Identifier) REFERENCES Catastrophe(Catastrophe_Identifier),
   ...)
   ```

   The full schema statements can be found [here](https://github.com/datadotworld/cwd-benchmark-data/blob/main/ACME_Insurance/DDL/ACME_small.ddl). GPT-4 is asked to generate a SQL query $Q_{SQL}$ that answers $Q_{NL}$. Copy-pasted from the paper, these prompts look as follows:

   ```
   Given the database described by the following DDL:

   Write a SQL query that answers the following question. Do not explain the query.
   return just the query, so it can be run verbatim from your response.
   Here's the question:
   ```

2. **Indirect SQL Generation via Graph Modeling/SPARQL**: In this approach, instead of the relational schema of the database, the same database is modeled as an [*OWL ontology*](https://www.w3.org/OWL/) (OWL is short for Web Ontology Language). An ontology is another term for a schema when modeling data as a graph, i.e., as classes and relationships between them.
   OWL is a W3C standard and part of the RDF technology stack, so OWL ontologies are expressed as a set of RDF triples, such as:

   ```
   ...
   in:Claim rdf:type owl:Class ;
       rdfs:isDefinedBy ;
       rdfs:label "Claim" .
   in:claimOpenDate rdf:type owl:DatatypeProperty ;
       rdfs:domain in:Claim ;
       rdfs:range xsd:dateTime ;
       rdfs:isDefinedBy ;
       rdfs:label "Claim Open Date" .
   in:hasCatastrophe rdf:type owl:ObjectProperty ;
       rdfs:domain in:Claim ;
       rdfs:range in:Catastrophe ;
       rdfs:isDefinedBy ;
       rdfs:label "has catastrophe" .
   ...
   ```

   The full ontology can be found [here](https://github.com/datadotworld/cwd-benchmark-data/blob/main/ACME_Insurance/ontology/insurance.ttl). GPT-4 is then asked to generate a SPARQL query $Q_{SPARQL}$, instead of SQL, for the same $Q_{NL}$. The full prompt, again copy-pasted from the paper with some simplifications, looks like this:

   ```
   Given the OWL model described in the following TTL file:

   Write a SPARQL query that answers the question. Do not explain the query.
   return just the query, so it can be run verbatim from your response.
   Here's the question:
   ```

   As a last step, the authors have a direct mapping from $Q_{SPARQL}$ to a SQL query $Q_{SQL}$.
mathnormal"},"Q"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"SQ"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.2861em"}},(0,s.kt)("span",{parentName:"span"})))))))))),". This is a quite straigh-forward step\nas the modeling as an ontology vs relational schema have direct translations from classes and properties to tables and columns.")),(0,s.kt)("p",null,"An interesting comparison. There is some intuition for why one would be interested in the effectiveness of\nquery generation through an ontology because one of the well-known\npre-LLM text-to-SQL papers ",(0,s.kt)("a",{parentName:"p",href:"https://www.vldb.org/pvldb/vol9/p1209-saha.pdf"},"ATHENA")," did something similar.\nInstead of SPARQL they had another query language over an ontology called Ontology Query Language, which\nwas then mapped to SQL. "),(0,s.kt)("p",null,"The results are even more interesting. The authors categorize their 43 questions into\n4 quadrants based on 2 dimensions: "),(0,s.kt)("ul",null,(0,s.kt)("li",{parentName:"ul"},"Low vs high ",(0,s.kt)("strong",{parentName:"li"},"question")," complexity: Questions that require only simple projections\nare low complexity. Those that require aggregations or math functions are high complexity."),(0,s.kt)("li",{parentName:"ul"},"Low vs high ",(0,s.kt)("strong",{parentName:"li"},"schema")," complexity: Questions whose SQL queries require up to 4 tables are low schema complexity. Those that\nrequire 5 or more joins are high schema complexity. ")),(0,s.kt)("p",null,'The accuracy results are shown below. Accuracy here is "execution accuracy" meaning that only the answers of the queries\nare checked against the ground truth answer. 
That is, even if the SQL query GPT-4 generated was actually not correct\nbut by luck it computed the correct answers the paper takes it as correct (apparently happens very rarely in this study).'),(0,s.kt)("table",null,(0,s.kt)("thead",{parentName:"table"},(0,s.kt)("tr",{parentName:"thead"},(0,s.kt)("th",{parentName:"tr",align:null},"Overall: 16.7% vs 54.2%"),(0,s.kt)("th",{parentName:"tr",align:null},"Low Schema Complexity"),(0,s.kt)("th",{parentName:"tr",align:null},"High Schema Complexity"))),(0,s.kt)("tbody",{parentName:"table"},(0,s.kt)("tr",{parentName:"tbody"},(0,s.kt)("td",{parentName:"tr",align:null},(0,s.kt)("b",null,"Low Question Complexity")),(0,s.kt)("td",{parentName:"tr",align:null},"37.4% vs 66.9%"),(0,s.kt)("td",{parentName:"tr",align:null},"0% vs 38.7%")),(0,s.kt)("tr",{parentName:"tbody"},(0,s.kt)("td",{parentName:"tr",align:null},(0,s.kt)("b",null,"High Question Complexity")),(0,s.kt)("td",{parentName:"tr",align:null},"25.5% vs 71.1%"),(0,s.kt)("td",{parentName:"tr",align:null},"0% vs 35.7%")))),(0,s.kt)("p",null,"Overall, the indirect SQL generation method through SPARQL is much more effective in this zero-shot setting.\nNot surprisingly, questions that require 5 or more joins are harder regardless of the\nmethod used and direct SQL cannot get any of those questions right. These are interesting\nresults for an initial study on the effects of data modeling on LLMs' accuracy on generating database queries.\nThese results should give many researchers and practitioners ideas about how to replicate\nand validate/invalidate similar results under different settings, e.g., with few-shot\nexamples and under different databases."),(0,s.kt)("p",null,(0,s.kt)("strong",{parentName:"p"},"That said, one should ask, why?")," In fact, we should all be suspicious that merely modeling the\nsame set of records with a different abstraction should have any visible effects. After all, by modeling\nthe same records differently, one does not obtain or lose information. So if and when LLMs are smart enough,\nthey shouldn't care how the data was modeled. But for now, if a pound sign can make a difference,\nwe should not be surprised modeling choices can have large impacts. As such, it is healthy to be suspicious\nand ask why. These motivate a few important questions I think are worth studying. My premise\nis that somehow if the differences are this large, it must be that the task for GPT-4 got simpler when\nasked to generate a SPARQL query. I can hypothesize about a few possible reasons for this: "),(0,s.kt)("ul",null,(0,s.kt)("li",{parentName:"ul"},(0,s.kt)("p",{parentName:"li"},(0,s.kt)("em",{parentName:"p"},"Some queries require fewer tokens to write in SPARQL"),": One difference the query languages\nof GDBMSs often have is that certain equality conditions are implicit in the syntax, which\nmeans their ",(0,s.kt)("inlineCode",{parentName:"p"},"WHERE")," clauses are simpler for some queries. 
For example if you wanted to return\nthe names of the Catastrophe that Claim with ID Claim1 has, in SPARQL you can write it as:"),(0,s.kt)("pre",{parentName:"li"},(0,s.kt)("code",{parentName:"pre"},"SELECT ?name\nWHERE { in:hasCatastrophe ?catastrophe,\n ?catastophe in:catastropheName ?name}\n")),(0,s.kt)("p",{parentName:"li"},"In SQL you would write:"),(0,s.kt)("pre",{parentName:"li"},(0,s.kt)("code",{parentName:"pre"},"SELECT Catastrophe_Name\nFROM Claim, Catastrophe\nWHERE Claim.Catastrophe_Identifier = Catastrophe.Catastrophe_Identifier AND\n Claim.Claim_Identifier = Claim1\n")),(0,s.kt)("p",{parentName:"li"},"Note that the ",(0,s.kt)("inlineCode",{parentName:"p"},"Claim.Claim_Identifier = Claim1")," equality condition is implicit in the ",(0,s.kt)("inlineCode",{parentName:"p"}," in:hasCatastrophe ?catastrophe")," triple\nand the ",(0,s.kt)("inlineCode",{parentName:"p"},"Claim.Catastrophe_Identifier = Catastrophe.Catastrophe_Identifier")," condition is implicit in the fact that ",(0,s.kt)("inlineCode",{parentName:"p"},"?catastrophe")," appears\nboth in the first and second triples in the SPARQL query. Such implicit equality conditions are common in the languages of\ngraph query languages especially when expressing joins. For example in Cypher you can omit all join conditions in WHERE clauses as long\nas those joins have been pre-defined to the system as relationships. Instead you join records through the ",(0,s.kt)("inlineCode",{parentName:"p"},"(a)-[e]->(b)")," syntax.\nIt's unclear how much this could matter but it is an immediate advantage of SPARQL that can explain why complex join queries are easier to generate\nin SPARQL than SQL. "),(0,s.kt)("p",{parentName:"li"},(0,s.kt)("strong",{parentName:"p"},"Side note"),": On the flip side, SPARQL can be more verbose in projections. For example, if you wanted to return the number, open and close\ndates of every claim, you'd write the following SQL query:"),(0,s.kt)("pre",{parentName:"li"},(0,s.kt)("code",{parentName:"pre"},"SELECT Claim_Number, Claim_Open_Date, Claim_Close_Date\nFROM Claim\n")),(0,s.kt)("p",{parentName:"li"},"In SPARQL, you'd have to write both the names of the property you want to project and give it an additional variable as follows:"),(0,s.kt)("pre",{parentName:"li"},(0,s.kt)("code",{parentName:"pre"},"SELECT ?number, ?open_date, ?close_date\nWHERE { ?claim in:claimNumber ?number,\n ?claim in:claimOpenDate ?open_date,\n ?claim in:claimCloseDate ?close_date\n")))),(0,s.kt)("ol",{start:2},(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},"Graph modeling gives explicit names to foreign keys:")," There is a reason that database courses teach database modeling to students\nusing graph-based models, such as Entity-Relationship or UML models. First, humans think of the world\nas objects/entities and their relationships. In some sense, these are higher-level models where relationships\nbetween objects are denoted explicitly with explicit names (instead of as less explicit foreign key constraints).\nFor example, the implicit connection between Claims and\nCatastrophes through the ",(0,s.kt)("inlineCode",{parentName:"li"},"FOREIGN KEY (Catastrophe_Identifier) REFERENCES Catastrophe(Catastrophe_Identifier)"),"\nconstraint was given an explicit English name: ",(0,s.kt)("inlineCode",{parentName:"li"},"hasCatastrophe")," in the ontology. This explicitness may make\nit easier for LLMs to understand the schema and generate SPARQL queries.")),(0,s.kt)("p",null,"Both of these are qualitative hypotheses. 
Both of these are qualitative hypotheses. However, there is a more immediate reason the authors of this paper may have obtained such major differences between the two approaches they tried. Intentionally or unintentionally, their ontology is simplified significantly compared to the relational schema they have. For example, the Claim relation has `Claim_Reopen_Date` and `Claim_Status_Code` properties, which are removed from the ontology. Many such properties from the relations seem to have been removed, and the ontology overall looks simpler. There are also several differences between the ontology and the relational schema that are confusing. For example, the [ontology](https://github.com/datadotworld/cwd-benchmark-data/blob/main/ACME_Insurance/ontology/insurance.ttl) has a class `Agent`, and `Policy` objects are `in:soldByAgent` by some Agent objects (see lines 20 and 92). I cannot see corresponding relations or columns in the [relational schema](https://github.com/datadotworld/cwd-benchmark-data/blob/main/ACME_Insurance/DDL/ACME_small.ddl). Unless I am missing something about how the prompts were given, these differences are also likely to have important effects on the results, and someone should fix them and obtain new results in a fairer comparison.

Let me next raise several high-level questions that I think are important:

*Important Future Work 2: Rules of thumb in data modeling to make LLM-generated queries more accurate.* I think the higher-level question of studying the effects of data modeling in more depth is a very good direction. As LLMs get smarter, I would expect the presence/absence of a pound sign or the style of English to matter less. These look more like syntactic differences that can be automatically detected over time. Modeling choices are more fundamental and relate to the clarity and understandability of the records that will be queried by the LLM. So identifying some rules of thumb here looks like the promising path forward. Let me list a few immediate questions one can study:

*Important Future Work 2.1: Effects of normalization/denormalization.* If the shortcoming of GPT-4 is generating queries with many joins, one way to address this is to denormalize the relations into fewer tables and study the effects. Again, I'm thinking of the same records, just modeled differently with fewer tables. What happens if we reduce all the data into a single table with dozens of columns and many value repetitions? Now all possible joins would have been performed, so we'd force the LLM to write a join-less query with filters, distincts, and aggregations (see the sketch below). What happens if we instead normalize the tables step by step until we get to a well-known form, such as [Boyce-Codd Normal Form](https://en.wikipedia.org/wiki/Boyce%E2%80%93Codd_normal_form)? Do we consistently get better or worse accuracy?
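As a rough illustration of the single-table extreme, here is the kind of join-less query the LLM would be left to write (the wide table `Insurance_Wide`, its columns, and the catastrophe name are all made up for illustration):

```sql
-- Over a fully denormalized table, all joins are already materialized;
-- the LLM only needs filters, DISTINCT, and aggregation.
SELECT COUNT(DISTINCT Claim_Identifier)
FROM Insurance_Wide
WHERE Catastrophe_Name = 'Hurricane Alex';
```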
*Important Future Work 2.2: Use of views.* In relational modeling, views are an effective way to obtain a higher-level and simpler modeling of your records. Similar to the $Q_{NL}$ -[LLM]-> $Q_{SPARQL}$ -[Direct Mapping]-> $Q_{SQL}$ pipeline above, one can test the effectiveness of a $Q_{NL}$ -[LLM]-> $Q_{SQL-over-Views}$ -[Direct Mapping]-> $Q_{SQL}$ pipeline.
mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.2861em"}},(0,s.kt)("span",{parentName:"span"}))))))))))," pipeline."),(0,s.kt)("p",null,(0,s.kt)("em",{parentName:"p"},"Important Future Work 3: Use of Cypher as intermediate query language to translate to SQL.")," One reason to experiment with Cypher\nin addition to SPARQL is that Cypher is, arguably, more similar to SQL than SPARQL but has the advantage that (common) join\nconditions are implicit in the ",(0,s.kt)("inlineCode",{parentName:"p"},"(a)-[e]->(b)")," node-arrow syntax. Yet Cypher does not have the verbosity of the SPARQL projections\nI mentioned above (so you project properties the same way you project columns in SQL). In my world, all high-level query languages\nlook very similar to SQL, so eventually when LLMs are smart enough, or even today, I think these language differences\nshould have minor effects. However, graph query languages will likely continue to have major advantages when writing\nrecursive queries, as they have specialized syntax (e.g., Cypher has the Kleene star syntax) to do so. For those queries,\nexpressing first in Cypher and then mapping to SQL could lead to an advantage. "),(0,s.kt)("h2",{id:"final-words"},"Final Words"),(0,s.kt)("p",null,"Needless to say, in the next few years, the field will be flooded with work on how to\nuse LLMs to solve the text-to-high-level-query problem. Many rules of thumb will emerge\nabout how to prompt them correctly. The questions one can ask in this space is endless.\nI can speculate about it a lot, but I think it's plausible that\nmany of these rules of thumb, specifically the syntactic\ndifferences in prompting, can become\nobsolete very quickly as newer and more advanced LLMs that are better at speaking high-level database languages emerge.\nFor example, it's plausible that people will stop showing LLMs example (question, query) pairs each time they ask them to generate\nSQL once LLMs are better at speaking SQL."),(0,s.kt)("p",null,"However, the harder question of how to model the data so that its meaning is clear, and the\nqueries that need to be written, are simpler, is more likely to remain a challenge for a longer time. I would not be too optimistic\nthat there can emerge very clear answers to this question. How to model your data is part-art and part-science.\nYet, some studiable questions, such as the effects of normalization, use of views or generating Cypher for recursive queries,\ncan yield some important best practices that can be useful to developers building these systems."),(0,s.kt)("p",null,"In the next post, I will cover what I learned about RAG over unstructured data. Graphs and knowledge graphs are playing\na more interesting role in that space. 
## Final Words

Needless to say, in the next few years the field will be flooded with work on how to use LLMs to solve the text-to-high-level-query problem. Many rules of thumb will emerge about how to prompt them correctly. The questions one can ask in this space are endless. I can speculate about it a lot, but I think it's plausible that many of these rules of thumb, specifically the syntactic differences in prompting, will become obsolete very quickly as newer and more advanced LLMs that are better at speaking high-level database languages emerge. For example, it's plausible that people will stop showing LLMs example (question, query) pairs each time they ask them to generate SQL, once LLMs are better at speaking SQL.

However, the harder question of how to model the data so that its meaning is clear and the queries that need to be written are simpler is more likely to remain a challenge for a longer time. I would not be too optimistic that very clear answers to this question will emerge: how to model your data is part art and part science. Yet some studiable questions, such as the effects of normalization, the use of views, or generating Cypher for recursive queries, can yield important best practices that will be useful to developers building these systems.

In the next post, I will cover what I learned about RAG over unstructured data. Graphs and knowledge graphs are playing a more interesting role in that space. Until then, happy new year to all!

[^1]: SPARQL syntax is different, but a similar advantage exists by omitting type constraints.
at UWaterloo (https://cs.uwaterloo.ca/~ssalihog/)

During the holiday season, I did some reading on LLMs and, specifically, on the techniques that use LLMs together with graph databases and knowledge graphs. If you are new to the area like me, the amount of activity on this topic on social media as well as in research publications may have intimidated you. If so, you're exactly my target audience for this new blog post series I am starting. My goals are two-fold:

1. *Overview the area*: I want to present what I learned with a simple and consistent terminology, and at a more technical depth than you might find in other blog posts. I am aiming for a depth similar to what I aim for when preparing a lecture. I will link to many high-quality and technically satisfying pieces of content (mainly papers, since the area is very researchy).
2. *Overview important future work*: I want to cover several pieces of important future work in the space. I don't necessarily mean work for research contributions, but also simple approaches to experiment with if you are building question answering (Q&A) applications using LLMs and graph technology.

## Killer App: Retrieval Augmented Generation

Let's review the killer application of LLMs in enterprises. The application is ultimately Q&A over private enterprise data.
Think of a chatbot to which you can ask natural language questions ($Q_{NL}$), such as: "Who is our top paying customer from Waterloo?" or "What are the data privacy regulations in Canada we need to comply with?", and get back natural language answers ($A_{NL}$).
vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.10903em"}},"N"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,s.kt)("span",{parentName:"span"})))))))))),").\nLLMs, out of the box, cannot answer these questions because they have a ",(0,s.kt)("em",{parentName:"p"},"knowledge gap"),".\nFor example, LLMs never had any access to your sales records when they were trained.\nTherefore, they need to retrieve or be provided with\nextra information from private data sources of the enterprise."),(0,s.kt)("h3",{id:"a-note-on-the-term-rag"},"A note on the term RAG"),(0,s.kt)("p",null,"There seems to be tremendous interest in building systems that combine a traditional\ninformation retrieval component, e.g., one that looks up some documents from\nan index, with a natural language generator component, such as an LLM. The term for such systems is\n",(0,s.kt)("em",{parentName:"p"},"Retrieval Augmented Generation")," (RAG).\nThe term is coined in ",(0,s.kt)("a",{parentName:"p",href:"https://arxiv.org/pdf/2005.11401.pdf"},"this paper"),' to refer\nto the method of fine-tuning an LLM with additional information, i.e.,\nusing this additional data to train a new variant of the LLM.\nThe original usage form in the paper is "RAG models". Nowadays it is used in a variety of ways,\nsuch as, "RAG system", "RAG-based system", "RAG does X", or\n"Building RAG with Y". RAG often does not refer to fine-tuning LLMs any more. 
Instead, it refers to providing LLMs with private data along with the question to fix the knowledge gap. Even systems that simply use an LLM to convert a $Q_{NL}$ to a SQL or Cypher query and return the results of the query are called "RAG systems" in some documentation. I will use the term in this broader sense.

You can build RAG-based Q&A systems using structured and/or unstructured data. The high-level views of these systems look like this:

![Q&A over enterprise data](assets/images/qa-over-enterprise-data-c63fb036791e4a0ec1f24d802d50254e.png)

### Summary of this post

This post covers RAG using structured data. Then, in a follow-up post, I will cover RAG using unstructured data, where I will also mention a few ways people are building RAG-based Q&A systems that use both structured and unstructured data.

:::tip TL;DR: The key takeaways from this post are:

- **RAG overview**: RAG is a technique to fill the knowledge gap of LLMs using private data. RAG systems use private structured records stored in a database and/or unstructured data in text files.
"),(0,s.kt)("li",{parentName:"ul"},(0,s.kt)("strong",{parentName:"li"},"Impressive simplicity and effectiveness of developing a natural language interface over your database using LLMs"),": In the pre-LLM era, the amount of engineering effort\nto develop a pipeline that delivered a natural language interface over your database was ",(0,s.kt)("em",{parentName:"li"},"immense"),". The\nhard problem was to teach a model to ",(0,s.kt)("em",{parentName:"li"},"speak"),' SQL, Cypher, or SPARQL.\nThis contrasts sharply with the simplicity of developing similar pipelines now because LLMs already "speak" these languages.\nThe hard task now is for ',(0,s.kt)("em",{parentName:"li"},"developers to learn how to prompt LLMs")," to get correct database queries. Furthermore, there is\nevidence that LLMs, if prompted correctly, will generate a decent proportion of queries with impressive accuracy. "),(0,s.kt)("li",{parentName:"ul"},(0,s.kt)("strong",{parentName:"li"},"Lack of work that studies LLMs' ability to generate Cypher or SPARQL:")," Most technically-deep work on understanding\nLLMs' ability to generate accurate high-level query languages is on SQL. We need more\nwork understanding the behavior of LLMs on the query languages of GDBMSs (like Cypher or SPARQL), specifically on recursive and union-of-join queries."),(0,s.kt)("li",{parentName:"ul"},(0,s.kt)("strong",{parentName:"li"},"Studying the effects of data modeling (normalization, views, graph modeling) on the accuracy of LLM-generated queries is important:"),"\nMany people are studying heuristics for prompting LLMs to increase their efficiency focusing on the syntax and the structure of providing\nthe schema and selection of examples in the prompt. An important and under-studied\nproblem is the effects of data modeling choices on the accuracy of the queries generated by LLMs. I point to ",(0,s.kt)("a",{parentName:"li",href:"https://arxiv.org/pdf/2311.07509.pdf"},"one interesting paper")," in this space and raise several questions related to\nnormalizations and use of views in relational modeling and comparisons with graph modeling approaches. 
"))),(0,s.kt)("h2",{id:"rag-using-structured-data-text-to-high-level-query"},"RAG Using Structured Data: Text-to-High-level-Query"),(0,s.kt)("p",null,(0,s.kt)("em",{parentName:"p"},'Note: If you are familiar with how to develop RAG systems with LangChain and LlamaIndex, you can directly skip\nto the "',(0,s.kt)("a",{parentName:"em",href:"#how-good-are-llms-in-generating-high-level-queries"},"How Good are LLMs in Generating High-level Queries"),'" part that\nreflects on the reading I did on RAG using structured data.')),(0,s.kt)("h3",{id:"overview"},"Overview"),(0,s.kt)("p",null,"Many blog posts and several papers concern Q&A systems that simply convert\n",(0,s.kt)("span",{parentName:"p",className:"math math-inline"},(0,s.kt)("span",{parentName:"span",className:"katex"},(0,s.kt)("span",{parentName:"span",className:"katex-mathml"},(0,s.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,s.kt)("semantics",{parentName:"math"},(0,s.kt)("mrow",{parentName:"semantics"},(0,s.kt)("msub",{parentName:"mrow"},(0,s.kt)("mi",{parentName:"msub"},"Q"),(0,s.kt)("mrow",{parentName:"msub"},(0,s.kt)("mi",{parentName:"mrow"},"N"),(0,s.kt)("mi",{parentName:"mrow"},"L")))),(0,s.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"Q_{NL}")))),(0,s.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,s.kt)("span",{parentName:"span",className:"base"},(0,s.kt)("span",{parentName:"span",className:"strut",style:{height:"0.8778em",verticalAlign:"-0.1944em"}}),(0,s.kt)("span",{parentName:"span",className:"mord"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal"},"Q"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.10903em"}},"N"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,s.kt)("span",{parentName:"span"}))))))))))," to a high-level query languge, such as SQL, Cypher, or SPARQL, using an LLM.\nThe figure below describes the overall approach:"),(0,s.kt)("div",{class:"img-center"},(0,s.kt)("img",{src:r.Z,width:"600"})),(0,s.kt)("p",null,(0,s.kt)("span",{parentName:"p",className:"math 
math-inline"},(0,s.kt)("span",{parentName:"span",className:"katex"},(0,s.kt)("span",{parentName:"span",className:"katex-mathml"},(0,s.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,s.kt)("semantics",{parentName:"math"},(0,s.kt)("mrow",{parentName:"semantics"},(0,s.kt)("msub",{parentName:"mrow"},(0,s.kt)("mi",{parentName:"msub"},"Q"),(0,s.kt)("mrow",{parentName:"msub"},(0,s.kt)("mi",{parentName:"mrow"},"N"),(0,s.kt)("mi",{parentName:"mrow"},"L")))),(0,s.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"Q_{NL}")))),(0,s.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,s.kt)("span",{parentName:"span",className:"base"},(0,s.kt)("span",{parentName:"span",className:"strut",style:{height:"0.8778em",verticalAlign:"-0.1944em"}}),(0,s.kt)("span",{parentName:"span",className:"mord"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal"},"Q"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.10903em"}},"N"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,s.kt)("span",{parentName:"span"})))))))))),', the schema of a database, and optionally\nsome example natural language question and high-level query examples, are given\nto the LLM as a prompt.\nThe terms "no shot", "one shot", or "few shot" refer to the number of examples provided\nin the prompt. Depending on the underlying database, the schema may contain\ncolumns of relational tables and their descriptions, or labels of nodes and edges\nof a graph database. 
Using ',(0,s.kt)("span",{parentName:"p",className:"math math-inline"},(0,s.kt)("span",{parentName:"span",className:"katex"},(0,s.kt)("span",{parentName:"span",className:"katex-mathml"},(0,s.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,s.kt)("semantics",{parentName:"math"},(0,s.kt)("mrow",{parentName:"semantics"},(0,s.kt)("msub",{parentName:"mrow"},(0,s.kt)("mi",{parentName:"msub"},"Q"),(0,s.kt)("mrow",{parentName:"msub"},(0,s.kt)("mi",{parentName:"mrow"},"N"),(0,s.kt)("mi",{parentName:"mrow"},"L")))),(0,s.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"Q_{NL}")))),(0,s.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,s.kt)("span",{parentName:"span",className:"base"},(0,s.kt)("span",{parentName:"span",className:"strut",style:{height:"0.8778em",verticalAlign:"-0.1944em"}}),(0,s.kt)("span",{parentName:"span",className:"mord"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal"},"Q"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.10903em"}},"N"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,s.kt)("span",{parentName:"span"})))))))))),", the database schema, and optionally\nsome examples, the LLM generates\na database query, such as SQL or Cypher. 
The system runs this query against the\nDBMS and returns back the query result or using the LLM again, converts\nthe query result back to a natural language answer ",(0,s.kt)("span",{parentName:"p",className:"math math-inline"},(0,s.kt)("span",{parentName:"span",className:"katex"},(0,s.kt)("span",{parentName:"span",className:"katex-mathml"},(0,s.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,s.kt)("semantics",{parentName:"math"},(0,s.kt)("mrow",{parentName:"semantics"},(0,s.kt)("msub",{parentName:"mrow"},(0,s.kt)("mi",{parentName:"msub"},"A"),(0,s.kt)("mrow",{parentName:"msub"},(0,s.kt)("mi",{parentName:"mrow"},"N"),(0,s.kt)("mi",{parentName:"mrow"},"L")))),(0,s.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"A_{NL}")))),(0,s.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,s.kt)("span",{parentName:"span",className:"base"},(0,s.kt)("span",{parentName:"span",className:"strut",style:{height:"0.8333em",verticalAlign:"-0.15em"}}),(0,s.kt)("span",{parentName:"span",className:"mord"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal"},"A"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.10903em"}},"N"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,s.kt)("span",{parentName:"span"})))))))))),". 
"),(0,s.kt)("p",null,(0,s.kt)("strong",{parentName:"p"},"Let us pause here to appreciate one thing:")," For many decades, the database community has studied the problem\nof converting ",(0,s.kt)("span",{parentName:"p",className:"math math-inline"},(0,s.kt)("span",{parentName:"span",className:"katex"},(0,s.kt)("span",{parentName:"span",className:"katex-mathml"},(0,s.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,s.kt)("semantics",{parentName:"math"},(0,s.kt)("mrow",{parentName:"semantics"},(0,s.kt)("msub",{parentName:"mrow"},(0,s.kt)("mi",{parentName:"msub"},"Q"),(0,s.kt)("mrow",{parentName:"msub"},(0,s.kt)("mi",{parentName:"mrow"},"N"),(0,s.kt)("mi",{parentName:"mrow"},"L")))),(0,s.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"Q_{NL}")))),(0,s.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,s.kt)("span",{parentName:"span",className:"base"},(0,s.kt)("span",{parentName:"span",className:"strut",style:{height:"0.8778em",verticalAlign:"-0.1944em"}}),(0,s.kt)("span",{parentName:"span",className:"mord"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal"},"Q"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.10903em"}},"N"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,s.kt)("span",{parentName:"span"})))))))))),' to SQL (aka "text-to-SQL"). Here is a good recent ',(0,s.kt)("a",{parentName:"p",href:"https://link.springer.com/article/10.1007/s00778-022-00776-8"},"survey paper"),"\nthat covers only the deep network-based approaches and ",(0,s.kt)("a",{parentName:"p",href:"https://www.nowpublishers.com/article/Details/DBS-078"},"a more extensive survey/book"),"\non the broader topic of natural language interfaces to databases.\nNeither of these surveys cover any work that directly uses LLMs such as GPT models,\nwhich are quite recent developments. 
Take any of the work covered in these surveys and you'll find an approach that requires significant engineering to build the pipeline shown in the above figure. There exist several pre-LLM text-to-SQL systems (e.g., [ATHENA](https://www.vldb.org/pvldb/vol9/p1209-saha.pdf) or [BELA](https://download.hrz.tu-darmstadt.de/pub/FB20/Dekanat/Publikationen/UKP/76500354.pdf)). For example, most of the pre-LLM approaches that use deep learning require hard work *to teach a model how to "speak" SQL* using large corpora of tables and (question, query) examples, such as [WikiSQL](https://arxiv.org/abs/1709.00103) or [Spider](https://github.com/taoyds/spider). People had to solve and glue together solutions to many technical problems, such as parsing the question, entity detection, synonym finding, and string similarity, among others. Post-LLM approaches require *none* of these efforts because LLMs, such as GPT-4, already speak SQL, Cypher, and SPARQL out of the box, having been exposed to them in their pretraining. Nowadays, the hard problem is for developers *to learn how to prompt LLMs* so that LLMs generate correct queries. I'll say more about this problem below. In contrast, building the above pipeline requires much less effort, as I'll show next.

### Simplicity of Developing RAG Systems: LangChain and LlamaIndex

If you have been following the developments in the LLM space, you will not be surprised to hear that nowadays people build Q&A systems that convert $Q_{NL}$
mtight",style:{marginRight:"0.10903em"}},"N"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,s.kt)("span",{parentName:"span"}))))))))))," to a high-level query language using two common tools:\n(i) ",(0,s.kt)("a",{parentName:"p",href:"https://www.langchain.com/"},"LangChain"),"; and (ii) ",(0,s.kt)("a",{parentName:"p",href:"https://www.llamaindex.ai/"},"LlamaIndex"),".\nThe same tools also integrate with the underlying storage system to load and retrieve your data. To make this more concrete, let me review the ",(0,s.kt)("a",{parentName:"p",href:"https://python.langchain.com/docs/use_cases/graph/graph_kuzu_qa"},"K\xf9zu-LangChain integration"),", similar to the integrations found in other GDBMSs. You as a programmer have very little to do: you prepare your K\xf9zu\ndatabase ",(0,s.kt)("inlineCode",{parentName:"p"},"db")," and load your data into it, wrap it around a ",(0,s.kt)("inlineCode",{parentName:"p"},"KuzuGraph")," and ",(0,s.kt)("inlineCode",{parentName:"p"},"KuzuQAChain")," objects in Python and you have\na text-to-Cypher pipeline:"),(0,s.kt)("pre",null,(0,s.kt)("code",{parentName:"pre",className:"language-python"},'import kuzu\nfrom langchain.chains import KuzuQAChain\nfrom langchain_community.chat_models import ChatOpenAI\nfrom langchain_community.graphs import KuzuGraph\n\ndb = kuzu.Database("test_db")\n... // create your graph if needed\ngraph = KuzuGraph(db)\nchain = KuzuQAChain.from_llm(ChatOpenAI(temperature=0), graph=graph, verbose=True)\nchain.run("Who played in The Godfather: Part II?")\n')),(0,s.kt)("p",null,"I am following the example application in this ",(0,s.kt)("a",{parentName:"p",href:"https://python.langchain.com/docs/use_cases/graph/graph_kuzu_qa"},"documentation"),",\nwhich uses a database of movies, actors, and directors. 
"),(0,s.kt)("pre",null,(0,s.kt)("code",{parentName:"pre",className:"language-bash"},"Output:\n> Entering new chain...\nGenerated Cypher:\nMATCH (p:Person)-[:ActedIn]->(m:Movie {name: 'The Godfather: Part II'}) RETURN p.name\nFull Context:\n[{'p.name': 'Al Pacino'}, {'p.name': 'Robert De Niro'}]\n\n> Finished chain.\n\n'Al Pacino and Robert De Niro both played in The Godfather: Part II.'\n")),(0,s.kt)("p",null,'The "chain" first generated a Cypher query using ',(0,s.kt)("span",{parentName:"p",className:"math math-inline"},(0,s.kt)("span",{parentName:"span",className:"katex"},(0,s.kt)("span",{parentName:"span",className:"katex-mathml"},(0,s.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,s.kt)("semantics",{parentName:"math"},(0,s.kt)("mrow",{parentName:"semantics"},(0,s.kt)("msub",{parentName:"mrow"},(0,s.kt)("mi",{parentName:"msub"},"Q"),(0,s.kt)("mrow",{parentName:"msub"},(0,s.kt)("mi",{parentName:"mrow"},"N"),(0,s.kt)("mi",{parentName:"mrow"},"L")))),(0,s.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"Q_{NL}")))),(0,s.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,s.kt)("span",{parentName:"span",className:"base"},(0,s.kt)("span",{parentName:"span",className:"strut",style:{height:"0.8778em",verticalAlign:"-0.1944em"}}),(0,s.kt)("span",{parentName:"span",className:"mord"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal"},"Q"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.10903em"}},"N"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,s.kt)("span",{parentName:"span"})))))))))),".\nBehind the curtain, i.e., inside the KuzuQAChain code,\na GPT model was given the following prompt:"),(0,s.kt)("pre",null,(0,s.kt)("code",{parentName:"pre",className:"language-bash"},"Generate Cypher statement to query a graph database.\nInstructions:\nUse only the provided relationship types and properties in the schema.\nDo not use any other relationship types or properties that are not provided.\n\nSchema:\nNode properties: [{'properties': [('name', 'STRING')], 'label': 'Movie'}, {'properties': [('name', 'STRING'), ('birthDate', 'STRING')], 'label': 'Person'}]\nRelationships properties: [{'properties': [], 'label': 'ActedIn'}]\nRelationships: ['(:Person)-[:ActedIn]->(:Movie)']\n\nNote: Do not include any explanations or apologies in your responses.\nDo not respond to any questions that might ask anything else than for you to construct a Cypher statement.\nDo not include any text except the generated Cypher statement.\n\nThe question is:\nWho played in The Godfather: Part II?\n")),(0,s.kt)("p",null,"Indeed, if you copy this prompt and paste it in 
",(0,s.kt)("a",{parentName:"p",href:"https://chat.openai.com/"},"chatGPT's browser interface"),",\nyou will get the same or very similar Cypher query. The important point is: that's all\nthe coding you have to do to build a natural language interface that can query your database.\nYou ultimately construct a string prompt that contains ",(0,s.kt)("span",{parentName:"p",className:"math math-inline"},(0,s.kt)("span",{parentName:"span",className:"katex"},(0,s.kt)("span",{parentName:"span",className:"katex-mathml"},(0,s.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,s.kt)("semantics",{parentName:"math"},(0,s.kt)("mrow",{parentName:"semantics"},(0,s.kt)("msub",{parentName:"mrow"},(0,s.kt)("mi",{parentName:"msub"},"Q"),(0,s.kt)("mrow",{parentName:"msub"},(0,s.kt)("mi",{parentName:"mrow"},"N"),(0,s.kt)("mi",{parentName:"mrow"},"L")))),(0,s.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"Q_{NL}")))),(0,s.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,s.kt)("span",{parentName:"span",className:"base"},(0,s.kt)("span",{parentName:"span",className:"strut",style:{height:"0.8778em",verticalAlign:"-0.1944em"}}),(0,s.kt)("span",{parentName:"span",className:"mord"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal"},"Q"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.10903em"}},"N"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,s.kt)("span",{parentName:"span"})))))))))),", some\ninstructions, and schema of the database, and the LLM will generate a query for you.\nThe ",(0,s.kt)("inlineCode",{parentName:"p"},"KuzuGraph")," and ",(0,s.kt)("inlineCode",{parentName:"p"},"KuzuQAChain")," are simple wrappers to do just that.\nIf you want to play around with how well this works on other datasets,\nwe have this pipeline implemented in K\xf9zu's browser frontend ",(0,s.kt)("a",{parentName:"p",href:"https://kuzudb.com/docusaurus/kuzuexplorer/"},"K\xf9zuExplorer"),". 
"),(0,s.kt)("p",null,'That is, for any database you have in K\xf9zu, you get a natural language interface over it in\nK\xf9zuExplorer (just click the "robot icon" on the left panel).\nYou can develop similar pipelines with other GDBMSs using similar interfaces (',(0,s.kt)("em",{parentName:"p"},"though I recommend using K\xf9zu as it will be the\nsimplest to get started")," \ud83d\ude09: ",(0,s.kt)("em",{parentName:"p"},"Unlike other GDBMSs, K\xf9zu is embeddable and requires no server set up"),").\nIf you instead want to build Q&A systems over your RDBMSs, you can use\nLangChain's ",(0,s.kt)("a",{parentName:"p",href:"https://python.langchain.com/docs/use_cases/qa_structured/sql#case-2-text-to-sql-query-and-execution"},"SQLDatabaseChain")," and\n",(0,s.kt)("a",{parentName:"p",href:"https://python.langchain.com/docs/use_cases/qa_structured/sql#case-3-sql-agents"},"SQLAgent")," or\nLlamaIndex's ",(0,s.kt)("a",{parentName:"p",href:"https://docs.llamaindex.ai/en/stable/examples/index_structs/struct_indices/SQLIndexDemo.html#part-1-text-to-sql-query-engine"},"NLSQLTableQueryEngine"),'. The level of simplicity is similar to the example I presented. In practice, it is unlikely that your chatbot or search engine will be as simple\nas the above example where the application interacts with the LLM only once. If you want\nto interact with the LLM multiple times and conditionally take one action over another action etc.,\nLangChain and LlamaIndex also provide ways to do that through their "Agents" (see ',(0,s.kt)("a",{parentName:"p",href:"https://python.langchain.com/docs/modules/agents/"},"LangChain Agents")," and ",(0,s.kt)("a",{parentName:"p",href:"https://docs.llamaindex.ai/en/stable/use_cases/agents.html"},"Llama Index Agents"),")."),(0,s.kt)("h3",{id:"how-good-are-llms-in-generating-high-level-queries"},"How Good Are LLMs in Generating High-Level Queries?"),(0,s.kt)("p",null,"Although building a text-to-high-level-query-language pipeline is now very simple with LLMs,\nsimplicity ",(0,s.kt)("strong",{parentName:"p"},"does not")," mean quality. Indeed, people building these systems are now faced with the following two important questions: "),(0,s.kt)("ol",null,(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},"How accurate are the high-level queries that LLMs generate?")),(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},"How, e.g., through what types of prompts or data modeling, can we increase the accuracy of the\nqueries generated by LLMs?"))),(0,s.kt)("p",null,"Here are several papers on this that I suggest reading:"),(0,s.kt)("ol",null,(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},(0,s.kt)("a",{parentName:"em",href:"https://arxiv.org/pdf/2303.13547.pdf"},"A comprehensive evaluation of ChatGPT\u2019s zero-shot Text-to-SQL capability"))," from Tsinghua University and University of Illinois at Chicago. 
"),(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},(0,s.kt)("a",{parentName:"em",href:"https://arxiv.org/pdf/2204.00498.pdf"},"Evaluating the Text-to-SQL Capabilities of Large Language Models"))," from researchers from Cambridge and universities and institutes from Montr\xe9al."),(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},(0,s.kt)("a",{parentName:"em",href:"https://arxiv.org/pdf/2308.15363.pdf"},"Text-to-SQL Empowered by Large Language Models: A Benchmark Evaluation"))," from Alibaba Group."),(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},(0,s.kt)("a",{parentName:"em",href:"https://arxiv.org/pdf/2305.12586.pdf"},"Enhancing Few-shot Text-to-SQL Capabilities of Large Language Models: A Study on Prompt Design Strategies"))," from Yale, Columbia, and Allen Institute for AI."),(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},(0,s.kt)("a",{parentName:"em",href:"https://arxiv.org/pdf/2305.11853.pdf"},"How to Prompt LLMs for Text-to-SQL: A Study in Zero-shot, Single-domain, and Cross-domain Settings"))," from Ohio State"),(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},(0,s.kt)("a",{parentName:"em",href:"https://arxiv.org/pdf/2311.07509.pdf"},"A Benchmark to Understand the Role of Knowledge Graphs on LLM's Accuracy for Q&A on Enterprise SQL Databases"))," from data.world.")),(0,s.kt)("p",null,"These papers are either entirely or ",(0,s.kt)("em",{parentName:"p"},"almost")," entirely evaluation-only papers that experiment with very detailed approaches of prompting LLMs\nto generate SQL queries. First, let me say that the general message these\npapers give (maybe except the last one) is that LLMs are pretty good. With right prompting (or even with basic prompting)\nthey do very well on these benchmarks. I see accuracy rates over 85% on the Spider benchmark in several papers. These are clearly\nbetter numbers than what pre-LLM state-of-the-art systems achieved. This should be impressive to many."),(0,s.kt)("p",null,'Second, the set of techniques are too detailed to cover here but some example heuristics\nthese papers experiment with include the following: (i) the syntax used for providing the schema\n(apparently putting "the pound sign ',(0,s.kt)("inlineCode",{parentName:"p"},"#")," to differentiate prompt from response in examples yields impressive performance gains\" \ud83d\ude00 go figure); (ii)\nthe number and selection of example (question, SQL) pairs, e.g., apparently there is a sweet spot in the number\nof examples to provide; or (iii) the effects of standardizing the text in the prompt, e.g., indenting and using all lower case letters consistently\n(apparently has minor but some effect). 
Yes, as interesting and important as it is to learn how to use LLMs better, I still can't escape the following thought before going to bed: somewhere out there, some advisor might be torturing some graduate student to check if the magical box produces better SQL with a pound sign vs. double slashes!

Most work I found is on generating SQL. In contrast, I found no papers that do a similar prompting study for the query languages of GDBMSs, though I ran into two papers that provide benchmarks for them: (i) [SPARQL](https://arxiv.org/abs/2309.16248); and (ii) [Cypher](https://dl.acm.org/doi/pdf/10.1145/3511808.3557703). So a low-hanging-fruit future work is the following:

*Important Future Work 1: Similar prompting studies for query languages of graph DBMSs, with a focus on recursive and union-of-join queries*: In contrast to SQL queries, here one should study the various recursive queries that the query languages of GDBMSs are particularly good at, and the union-of-join queries that are asked by omitting labels in the query languages of GDBMSs. For example, if you want to ask for all connections between your `User` nodes, and User can have many relationships, such as `Follows`, `SentMoneyTo`, or `SameFamily`, you would have to write 3 possible join queries in SQL and union them. Instead, you can write this query with a very simple syntax in Cypher as `MATCH (a:User)-[e]->(b:User)`, where the omission of the label on the relationship `e` indicates searching over all possible joins (a concrete SQL-vs-Cypher sketch appears below).[^1]

As a side note: in the context of any query language, including SQL, questions that require subqueries are of particular interest, as they are generally harder to write. Some of the papers I read had sections analyzing the performance of LLMs on nested queries, but the focus was not on these. In prior literature there are papers written solely on text-to-SQL generation for nested queries (e.g., see [the ATHENA++ paper](https://www.vldb.org/pvldb/vol13/p2747-sen.pdf)). I am certain someone somewhere is already focusing solely on nested queries, and that's a good idea.

## data.world Paper and Some Interesting Questions

In the remainder of the post I want to review [the benchmark paper](https://arxiv.org/pdf/2311.07509.pdf) from `data.world` that focuses on text-to-SQL using LLMs. Unlike other papers out there that study the effects of different prompting heuristics, this paper studies the *effects of data modeling on the accuracy of SQL queries generated by LLMs*, which is closely related to GDBMSs.
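(Before getting into the paper's details, here is the minimal sketch of the union-of-joins contrast promised in *Important Future Work 1* above. The relationship-table and column names are hypothetical:)

```python
# Hypothetical relational schema: one table per relationship between users,
# e.g., Follows(from_user, to_user), SentMoneyTo(from_user, to_user),
# SameFamily(from_user, to_user). "All connections between users" in SQL:
sql_union = """
SELECT from_user, to_user FROM Follows
UNION ALL
SELECT from_user, to_user FROM SentMoneyTo
UNION ALL
SELECT from_user, to_user FROM SameFamily
"""

# The equivalent Cypher: omitting the label on `e` covers all three
# relationship types in a single pattern.
cypher_query = "MATCH (a:User)-[e]->(b:User) RETURN a, b"
```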
"),(0,s.kt)("p",null,"Specifically, this paper is an evaluation of the performance of GPT-4 in generating SQL using no examples, i.e., zero-shot,\nwith basic prompting over a standardized insurance database schema\ncalled The ",(0,s.kt)("a",{parentName:"p",href:"https://www.omg.org/spec/PC/1.0/About-PC"},"OMG Property and Casualty Data Model"),".\nSee Figure 1 in the paper (omitted here) for the conceptual schema, which consists of classes such as\nPolicy, Account, Claims, Insurable Object, among others, and their relationships.\nThe paper has a benchmark of 43 natural language questions and compares 2 approaches to generate the SQL query.\nThe below figure shows an overview of these approaches for reference:"),(0,s.kt)("div",{class:"img-center"},(0,s.kt)("img",{src:p.Z,width:"600"})),(0,s.kt)("ol",null,(0,s.kt)("li",{parentName:"ol"},"Direct SQL Generation: In this approach, ",(0,s.kt)("span",{parentName:"li",className:"math math-inline"},(0,s.kt)("span",{parentName:"span",className:"katex"},(0,s.kt)("span",{parentName:"span",className:"katex-mathml"},(0,s.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,s.kt)("semantics",{parentName:"math"},(0,s.kt)("mrow",{parentName:"semantics"},(0,s.kt)("msub",{parentName:"mrow"},(0,s.kt)("mi",{parentName:"msub"},"Q"),(0,s.kt)("mrow",{parentName:"msub"},(0,s.kt)("mi",{parentName:"mrow"},"N"),(0,s.kt)("mi",{parentName:"mrow"},"L")))),(0,s.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"Q_{NL}")))),(0,s.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,s.kt)("span",{parentName:"span",className:"base"},(0,s.kt)("span",{parentName:"span",className:"strut",style:{height:"0.8778em",verticalAlign:"-0.1944em"}}),(0,s.kt)("span",{parentName:"span",className:"mord"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal"},"Q"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.10903em"}},"N"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,s.kt)("span",{parentName:"span"}))))))))))," and the relational schema of the OMG database is given\nto GPT-4. 
   The schema is given in terms of `CREATE TABLE` statements, such as:

   ```sql
   CREATE TABLE Claim(
     Claim_Identifier int NOT NULL,
     Catastrophe_Identifier int NULL,
     ...
     Claim_Open_Date datetime NULL,
     ...
     PRIMARY KEY (Claim_Identifier ASC),
     FOREIGN KEY (Catastrophe_Identifier) REFERENCES Catastrophe(Catastrophe_Identifier),
   ...)
   ```

   The full schema statements can be found [here](https://github.com/datadotworld/cwd-benchmark-data/blob/main/ACME_Insurance/DDL/ACME_small.ddl). GPT-4 is asked to generate a SQL query $Q_{SQL}$ that answers $Q_{NL}$.
math-inline"},(0,s.kt)("span",{parentName:"span",className:"katex"},(0,s.kt)("span",{parentName:"span",className:"katex-mathml"},(0,s.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,s.kt)("semantics",{parentName:"math"},(0,s.kt)("mrow",{parentName:"semantics"},(0,s.kt)("msub",{parentName:"mrow"},(0,s.kt)("mi",{parentName:"msub"},"Q"),(0,s.kt)("mrow",{parentName:"msub"},(0,s.kt)("mi",{parentName:"mrow"},"N"),(0,s.kt)("mi",{parentName:"mrow"},"L")))),(0,s.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"Q_{NL}")))),(0,s.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,s.kt)("span",{parentName:"span",className:"base"},(0,s.kt)("span",{parentName:"span",className:"strut",style:{height:"0.8778em",verticalAlign:"-0.1944em"}}),(0,s.kt)("span",{parentName:"span",className:"mord"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal"},"Q"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.10903em"}},"N"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,s.kt)("span",{parentName:"span"})))))))))),".\nCopy-pasted from the paper, these prompts look as follows:",(0,s.kt)("pre",{parentName:"li"},(0,s.kt)("code",{parentName:"pre"},"Given the database described by the following DDL:\n\nWrite a SQL query that answers the following question. Do not explain the query. return just the query, so it can be run\nverbatim from your response.\nHere\u2019s the question:\n\n"))),(0,s.kt)("li",{parentName:"ol"},"Indirect SQL Generation via Graph Modeling/SPARQL: In this approach, instead of the relational schema of the database, the same\ndatabase is modeled as an ",(0,s.kt)("em",{parentName:"li"},(0,s.kt)("a",{parentName:"em",href:"https://www.w3.org/OWL/"},"OWL ontology"))," (OWL is short for Web Ontology Language).\nOntology is another term for schema when modeling data as graph as classes and relationships between them. 
   OWL is a W3C standard and part of the RDF technology stack, so OWL ontologies are expressed as a set of RDF triples, such as:

   ```
   ...
   in:Claim rdf:type owl:Class ;
       rdfs:isDefinedBy ;
       rdfs:label "Claim" .
   in:claimOpenDate rdf:type owl:DatatypeProperty ;
       rdfs:domain in:Claim ;
       rdfs:range xsd:dateTime ;
       rdfs:isDefinedBy ;
       rdfs:label "Claim Open Date" .
   in:hasCatastrophe rdf:type owl:ObjectProperty ;
       rdfs:domain in:Claim ;
       rdfs:range in:Catastrophe ;
       rdfs:isDefinedBy ;
       rdfs:label "has catastrophe" .
   ...
   ```

   The full ontology can be found [here](https://github.com/datadotworld/cwd-benchmark-data/blob/main/ACME_Insurance/ontology/insurance.ttl). GPT-4 is then asked to generate a SPARQL query $Q_{SPARQL}$, instead of SQL, for the same $Q_{NL}$.
math-inline"},(0,s.kt)("span",{parentName:"span",className:"katex"},(0,s.kt)("span",{parentName:"span",className:"katex-mathml"},(0,s.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,s.kt)("semantics",{parentName:"math"},(0,s.kt)("mrow",{parentName:"semantics"},(0,s.kt)("msub",{parentName:"mrow"},(0,s.kt)("mi",{parentName:"msub"},"Q"),(0,s.kt)("mrow",{parentName:"msub"},(0,s.kt)("mi",{parentName:"mrow"},"N"),(0,s.kt)("mi",{parentName:"mrow"},"L")))),(0,s.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"Q_{NL}")))),(0,s.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,s.kt)("span",{parentName:"span",className:"base"},(0,s.kt)("span",{parentName:"span",className:"strut",style:{height:"0.8778em",verticalAlign:"-0.1944em"}}),(0,s.kt)("span",{parentName:"span",className:"mord"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal"},"Q"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.10903em"}},"N"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,s.kt)("span",{parentName:"span"})))))))))),". The full prompt, again copy-pasted\nfrom the paper with some simplifications, looks like this:",(0,s.kt)("pre",{parentName:"li"},(0,s.kt)("code",{parentName:"pre"},"Given the OWL model described in the following TTL file:\n\nWrite a SPARQL query that answers the question. Do not explain the query. 
   return just the query, so it can be run verbatim from your response.
   Here's the question:
   <QUESTION>
   ```

   As a last step, the authors use a direct mapping from $Q_{SPARQL}$ to a SQL query
mathnormal"},"Q"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"SQ"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.2861em"}},(0,s.kt)("span",{parentName:"span"})))))))))),". This is a quite straigh-forward step\nas the modeling as an ontology vs relational schema have direct translations from classes and properties to tables and columns.")),(0,s.kt)("p",null,"An interesting comparison. There is some intuition for why one would be interested in the effectiveness of\nquery generation through an ontology because one of the well-known\npre-LLM text-to-SQL papers ",(0,s.kt)("a",{parentName:"p",href:"https://www.vldb.org/pvldb/vol9/p1209-saha.pdf"},"ATHENA")," did something similar.\nInstead of SPARQL they had another query language over an ontology called Ontology Query Language, which\nwas then mapped to SQL. "),(0,s.kt)("p",null,"The results are even more interesting. The authors categorize their 43 questions into\n4 quadrants based on 2 dimensions: "),(0,s.kt)("ul",null,(0,s.kt)("li",{parentName:"ul"},"Low vs high ",(0,s.kt)("strong",{parentName:"li"},"question")," complexity: Questions that require only simple projections\nare low complexity. Those that require aggregations or math functions are high complexity."),(0,s.kt)("li",{parentName:"ul"},"Low vs high ",(0,s.kt)("strong",{parentName:"li"},"schema")," complexity: Questions whose SQL queries require up to 4 tables are low schema complexity. Those that\nrequire 5 or more joins are high schema complexity. ")),(0,s.kt)("p",null,'The accuracy results are shown below. Accuracy here is "execution accuracy" meaning that only the answers of the queries\nare checked against the ground truth answer. 
That is, even if the SQL query GPT-4 generated was not actually correct
but computed the correct answers by luck, the paper takes it as correct (apparently this happens very rarely in this study).

| Overall: 16.7% vs 54.2% | Low Schema Complexity | High Schema Complexity |
|---|---|---|
| **Low Question Complexity** | 37.4% vs 66.9% | 0% vs 38.7% |
| **High Question Complexity** | 25.5% vs 71.1% | 0% vs 35.7% |

Overall, the indirect SQL generation method through SPARQL is much more effective in this zero-shot setting.
Not surprisingly, questions that require 5 or more joins are harder regardless of the
method used, and direct SQL generation cannot get any of those questions right. These are interesting
results for an initial study on the effects of data modeling on LLMs' accuracy in generating database queries.
These results should give many researchers and practitioners ideas about how to replicate
and validate/invalidate similar results under different settings, e.g., with few-shot
examples and under different databases.

**That said, one should ask: why?** In fact, we should all be suspicious that merely modeling the
same set of records with a different abstraction should have any visible effects. After all, by modeling
the same records differently, one does not obtain or lose information. So if and when LLMs are smart enough,
they shouldn't care how the data was modeled. But for now, if a pound sign can make a difference,
we should not be surprised that modeling choices can have large impacts. As such, it is healthy to be suspicious
and ask why. This motivates a few important questions I think are worth studying. My premise
is that if the differences are this large, the task must somehow have gotten simpler for GPT-4 when it was
asked to generate a SPARQL query. I can hypothesize about a few possible reasons for this:

1. *Some queries require fewer tokens to write in SPARQL*: One difference the query languages
of GDBMSs often have is that certain equality conditions are implicit in the syntax, which
means their `WHERE` clauses are simpler for some queries.
   For example, if you wanted to return
   the name of the Catastrophe that the Claim with ID Claim1 has, in SPARQL you can write it as:

   ```
   SELECT ?name
   WHERE { in:Claim1 in:hasCatastrophe ?catastrophe .
           ?catastrophe in:catastropheName ?name }
   ```

   In SQL you would write:

   ```
   SELECT Catastrophe_Name
   FROM Claim, Catastrophe
   WHERE Claim.Catastrophe_Identifier = Catastrophe.Catastrophe_Identifier AND
         Claim.Claim_Identifier = Claim1
   ```

   Note that the `Claim.Claim_Identifier = Claim1` equality condition is implicit in the `in:Claim1 in:hasCatastrophe ?catastrophe` triple,
   and the `Claim.Catastrophe_Identifier = Catastrophe.Catastrophe_Identifier` condition is implicit in the fact that `?catastrophe` appears
   in both the first and the second triple of the SPARQL query. Such implicit equality conditions are common in the query languages of
   GDBMSs, especially when expressing joins. For example, in Cypher you can omit all join conditions in `WHERE` clauses, as long
   as those joins have been pre-defined to the system as relationships. Instead, you join records through the `(a)-[e]->(b)` syntax
   (see the short Cypher sketch after this list).
   It's unclear how much this matters, but it is an immediate advantage of SPARQL that can explain why complex join queries are easier to generate
   in SPARQL than in SQL.

   **Side note**: On the flip side, SPARQL can be more verbose in projections. For example, if you wanted to return the number, open, and close
   dates of every claim, you'd write the following SQL query:

   ```
   SELECT Claim_Number, Claim_Open_Date, Claim_Close_Date
   FROM Claim
   ```

   In SPARQL, you'd have to write both the names of the properties you want to project and give each an additional variable, as follows:

   ```
   SELECT ?number ?open_date ?close_date
   WHERE { ?claim in:claimNumber ?number ;
           in:claimOpenDate ?open_date ;
           in:claimCloseDate ?close_date }
   ```

2. *Graph modeling gives explicit names to foreign keys*: There is a reason that database courses teach database modeling to students
using graph-based models, such as Entity-Relationship or UML models. First, humans think of the world
as objects/entities and their relationships. In some sense, these are higher-level models in which relationships
between objects are denoted explicitly, with explicit names (instead of as less explicit foreign key constraints).
For example, the implicit connection between Claims and
Catastrophes through the `FOREIGN KEY (Catastrophe_Identifier) REFERENCES Catastrophe(Catastrophe_Identifier)`
constraint was given an explicit English name, `hasCatastrophe`, in the ontology. This explicitness may make
it easier for LLMs to understand the schema and generate SPARQL queries.
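To make the Cypher point in hypothesis 1 concrete, here is a rough sketch of the same Claim1 query in Cypher, assuming a hypothetical graph schema with `Claim` and `Catastrophe` node tables and a `HasCatastrophe` relationship (these names are mine, not from the paper):

```
MATCH (c:Claim)-[:HasCatastrophe]->(cat:Catastrophe)
WHERE c.Claim_Identifier = 'Claim1'
RETURN cat.Catastrophe_Name
```

As in SPARQL, the join condition between claims and catastrophes is implicit in the arrow pattern; only the filter on the claim's identifier remains explicit.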
Both of these are qualitative hypotheses. However, there is a more immediate
reason the authors of this paper may have obtained such major differences between the two approaches they tried.
Intentionally or unintentionally, their ontology is simplified significantly compared to their relational schema.
For example, the Claim relation has `Claim_Reopen_Date` and `Claim_Status_Code` properties, which are removed from the ontology.
Many such properties seem to have been removed from the relations, and the ontology overall looks simpler.
There are also several differences between the ontology and the relational schema that are confusing. For example,
the [ontology](https://github.com/datadotworld/cwd-benchmark-data/blob/main/ACME_Insurance/ontology/insurance.ttl)
has a class `Agent`, and `Policy` objects are `in:soldByAgent` by some `Agent` objects (see lines 20 and 92). I cannot
see corresponding relations or columns in the [relational schema](https://github.com/datadotworld/cwd-benchmark-data/blob/main/ACME_Insurance/DDL/ACME_small.ddl). Unless I am missing something about how the prompts were given,
these differences are also likely to have important effects on the results, and someone should fix them and obtain new results
in a fairer comparison.

Let me next raise several high-level questions that I think are important:

*Important Future Work 2: Rules of thumb in data modeling to make LLM-generated queries more accurate.*
I think the higher-level question of studying the effects of data modeling in more depth is a very good direction.
As LLMs get smarter, I would expect the presence/absence of a pound sign or the style of English
to matter less. These look more like syntactic differences that can be automatically detected over time.
Modeling choices are more fundamental and relate to the clarity and understandability of the records that will be queried by the LLM.
So identifying some rules of thumb here looks like the promising path forward. Let me list a few immediate questions one can study:

*Important Future Work 2.1: Effects of normalization/denormalization.* If the shortcoming of GPT-4 is
generating queries with many joins, one way to address it is to denormalize the relations into fewer
tables and study the effects. Again, I'm thinking of the same records, just modeled differently with fewer
tables. What happens if we reduce all data into a single table with dozens of columns and many value repetitions?
Now all possible joins would have been performed, so we'd force the LLM to write a join-less query with
filters, distincts, and aggregations. What happens if we normalize the tables step by step until we
get to a well-known form, such as [Boyce-Codd Normal Form](https://en.wikipedia.org/wiki/Boyce%E2%80%93Codd_normal_form)? Do we consistently get better or worse accuracy?
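As a minimal sketch of that single-table experiment, reusing the benchmark's `Claim` and `Catastrophe` tables (the denormalized table's name and exact column list are my illustration, not from the paper):

```
-- A denormalized "universal" table: every claim row repeats its catastrophe's columns.
CREATE TABLE Claim_Catastrophe_Denorm AS
SELECT c.Claim_Identifier, c.Claim_Open_Date, c.Claim_Close_Date,
       cat.Catastrophe_Identifier, cat.Catastrophe_Name
FROM Claim c
LEFT JOIN Catastrophe cat
    ON c.Catastrophe_Identifier = cat.Catastrophe_Identifier;

-- The earlier two-table join question now only needs a join-less filter:
SELECT Catastrophe_Name
FROM Claim_Catastrophe_Denorm
WHERE Claim_Identifier = 'Claim1';
```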
Similar to a ",(0,s.kt)("span",{parentName:"p",className:"math math-inline"},(0,s.kt)("span",{parentName:"span",className:"katex"},(0,s.kt)("span",{parentName:"span",className:"katex-mathml"},(0,s.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,s.kt)("semantics",{parentName:"math"},(0,s.kt)("mrow",{parentName:"semantics"},(0,s.kt)("msub",{parentName:"mrow"},(0,s.kt)("mi",{parentName:"msub"},"Q"),(0,s.kt)("mrow",{parentName:"msub"},(0,s.kt)("mi",{parentName:"mrow"},"N"),(0,s.kt)("mi",{parentName:"mrow"},"L")))),(0,s.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"Q_{NL}")))),(0,s.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,s.kt)("span",{parentName:"span",className:"base"},(0,s.kt)("span",{parentName:"span",className:"strut",style:{height:"0.8778em",verticalAlign:"-0.1944em"}}),(0,s.kt)("span",{parentName:"span",className:"mord"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal"},"Q"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.10903em"}},"N"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,s.kt)("span",{parentName:"span"}))))))))))," -","[LLM]","-> ",(0,s.kt)("span",{parentName:"p",className:"math math-inline"},(0,s.kt)("span",{parentName:"span",className:"katex"},(0,s.kt)("span",{parentName:"span",className:"katex-mathml"},(0,s.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,s.kt)("semantics",{parentName:"math"},(0,s.kt)("mrow",{parentName:"semantics"},(0,s.kt)("msub",{parentName:"mrow"},(0,s.kt)("mi",{parentName:"msub"},"Q"),(0,s.kt)("mrow",{parentName:"msub"},(0,s.kt)("mi",{parentName:"mrow"},"S"),(0,s.kt)("mi",{parentName:"mrow"},"P"),(0,s.kt)("mi",{parentName:"mrow"},"A"),(0,s.kt)("mi",{parentName:"mrow"},"R"),(0,s.kt)("mi",{parentName:"mrow"},"Q"),(0,s.kt)("mi",{parentName:"mrow"},"L")))),(0,s.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"Q_{SPARQL}")))),(0,s.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,s.kt)("span",{parentName:"span",className:"base"},(0,s.kt)("span",{parentName:"span",className:"strut",style:{height:"0.9694em",verticalAlign:"-0.2861em"}}),(0,s.kt)("span",{parentName:"span",className:"mord"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal"},"Q"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t 
vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.13889em"}},"SP"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"A"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"RQ"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.2861em"}},(0,s.kt)("span",{parentName:"span"}))))))))))," -","[Direct Mapping]","-> ",(0,s.kt)("span",{parentName:"p",className:"math math-inline"},(0,s.kt)("span",{parentName:"span",className:"katex"},(0,s.kt)("span",{parentName:"span",className:"katex-mathml"},(0,s.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,s.kt)("semantics",{parentName:"math"},(0,s.kt)("mrow",{parentName:"semantics"},(0,s.kt)("msub",{parentName:"mrow"},(0,s.kt)("mi",{parentName:"msub"},"Q"),(0,s.kt)("mrow",{parentName:"msub"},(0,s.kt)("mi",{parentName:"mrow"},"S"),(0,s.kt)("mi",{parentName:"mrow"},"Q"),(0,s.kt)("mi",{parentName:"mrow"},"L")))),(0,s.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"Q_{SQL}")))),(0,s.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,s.kt)("span",{parentName:"span",className:"base"},(0,s.kt)("span",{parentName:"span",className:"strut",style:{height:"0.9694em",verticalAlign:"-0.2861em"}}),(0,s.kt)("span",{parentName:"span",className:"mord"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal"},"Q"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"SQ"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.2861em"}},(0,s.kt)("span",{parentName:"span"}))))))))))," pipeline,\none can test the effectiveness of ",(0,s.kt)("span",{parentName:"p",className:"math 
math-inline"},(0,s.kt)("span",{parentName:"span",className:"katex"},(0,s.kt)("span",{parentName:"span",className:"katex-mathml"},(0,s.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,s.kt)("semantics",{parentName:"math"},(0,s.kt)("mrow",{parentName:"semantics"},(0,s.kt)("msub",{parentName:"mrow"},(0,s.kt)("mi",{parentName:"msub"},"Q"),(0,s.kt)("mrow",{parentName:"msub"},(0,s.kt)("mi",{parentName:"mrow"},"N"),(0,s.kt)("mi",{parentName:"mrow"},"L")))),(0,s.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"Q_{NL}")))),(0,s.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,s.kt)("span",{parentName:"span",className:"base"},(0,s.kt)("span",{parentName:"span",className:"strut",style:{height:"0.8778em",verticalAlign:"-0.1944em"}}),(0,s.kt)("span",{parentName:"span",className:"mord"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal"},"Q"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.10903em"}},"N"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,s.kt)("span",{parentName:"span"}))))))))))," -","[LLM]","-> ",(0,s.kt)("span",{parentName:"p",className:"math math-inline"},(0,s.kt)("span",{parentName:"span",className:"katex"},(0,s.kt)("span",{parentName:"span",className:"katex-mathml"},(0,s.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,s.kt)("semantics",{parentName:"math"},(0,s.kt)("mrow",{parentName:"semantics"},(0,s.kt)("msub",{parentName:"mrow"},(0,s.kt)("mi",{parentName:"msub"},"Q"),(0,s.kt)("mrow",{parentName:"msub"},(0,s.kt)("mi",{parentName:"mrow"},"S"),(0,s.kt)("mi",{parentName:"mrow"},"Q"),(0,s.kt)("mi",{parentName:"mrow"},"L"),(0,s.kt)("mo",{parentName:"mrow"},"\u2212"),(0,s.kt)("mi",{parentName:"mrow"},"o"),(0,s.kt)("mi",{parentName:"mrow"},"v"),(0,s.kt)("mi",{parentName:"mrow"},"e"),(0,s.kt)("mi",{parentName:"mrow"},"r"),(0,s.kt)("mo",{parentName:"mrow"},"\u2212"),(0,s.kt)("mi",{parentName:"mrow"},"V"),(0,s.kt)("mi",{parentName:"mrow"},"i"),(0,s.kt)("mi",{parentName:"mrow"},"e"),(0,s.kt)("mi",{parentName:"mrow"},"w"),(0,s.kt)("mi",{parentName:"mrow"},"s")))),(0,s.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"Q_{SQL-over-Views}")))),(0,s.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,s.kt)("span",{parentName:"span",className:"base"},(0,s.kt)("span",{parentName:"span",className:"strut",style:{height:"0.9694em",verticalAlign:"-0.2861em"}}),(0,s.kt)("span",{parentName:"span",className:"mord"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal"},"Q"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t 
vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"SQ"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"),(0,s.kt)("span",{parentName:"span",className:"mbin mtight"},"\u2212"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"o"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.03588em"}},"v"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.02778em"}},"er"),(0,s.kt)("span",{parentName:"span",className:"mbin mtight"},"\u2212"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"Vi"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"e"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.02691em"}},"w"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"s"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.2861em"}},(0,s.kt)("span",{parentName:"span"}))))))))))," -","[Direct Mapping]","-> ",(0,s.kt)("span",{parentName:"p",className:"math math-inline"},(0,s.kt)("span",{parentName:"span",className:"katex"},(0,s.kt)("span",{parentName:"span",className:"katex-mathml"},(0,s.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,s.kt)("semantics",{parentName:"math"},(0,s.kt)("mrow",{parentName:"semantics"},(0,s.kt)("msub",{parentName:"mrow"},(0,s.kt)("mi",{parentName:"msub"},"Q"),(0,s.kt)("mrow",{parentName:"msub"},(0,s.kt)("mi",{parentName:"mrow"},"S"),(0,s.kt)("mi",{parentName:"mrow"},"Q"),(0,s.kt)("mi",{parentName:"mrow"},"L")))),(0,s.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"Q_{SQL}")))),(0,s.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,s.kt)("span",{parentName:"span",className:"base"},(0,s.kt)("span",{parentName:"span",className:"strut",style:{height:"0.9694em",verticalAlign:"-0.2861em"}}),(0,s.kt)("span",{parentName:"span",className:"mord"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal"},"Q"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"SQ"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal 
mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.2861em"}},(0,s.kt)("span",{parentName:"span"}))))))))))," pipeline."),(0,s.kt)("p",null,(0,s.kt)("em",{parentName:"p"},"Important Future Work 3: Use of Cypher as intermediate query language to translate to SQL.")," One reason to experiment with Cypher\nin addition to SPARQL is that Cypher is, arguably, more similar to SQL than SPARQL but has the advantage that (common) join\nconditions are implicit in the ",(0,s.kt)("inlineCode",{parentName:"p"},"(a)-[e]->(b)")," node-arrow syntax. Yet Cypher does not have the verbosity of the SPARQL projections\nI mentioned above (so you project properties the same way you project columns in SQL). In my world, all high-level query languages\nlook very similar to SQL, so eventually when LLMs are smart enough, or even today, I think these language differences\nshould have minor effects. However, graph query languages will likely continue to have major advantages when writing\nrecursive queries, as they have specialized syntax (e.g., Cypher has the Kleene star syntax) to do so. For those queries,\nexpressing first in Cypher and then mapping to SQL could lead to an advantage. "),(0,s.kt)("h2",{id:"final-words"},"Final Words"),(0,s.kt)("p",null,"Needless to say, in the next few years, the field will be flooded with work on how to\nuse LLMs to solve the text-to-high-level-query problem. Many rules of thumb will emerge\nabout how to prompt them correctly. The questions one can ask in this space is endless.\nI can speculate about it a lot, but I think it's plausible that\nmany of these rules of thumb, specifically the syntactic\ndifferences in prompting, can become\nobsolete very quickly as newer and more advanced LLMs that are better at speaking high-level database languages emerge.\nFor example, it's plausible that people will stop showing LLMs example (question, query) pairs each time they ask them to generate\nSQL once LLMs are better at speaking SQL."),(0,s.kt)("p",null,"However, the harder question of how to model the data so that its meaning is clear, and the\nqueries that need to be written, are simpler, is more likely to remain a challenge for a longer time. I would not be too optimistic\nthat there can emerge very clear answers to this question. How to model your data is part-art and part-science.\nYet, some studiable questions, such as the effects of normalization, use of views or generating Cypher for recursive queries,\ncan yield some important best practices that can be useful to developers building these systems."),(0,s.kt)("p",null,"In the next post, I will cover what I learned about RAG over unstructured data. Graphs and knowledge graphs are playing\na more interesting role in that space. 
## Final Words

Needless to say, in the next few years, the field will be flooded with work on how to
use LLMs to solve the text-to-high-level-query problem. Many rules of thumb will emerge
about how to prompt them correctly. The questions one can ask in this space are endless.
I can speculate about it a lot, but I think it's plausible that
many of these rules of thumb, specifically the syntactic
differences in prompting, will become
obsolete very quickly as newer and more advanced LLMs that are better at speaking high-level database languages emerge.
For example, it's plausible that people will stop showing LLMs example (question, query) pairs each time they ask them to generate
SQL once LLMs are better at speaking SQL.

However, the harder question of how to model the data so that its meaning is clear, and so that the
queries that need to be written are simpler, is more likely to remain a challenge for a longer time. I would not be too optimistic
that very clear answers to this question will emerge. How to model your data is part art and part science.
Yet some studiable questions, such as the effects of normalization, the use of views, or generating Cypher for recursive queries,
can yield important best practices that will be useful to developers building these systems.

In the next post, I will cover what I learned about RAG over unstructured data. Graphs and knowledge graphs are playing
a more interesting role in that space. Until then, happy new year to all!

[^1]: SPARQL syntax is different, but a similar advantage exists by omitting type constraints.

diff --git a/docusaurus/assets/js/beadeff4.2e3dfba8.js b/docusaurus/assets/js/beadeff4.2e3dfba8.js
new file mode 100644
index 000000000..861b6745d
--- /dev/null
+++ b/docusaurus/assets/js/beadeff4.2e3dfba8.js

---
slug: llms-graphs-part-1
authors: [semih]
tags: [use-case]
---

# RAG Using Structured Data: Overview & Important Questions
at UWaterloo",url:"https://cs.uwaterloo.ca/~ssalihog/",imageURL:"https://kuzudb.com/img/blog/semih.jpg",key:"semih"}],frontMatter:{slug:"llms-graphs-part-1",authors:["semih"],tags:["use-case"]},nextItem:{title:"K\xf9zu 0.1.0 Release",permalink:"/docusaurus/blog/kuzu-0.1.0-release"}},h={authorsImageUrls:[void 0]},N=[{value:"Killer App: Retrieval Augmented Generation",id:"killer-app-retrieval-augmented-generation",level:2},{value:"A note on the term RAG",id:"a-note-on-the-term-rag",level:3},{value:"RAG Using Structured Data: Text-to-High-level-Query",id:"rag-using-structured-data-text-to-high-level-query",level:2},{value:"Overview",id:"overview",level:3},{value:"Simplicity of Developing RAG Systems: LangChain and LlamaIndex",id:"simplicity-of-developing-rag-systems-langchain-and-llamaindex",level:3},{value:"How Good Are LLMs in Generating High-Level Queries?",id:"how-good-are-llms-in-generating-high-level-queries",level:3},{value:"data.world Paper and Some Interesting Questions",id:"dataworld-paper-and-some-interesting-questions",level:2},{value:"Final Words",id:"final-words",level:2}],c={toc:N},k="wrapper";function d(a){let{components:e,...t}=a;return(0,s.kt)(k,(0,n.Z)({},c,t,{components:e,mdxType:"MDXLayout"}),(0,s.kt)("p",null,"During the holiday season, I did some reading on\nLLMs and specifically on the techniques that use LLMs together with graph databases and knowledge graphs.\nIf you are new to the area like me, the amount of activity on this topic on social\nmedia as well as in research publications may have intimidated you.\nIf so, you're exactly my target audience for this new blog post series I am starting.\nMy goals are two-fold: "),(0,s.kt)("ol",null,(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},"Overview the area"),": I want to present what I learned with a simple and consistent terminology and at\na more technical depth than you might find in other blog posts. I am aiming a depth similar to what I aim when preparing\na lecture. I will link to many quality and technically satisfying pieces of content (mainly papers since the area is very researchy)."),(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},"Overview important future work"),": I want to cover several important future works in the space. I don't\nnecessarily mean work for research contributions but also simple approaches to experiment with if you are\nbuilding question answering (Q&A) applications using LLMs and graph technology.")),(0,s.kt)("p",null,"This post covers the topic of retrieval augmented generation (RAG) using structured data. Then, in a follow up post,\nI will cover RAG using unstructured data, where\nI will also mention a few ways people are building RAG-based Q&A\xa0systems that use both structured and unstructured data."),(0,s.kt)("admonition",{title:"TL;DR: The key takeaways from this post are:",type:"tip"},(0,s.kt)("ul",{parentName:"admonition"},(0,s.kt)("li",{parentName:"ul"},(0,s.kt)("strong",{parentName:"li"},"RAG overview"),": RAG is a technique to fill the knowledge gap of LLMs using private data. RAG systems\nuse private structured records stored in a database and/or unstructured data in text files. 
"),(0,s.kt)("li",{parentName:"ul"},(0,s.kt)("strong",{parentName:"li"},"Impressive simplicity and effectiveness of developing a natural language interface over your database using LLMs"),": In the pre-LLM era, the amount of engineering effort\nto develop a pipeline that delivered a natural language interface over your database was ",(0,s.kt)("em",{parentName:"li"},"immense"),". The\nhard problem was to teach a model to ",(0,s.kt)("em",{parentName:"li"},"speak"),' SQL, Cypher, or SPARQL.\nThis contrasts sharply with the simplicity of developing similar pipelines now because LLMs already "speak" these languages.\nThe hard task now is for ',(0,s.kt)("em",{parentName:"li"},"developers to learn how to prompt LLMs")," to get correct database queries. Furthermore, there is\nevidence that LLMs, if prompted correctly, will generate a decent proportion of queries with impressive accuracy. "),(0,s.kt)("li",{parentName:"ul"},(0,s.kt)("strong",{parentName:"li"},"Lack of work that studies LLMs' ability to generate Cypher or SPARQL:")," Most technically-deep work on understanding\nLLMs' ability to generate accurate high-level query languages is on SQL. We need more\nwork understanding the behavior of LLMs on the query languages of GDBMSs (like Cypher or SPARQL), specifically on recursive and union-of-join queries."),(0,s.kt)("li",{parentName:"ul"},(0,s.kt)("strong",{parentName:"li"},"Studying the effects of data modeling (normalization, views, graph modeling) on the accuracy of LLM-generated queries is important:"),"\nMany people are studying heuristics for prompting LLMs to increase their efficiency focusing on the syntax and the structure of providing\nthe schema and selection of examples in the prompt. An important and under-studied\nproblem is the effects of data modeling choices on the accuracy of the queries generated by LLMs. I point to ",(0,s.kt)("a",{parentName:"li",href:"https://arxiv.org/pdf/2311.07509.pdf"},"one interesting paper")," in this space and raise several questions related to\nnormalizations and use of views in relational modeling and comparisons with graph modeling approaches. "))),(0,s.kt)("h2",{id:"killer-app-retrieval-augmented-generation"},"Killer App: Retrieval Augmented Generation"),(0,s.kt)("p",null,"Let's review the killer application of LLMs in enterprises.\nThe application is ultimately Q&A over private enterprise data. 
Think of a chatbot to which you
can ask natural language questions ($Q_{NL}$), such as "Who is our top paying customer from Waterloo?"
or "What are data privacy regulations in Canada we need to comply with?",
and get back natural language answers
vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.10903em"}},"N"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,s.kt)("span",{parentName:"span"})))))))))),").\nLLMs, out of the box, cannot answer these questions because they have a ",(0,s.kt)("em",{parentName:"p"},"knowledge gap"),".\nFor example, LLMs never had any access to your sales records when they were trained.\nTherefore, they need to retrieve or be provided with\nextra information from private data sources of the enterprise."),(0,s.kt)("h3",{id:"a-note-on-the-term-rag"},"A note on the term RAG"),(0,s.kt)("p",null,"There seems to be tremendous interest in building systems that combine a traditional\ninformation retrieval component, e.g., one that looks up some documents from\nan index, with a natural language generator component, such as an LLM. The term for such systems is\n",(0,s.kt)("em",{parentName:"p"},"Retrieval Augmented Generation")," (RAG).\nThe term is coined in ",(0,s.kt)("a",{parentName:"p",href:"https://arxiv.org/pdf/2005.11401.pdf"},"this paper"),' to refer\nto the method of fine-tuning an LLM with additional information, i.e.,\nusing this additional data to train a new variant of the LLM.\nThe original usage form in the paper is "RAG models". Nowadays it is used in a variety of ways,\nsuch as, "RAG system", "RAG-based system", "RAG does X", or\n"Building RAG with Y". RAG often does not refer to fine-tuning LLMs any more. 
Instead, it
refers to providing LLMs with private data along with the question, to fix the knowledge gap.
Even systems that simply use an LLM to convert a
$Q_{NL}$ to a SQL or Cypher query and simply return the results of the query
are called "RAG systems" in some documentation. I will use the term in this broader sense.

You can build RAG-based Q&A systems by using structured and/or unstructured
data.
The high-level views of these systems look like this:

![Q&A over enterprise data](./qa-over-enterprise-data.png)

## RAG Using Structured Data: Text-to-High-level-Query

*Note: If you are familiar with how to develop RAG systems with LangChain and LlamaIndex, you can skip directly
to the "[How Good Are LLMs in Generating High-Level Queries?](#how-good-are-llms-in-generating-high-level-queries)" part, which
reflects on the reading I did on RAG using structured data.*

### Overview

Many blog posts and several papers concern Q&A systems that simply convert
$Q_{NL}$ to a high-level query language, such as SQL, Cypher, or SPARQL, using an LLM.
The figure below describes the overall approach:

![Two SQL generation approaches](./rag-using-structured-data.png)
math-inline"},(0,s.kt)("span",{parentName:"span",className:"katex"},(0,s.kt)("span",{parentName:"span",className:"katex-mathml"},(0,s.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,s.kt)("semantics",{parentName:"math"},(0,s.kt)("mrow",{parentName:"semantics"},(0,s.kt)("msub",{parentName:"mrow"},(0,s.kt)("mi",{parentName:"msub"},"Q"),(0,s.kt)("mrow",{parentName:"msub"},(0,s.kt)("mi",{parentName:"mrow"},"N"),(0,s.kt)("mi",{parentName:"mrow"},"L")))),(0,s.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"Q_{NL}")))),(0,s.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,s.kt)("span",{parentName:"span",className:"base"},(0,s.kt)("span",{parentName:"span",className:"strut",style:{height:"0.8778em",verticalAlign:"-0.1944em"}}),(0,s.kt)("span",{parentName:"span",className:"mord"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal"},"Q"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.10903em"}},"N"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,s.kt)("span",{parentName:"span"})))))))))),', the schema of a database, and optionally\nsome example natural language question and high-level query examples, are given\nto the LLM as a prompt.\nThe terms "no shot", "one shot", or "few shot" refer to the number of examples provided\nin the prompt. Depending on the underlying database, the schema may contain\ncolumns of relational tables and their descriptions, or labels of nodes and edges\nof a graph database. 
Using $Q_{NL}$, the database schema, and optionally
some examples, the LLM generates
a database query, such as SQL or Cypher.
The system runs this query against the\nDBMS and returns back the query result or using the LLM again, converts\nthe query result back to a natural language answer ",(0,s.kt)("span",{parentName:"p",className:"math math-inline"},(0,s.kt)("span",{parentName:"span",className:"katex"},(0,s.kt)("span",{parentName:"span",className:"katex-mathml"},(0,s.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,s.kt)("semantics",{parentName:"math"},(0,s.kt)("mrow",{parentName:"semantics"},(0,s.kt)("msub",{parentName:"mrow"},(0,s.kt)("mi",{parentName:"msub"},"A"),(0,s.kt)("mrow",{parentName:"msub"},(0,s.kt)("mi",{parentName:"mrow"},"N"),(0,s.kt)("mi",{parentName:"mrow"},"L")))),(0,s.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"A_{NL}")))),(0,s.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,s.kt)("span",{parentName:"span",className:"base"},(0,s.kt)("span",{parentName:"span",className:"strut",style:{height:"0.8333em",verticalAlign:"-0.15em"}}),(0,s.kt)("span",{parentName:"span",className:"mord"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal"},"A"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.10903em"}},"N"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,s.kt)("span",{parentName:"span"})))))))))),". 
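To make the data flow concrete, here is a minimal sketch of this pipeline in Python. It is an illustration only: `llm_complete` and `run_query` are hypothetical stand-ins for an LLM completion call and a DBMS client, which the real tools shown below provide for you.

```python
def llm_complete(prompt: str) -> str:
    """Hypothetical stand-in for a call to an LLM completion API."""
    raise NotImplementedError

def run_query(query: str) -> list:
    """Hypothetical stand-in for executing a query on the DBMS."""
    raise NotImplementedError

def answer_question(question: str, schema: str) -> str:
    # Step 1: ask the LLM to translate Q_NL into a database query,
    # giving it the schema (and optionally a few examples) as context.
    query = llm_complete(
        f"Schema:\n{schema}\n\nWrite a query that answers:\n{question}"
    )
    # Step 2: run the generated query against the DBMS.
    result = run_query(query)
    # Step 3 (optional): ask the LLM to turn the raw result into A_NL.
    return llm_complete(
        f"Question: {question}\nQuery result: {result}\n"
        "State the answer in natural language."
    )
```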
"),(0,s.kt)("p",null,(0,s.kt)("strong",{parentName:"p"},"Let us pause here to appreciate one thing:")," For many decades, the database community has studied the problem\nof converting ",(0,s.kt)("span",{parentName:"p",className:"math math-inline"},(0,s.kt)("span",{parentName:"span",className:"katex"},(0,s.kt)("span",{parentName:"span",className:"katex-mathml"},(0,s.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,s.kt)("semantics",{parentName:"math"},(0,s.kt)("mrow",{parentName:"semantics"},(0,s.kt)("msub",{parentName:"mrow"},(0,s.kt)("mi",{parentName:"msub"},"Q"),(0,s.kt)("mrow",{parentName:"msub"},(0,s.kt)("mi",{parentName:"mrow"},"N"),(0,s.kt)("mi",{parentName:"mrow"},"L")))),(0,s.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"Q_{NL}")))),(0,s.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,s.kt)("span",{parentName:"span",className:"base"},(0,s.kt)("span",{parentName:"span",className:"strut",style:{height:"0.8778em",verticalAlign:"-0.1944em"}}),(0,s.kt)("span",{parentName:"span",className:"mord"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal"},"Q"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.10903em"}},"N"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,s.kt)("span",{parentName:"span"})))))))))),' to SQL (aka "text-to-SQL"). Here is a good recent ',(0,s.kt)("a",{parentName:"p",href:"https://link.springer.com/article/10.1007/s00778-022-00776-8"},"survey paper"),"\nthat covers only the deep network-based approaches and ",(0,s.kt)("a",{parentName:"p",href:"https://www.nowpublishers.com/article/Details/DBS-078"},"a more extensive survey/book"),"\non the broader topic of natural language interfaces to databases.\nNeither of these surveys cover any work that directly uses LLMs such as GPT models,\nwhich are quite recent developments. 
Take any of the work covered in these surveys and\nyou'll find an approach that requires significant engineering to build the pipeline shown in the above figure.\nThere exist several pre-LLM text-to-SQL systems (e.g., ",(0,s.kt)("a",{parentName:"p",href:"https://www.vldb.org/pvldb/vol9/p1209-saha.pdf"},"ATHENA"),"\nor ",(0,s.kt)("a",{parentName:"p",href:"https://download.hrz.tu-darmstadt.de/pub/FB20/Dekanat/Publikationen/UKP/76500354.pdf"},"BELA"),").\nFor example, most of the pre-LLM approaches that use deep learning require\nhard work ",(0,s.kt)("em",{parentName:"p"},'to teach a model how to "speak" SQL')," using large\ncorpora of tables and (question, query) examples, such as ",(0,s.kt)("a",{parentName:"p",href:"https://arxiv.org/abs/1709.00103"},"WikiSQL")," or ",(0,s.kt)("a",{parentName:"p",href:"https://github.com/taoyds/spider"},"Spider"),".\nPeople had to solve and glue-together solutions to many technical problems, such as parsing the question,\nentity detection, synonym finding, string similarity, among others.\nPost-LLM approaches require ",(0,s.kt)("em",{parentName:"p"},"none")," of these efforts because LLMs, such as GPT-4, already speak SQL, Cypher, and SPARQL out of the box, having been exposed to them in their pretraining.\nNowadays, the hard problem now is for developers ",(0,s.kt)("em",{parentName:"p"},"to learn how to prompt LLMs")," so that\nLLMs generate correct queries. I'll say more about this problem. In contrast, building the above pipeline requires much less effort as\nI'll show next."),(0,s.kt)("h3",{id:"simplicity-of-developing-rag-systems-langchain-and-llamaindex"},"Simplicity of Developing RAG Systems: LangChain and LlamaIndex"),(0,s.kt)("p",null,"If you have been following the developments in the LLM space, you will not be surprised to hear that nowadays people build\nQ&A systems that convert ",(0,s.kt)("span",{parentName:"p",className:"math math-inline"},(0,s.kt)("span",{parentName:"span",className:"katex"},(0,s.kt)("span",{parentName:"span",className:"katex-mathml"},(0,s.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,s.kt)("semantics",{parentName:"math"},(0,s.kt)("mrow",{parentName:"semantics"},(0,s.kt)("msub",{parentName:"mrow"},(0,s.kt)("mi",{parentName:"msub"},"Q"),(0,s.kt)("mrow",{parentName:"msub"},(0,s.kt)("mi",{parentName:"mrow"},"N"),(0,s.kt)("mi",{parentName:"mrow"},"L")))),(0,s.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"Q_{NL}")))),(0,s.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,s.kt)("span",{parentName:"span",className:"base"},(0,s.kt)("span",{parentName:"span",className:"strut",style:{height:"0.8778em",verticalAlign:"-0.1944em"}}),(0,s.kt)("span",{parentName:"span",className:"mord"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal"},"Q"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal 
mtight",style:{marginRight:"0.10903em"}},"N"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,s.kt)("span",{parentName:"span"}))))))))))," to a high-level query language using two common tools:\n(i) ",(0,s.kt)("a",{parentName:"p",href:"https://www.langchain.com/"},"LangChain"),"; and (ii) ",(0,s.kt)("a",{parentName:"p",href:"https://www.llamaindex.ai/"},"LlamaIndex"),".\nThe same tools also integrate with the underlying storage system to load and retrieve your data. To make this more concrete, let me review the ",(0,s.kt)("a",{parentName:"p",href:"https://python.langchain.com/docs/use_cases/graph/graph_kuzu_qa"},"K\xf9zu-LangChain integration"),", similar to the integrations found in other GDBMSs. You as a programmer have very little to do: you prepare your K\xf9zu\ndatabase ",(0,s.kt)("inlineCode",{parentName:"p"},"db")," and load your data into it, wrap it around a ",(0,s.kt)("inlineCode",{parentName:"p"},"KuzuGraph")," and ",(0,s.kt)("inlineCode",{parentName:"p"},"KuzuQAChain")," objects in Python and you have\na text-to-Cypher pipeline:"),(0,s.kt)("pre",null,(0,s.kt)("code",{parentName:"pre",className:"language-python"},'import kuzu\nfrom langchain.chains import KuzuQAChain\nfrom langchain_community.chat_models import ChatOpenAI\nfrom langchain_community.graphs import KuzuGraph\n\ndb = kuzu.Database("test_db")\n... // create your graph if needed\ngraph = KuzuGraph(db)\nchain = KuzuQAChain.from_llm(ChatOpenAI(temperature=0), graph=graph, verbose=True)\nchain.run("Who played in The Godfather: Part II?")\n')),(0,s.kt)("p",null,"I am following the example application in this ",(0,s.kt)("a",{parentName:"p",href:"https://python.langchain.com/docs/use_cases/graph/graph_kuzu_qa"},"documentation"),",\nwhich uses a database of movies, actors, and directors. 
"),(0,s.kt)("pre",null,(0,s.kt)("code",{parentName:"pre",className:"language-bash"},"Output:\n> Entering new chain...\nGenerated Cypher:\nMATCH (p:Person)-[:ActedIn]->(m:Movie {name: 'The Godfather: Part II'}) RETURN p.name\nFull Context:\n[{'p.name': 'Al Pacino'}, {'p.name': 'Robert De Niro'}]\n\n> Finished chain.\n\n'Al Pacino and Robert De Niro both played in The Godfather: Part II.'\n")),(0,s.kt)("p",null,'The "chain" first generated a Cypher query using ',(0,s.kt)("span",{parentName:"p",className:"math math-inline"},(0,s.kt)("span",{parentName:"span",className:"katex"},(0,s.kt)("span",{parentName:"span",className:"katex-mathml"},(0,s.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,s.kt)("semantics",{parentName:"math"},(0,s.kt)("mrow",{parentName:"semantics"},(0,s.kt)("msub",{parentName:"mrow"},(0,s.kt)("mi",{parentName:"msub"},"Q"),(0,s.kt)("mrow",{parentName:"msub"},(0,s.kt)("mi",{parentName:"mrow"},"N"),(0,s.kt)("mi",{parentName:"mrow"},"L")))),(0,s.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"Q_{NL}")))),(0,s.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,s.kt)("span",{parentName:"span",className:"base"},(0,s.kt)("span",{parentName:"span",className:"strut",style:{height:"0.8778em",verticalAlign:"-0.1944em"}}),(0,s.kt)("span",{parentName:"span",className:"mord"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal"},"Q"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.10903em"}},"N"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,s.kt)("span",{parentName:"span"})))))))))),".\nBehind the curtain, i.e., inside the KuzuQAChain code,\na GPT model was given the following prompt:"),(0,s.kt)("pre",null,(0,s.kt)("code",{parentName:"pre",className:"language-bash"},"Generate Cypher statement to query a graph database.\nInstructions:\nUse only the provided relationship types and properties in the schema.\nDo not use any other relationship types or properties that are not provided.\n\nSchema:\nNode properties: [{'properties': [('name', 'STRING')], 'label': 'Movie'}, {'properties': [('name', 'STRING'), ('birthDate', 'STRING')], 'label': 'Person'}]\nRelationships properties: [{'properties': [], 'label': 'ActedIn'}]\nRelationships: ['(:Person)-[:ActedIn]->(:Movie)']\n\nNote: Do not include any explanations or apologies in your responses.\nDo not respond to any questions that might ask anything else than for you to construct a Cypher statement.\nDo not include any text except the generated Cypher statement.\n\nThe question is:\nWho played in The Godfather: Part II?\n")),(0,s.kt)("p",null,"Indeed, if you copy this prompt and paste it in 
",(0,s.kt)("a",{parentName:"p",href:"https://chat.openai.com/"},"chatGPT's browser interface"),",\nyou will get the same or very similar Cypher query. The important point is: that's all\nthe coding you have to do to build a natural language interface that can query your database.\nYou ultimately construct a string prompt that contains ",(0,s.kt)("span",{parentName:"p",className:"math math-inline"},(0,s.kt)("span",{parentName:"span",className:"katex"},(0,s.kt)("span",{parentName:"span",className:"katex-mathml"},(0,s.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,s.kt)("semantics",{parentName:"math"},(0,s.kt)("mrow",{parentName:"semantics"},(0,s.kt)("msub",{parentName:"mrow"},(0,s.kt)("mi",{parentName:"msub"},"Q"),(0,s.kt)("mrow",{parentName:"msub"},(0,s.kt)("mi",{parentName:"mrow"},"N"),(0,s.kt)("mi",{parentName:"mrow"},"L")))),(0,s.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"Q_{NL}")))),(0,s.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,s.kt)("span",{parentName:"span",className:"base"},(0,s.kt)("span",{parentName:"span",className:"strut",style:{height:"0.8778em",verticalAlign:"-0.1944em"}}),(0,s.kt)("span",{parentName:"span",className:"mord"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal"},"Q"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.10903em"}},"N"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,s.kt)("span",{parentName:"span"})))))))))),", some\ninstructions, and schema of the database, and the LLM will generate a query for you.\nThe ",(0,s.kt)("inlineCode",{parentName:"p"},"KuzuGraph")," and ",(0,s.kt)("inlineCode",{parentName:"p"},"KuzuQAChain")," are simple wrappers to do just that.\nIf you want to play around with how well this works on other datasets,\nwe have this pipeline implemented in K\xf9zu's browser frontend ",(0,s.kt)("a",{parentName:"p",href:"https://kuzudb.com/docusaurus/kuzuexplorer/"},"K\xf9zuExplorer"),". 
"),(0,s.kt)("p",null,'That is, for any database you have in K\xf9zu, you get a natural language interface over it in\nK\xf9zuExplorer (just click the "robot icon" on the left panel).\nYou can develop similar pipelines with other GDBMSs using similar interfaces (',(0,s.kt)("em",{parentName:"p"},"though I recommend using K\xf9zu as it will be the\nsimplest to get started")," \ud83d\ude09: ",(0,s.kt)("em",{parentName:"p"},"Unlike other GDBMSs, K\xf9zu is embeddable and requires no server set up"),").\nIf you instead want to build Q&A systems over your RDBMSs, you can use\nLangChain's ",(0,s.kt)("a",{parentName:"p",href:"https://python.langchain.com/docs/use_cases/qa_structured/sql#case-2-text-to-sql-query-and-execution"},"SQLDatabaseChain")," and\n",(0,s.kt)("a",{parentName:"p",href:"https://python.langchain.com/docs/use_cases/qa_structured/sql#case-3-sql-agents"},"SQLAgent")," or\nLlamaIndex's ",(0,s.kt)("a",{parentName:"p",href:"https://docs.llamaindex.ai/en/stable/examples/index_structs/struct_indices/SQLIndexDemo.html#part-1-text-to-sql-query-engine"},"NLSQLTableQueryEngine"),'. The level of simplicity is similar to the example I presented. In practice, it is unlikely that your chatbot or search engine will be as simple\nas the above example where the application interacts with the LLM only once. If you want\nto interact with the LLM multiple times and conditionally take one action over another action etc.,\nLangChain and LlamaIndex also provide ways to do that through their "Agents" (see ',(0,s.kt)("a",{parentName:"p",href:"https://python.langchain.com/docs/modules/agents/"},"LangChain Agents")," and ",(0,s.kt)("a",{parentName:"p",href:"https://docs.llamaindex.ai/en/stable/use_cases/agents.html"},"Llama Index Agents"),")."),(0,s.kt)("h3",{id:"how-good-are-llms-in-generating-high-level-queries"},"How Good Are LLMs in Generating High-Level Queries?"),(0,s.kt)("p",null,"Although building a text-to-high-level-query-language pipeline is now very simple with LLMs,\nsimplicity ",(0,s.kt)("strong",{parentName:"p"},"does not")," mean quality. Indeed, people building these systems are now faced with the following two important questions: "),(0,s.kt)("ol",null,(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},"How accurate are the high-level queries that LLMs generate?")),(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},"How, e.g., through what types of prompts or data modeling, can we increase the accuracy of the\nqueries generated by LLMs?"))),(0,s.kt)("p",null,"Here are several papers on this that I suggest reading:"),(0,s.kt)("ol",null,(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},(0,s.kt)("a",{parentName:"em",href:"https://arxiv.org/pdf/2303.13547.pdf"},"A comprehensive evaluation of ChatGPT\u2019s zero-shot Text-to-SQL capability"))," from Tsinghua University and University of Illinois at Chicago. 
"),(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},(0,s.kt)("a",{parentName:"em",href:"https://arxiv.org/pdf/2204.00498.pdf"},"Evaluating the Text-to-SQL Capabilities of Large Language Models"))," from researchers from Cambridge and universities and institutes from Montr\xe9al."),(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},(0,s.kt)("a",{parentName:"em",href:"https://arxiv.org/pdf/2308.15363.pdf"},"Text-to-SQL Empowered by Large Language Models: A Benchmark Evaluation"))," from Alibaba Group."),(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},(0,s.kt)("a",{parentName:"em",href:"https://arxiv.org/pdf/2305.12586.pdf"},"Enhancing Few-shot Text-to-SQL Capabilities of Large Language Models: A Study on Prompt Design Strategies"))," from Yale, Columbia, and Allen Institute for AI."),(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},(0,s.kt)("a",{parentName:"em",href:"https://arxiv.org/pdf/2305.11853.pdf"},"How to Prompt LLMs for Text-to-SQL: A Study in Zero-shot, Single-domain, and Cross-domain Settings"))," from Ohio State"),(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},(0,s.kt)("a",{parentName:"em",href:"https://arxiv.org/pdf/2311.07509.pdf"},"A Benchmark to Understand the Role of Knowledge Graphs on LLM's Accuracy for Q&A on Enterprise SQL Databases"))," from data.world.")),(0,s.kt)("p",null,"These papers are either entirely or ",(0,s.kt)("em",{parentName:"p"},"almost")," entirely evaluation-only papers that experiment with very detailed approaches of prompting LLMs\nto generate SQL queries. First, let me say that the general message these\npapers give (maybe except the last one) is that LLMs are pretty good. With right prompting (or even with basic prompting)\nthey do very well on these benchmarks. I see accuracy rates over 85% on the Spider benchmark in several papers. These are clearly\nbetter numbers than what pre-LLM state-of-the-art systems achieved. This should be impressive to many."),(0,s.kt)("p",null,'Second, the set of techniques are too detailed to cover here but some example heuristics\nthese papers experiment with include the following: (i) the syntax used for providing the schema\n(apparently putting "the pound sign ',(0,s.kt)("inlineCode",{parentName:"p"},"#")," to differentiate prompt from response in examples yields impressive performance gains\" \ud83d\ude00 go figure); (ii)\nthe number and selection of example (question, SQL) pairs, e.g., apparently there is a sweet spot in the number\nof examples to provide; or (iii) the effects of standardizing the text in the prompt, e.g., indenting and using all lower case letters consistently\n(apparently has minor but some effect). 
Yes, as interesting and important as it is to learn how to use LLMs better, I still can't escape the following thought before going to bed: somewhere out there, some advisor might be torturing some graduate student to check if the magical box produces better SQL with a pound sign vs double slashes!

Most of the work I found is on generating SQL. In contrast, I found no papers that do a similar prompting study for the query languages of GDBMSs, though I ran into two papers that provide benchmarks for them: (i) [SPARQL](https://arxiv.org/abs/2309.16248); and (ii) [Cypher](https://dl.acm.org/doi/pdf/10.1145/3511808.3557703). So a low-hanging fruit for future work is the following:

*Important Future Work 1: Similar prompting studies for query languages of graph DBMSs, with a focus on recursive queries and unions of join queries.* In contrast to SQL queries, here one should study the various recursive queries that the query languages of GDBMSs are particularly good at, and the union-of-join queries that are asked by omitting labels in these query languages. For example, if you want to ask for all connections between your `User` nodes, and users can have many relationships, such as `Follows`, `SentMoneyTo`, or `SameFamily`, you would have to write 3 possible join queries in SQL and union them. Instead, you can write this query with a very simple syntax in Cypher as `MATCH (a:User)-[e]->(b:User)`, where the omission of the label on the relationship `e` indicates searching over all possible joins (see the sketch below).[^1]
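To spell that example out, here is a sketch of the two versions. The SQL table and column names (`Users(user_id)` plus one `from_user`/`to_user` table per relationship type) are illustrative assumptions, not a schema from the post:

```sql
-- Relational version: one join query per relationship table, unioned.
SELECT u1.user_id, u2.user_id
FROM Users u1 JOIN Follows r ON u1.user_id = r.from_user
              JOIN Users u2 ON r.to_user = u2.user_id
UNION
SELECT u1.user_id, u2.user_id
FROM Users u1 JOIN SentMoneyTo r ON u1.user_id = r.from_user
              JOIN Users u2 ON r.to_user = u2.user_id
UNION
SELECT u1.user_id, u2.user_id
FROM Users u1 JOIN SameFamily r ON u1.user_id = r.from_user
              JOIN Users u2 ON r.to_user = u2.user_id;
```

In Cypher, one pattern covers all three relationship types, because omitting the label on `e` searches over every relationship:

```cypher
MATCH (a:User)-[e]->(b:User)
RETURN a, e, b;
```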
"),(0,s.kt)("p",null,"Specifically, this paper is an evaluation of the performance of GPT-4 in generating SQL using no examples, i.e., zero-shot,\nwith basic prompting over a standardized insurance database schema\ncalled The ",(0,s.kt)("a",{parentName:"p",href:"https://www.omg.org/spec/PC/1.0/About-PC"},"OMG Property and Casualty Data Model"),".\nSee Figure 1 in the paper (omitted here) for the conceptual schema, which consists of classes such as\nPolicy, Account, Claims, Insurable Object, among others, and their relationships.\nThe paper has a benchmark of 43 natural language questions and compares 2 approaches to generate the SQL query.\nThe below figure shows an overview of these approaches for reference:"),(0,s.kt)("div",{class:"img-center"},(0,s.kt)("img",{src:p.Z,width:"600"})),(0,s.kt)("ol",null,(0,s.kt)("li",{parentName:"ol"},"Direct SQL Generation: In this approach, ",(0,s.kt)("span",{parentName:"li",className:"math math-inline"},(0,s.kt)("span",{parentName:"span",className:"katex"},(0,s.kt)("span",{parentName:"span",className:"katex-mathml"},(0,s.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,s.kt)("semantics",{parentName:"math"},(0,s.kt)("mrow",{parentName:"semantics"},(0,s.kt)("msub",{parentName:"mrow"},(0,s.kt)("mi",{parentName:"msub"},"Q"),(0,s.kt)("mrow",{parentName:"msub"},(0,s.kt)("mi",{parentName:"mrow"},"N"),(0,s.kt)("mi",{parentName:"mrow"},"L")))),(0,s.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"Q_{NL}")))),(0,s.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,s.kt)("span",{parentName:"span",className:"base"},(0,s.kt)("span",{parentName:"span",className:"strut",style:{height:"0.8778em",verticalAlign:"-0.1944em"}}),(0,s.kt)("span",{parentName:"span",className:"mord"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal"},"Q"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.10903em"}},"N"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,s.kt)("span",{parentName:"span"}))))))))))," and the relational schema of the OMG database is given\nto GPT-4. 
The schema is given in terms of ",(0,s.kt)("inlineCode",{parentName:"li"},"CREATE TABLE")," statements, such as:",(0,s.kt)("pre",{parentName:"li"},(0,s.kt)("code",{parentName:"pre",className:"language-sql"},"CREATE TABLE Claim(\nClaim_Identifier int NOT NULL,\nCatastrophe_Identifier int NULL,\n...\nClaim_Open_Date datetime NULL ,\n ...\n PRIMARY KEY (Claim_Identifier ASC),\n FOREIGN KEY (Catastrophe_Identifier) REFERENCES Catastrophe(Catastrophe_Identifier),\n...)\n")),"The full schema statements can be found ",(0,s.kt)("a",{parentName:"li",href:"https://github.com/datadotworld/cwd-benchmark-data/blob/main/ACME_Insurance/DDL/ACME_small.ddl"},"here"),".\nGPT-4 is asked to generate a SQL query ",(0,s.kt)("span",{parentName:"li",className:"math math-inline"},(0,s.kt)("span",{parentName:"span",className:"katex"},(0,s.kt)("span",{parentName:"span",className:"katex-mathml"},(0,s.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,s.kt)("semantics",{parentName:"math"},(0,s.kt)("mrow",{parentName:"semantics"},(0,s.kt)("msub",{parentName:"mrow"},(0,s.kt)("mi",{parentName:"msub"},"Q"),(0,s.kt)("mrow",{parentName:"msub"},(0,s.kt)("mi",{parentName:"mrow"},"S"),(0,s.kt)("mi",{parentName:"mrow"},"Q"),(0,s.kt)("mi",{parentName:"mrow"},"L")))),(0,s.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"Q_{SQL}")))),(0,s.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,s.kt)("span",{parentName:"span",className:"base"},(0,s.kt)("span",{parentName:"span",className:"strut",style:{height:"0.9694em",verticalAlign:"-0.2861em"}}),(0,s.kt)("span",{parentName:"span",className:"mord"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal"},"Q"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"SQ"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.2861em"}},(0,s.kt)("span",{parentName:"span"}))))))))))," to answer ",(0,s.kt)("span",{parentName:"li",className:"math 
math-inline"},(0,s.kt)("span",{parentName:"span",className:"katex"},(0,s.kt)("span",{parentName:"span",className:"katex-mathml"},(0,s.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,s.kt)("semantics",{parentName:"math"},(0,s.kt)("mrow",{parentName:"semantics"},(0,s.kt)("msub",{parentName:"mrow"},(0,s.kt)("mi",{parentName:"msub"},"Q"),(0,s.kt)("mrow",{parentName:"msub"},(0,s.kt)("mi",{parentName:"mrow"},"N"),(0,s.kt)("mi",{parentName:"mrow"},"L")))),(0,s.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"Q_{NL}")))),(0,s.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,s.kt)("span",{parentName:"span",className:"base"},(0,s.kt)("span",{parentName:"span",className:"strut",style:{height:"0.8778em",verticalAlign:"-0.1944em"}}),(0,s.kt)("span",{parentName:"span",className:"mord"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal"},"Q"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.10903em"}},"N"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,s.kt)("span",{parentName:"span"})))))))))),".\nCopy-pasted from the paper, these prompts look as follows:",(0,s.kt)("pre",{parentName:"li"},(0,s.kt)("code",{parentName:"pre"},"Given the database described by the following DDL:\n\nWrite a SQL query that answers the following question. Do not explain the query. return just the query, so it can be run\nverbatim from your response.\nHere\u2019s the question:\n\n"))),(0,s.kt)("li",{parentName:"ol"},"Indirect SQL Generation via Graph Modeling/SPARQL: In this approach, instead of the relational schema of the database, the same\ndatabase is modeled as an ",(0,s.kt)("em",{parentName:"li"},(0,s.kt)("a",{parentName:"em",href:"https://www.w3.org/OWL/"},"OWL ontology"))," (OWL is short for Web Ontology Language).\nOntology is another term for schema when modeling data as graph as classes and relationships between them. 
OWL is a W3C standard\nand part of the RDF technology stack so OWL ontologies are expressed as a set RDF triples, such as:",(0,s.kt)("pre",{parentName:"li"},(0,s.kt)("code",{parentName:"pre"},'...\nin:Claim rdf:type owl:Class ;\n rdfs:isDefinedBy ;\n rdfs:label "Claim" .\nin:claimOpenDate rdf:type owl:DatatypeProperty ;\n rdfs:domain in:Claim ;\n rdfs:range xsd:dateTime ;\n rdfs:isDefinedBy ;\n rdfs:label "Claim Open Date" .\nin:hasCatastrophe rdf:type owl:ObjectProperty ;\n rdfs:domain in:Claim ;\n rdfs:range in:Catastrophe ;\n rdfs:isDefinedBy ;\n rdfs:label "has catastrophe" .\n...\n')),"The full ontology can be found ",(0,s.kt)("a",{parentName:"li",href:"https://github.com/datadotworld/cwd-benchmark-data/blob/main/ACME_Insurance/ontology/insurance.ttl"},"here"),".\nGPT-4 is then asked to generate a SPARQL query ",(0,s.kt)("span",{parentName:"li",className:"math math-inline"},(0,s.kt)("span",{parentName:"span",className:"katex"},(0,s.kt)("span",{parentName:"span",className:"katex-mathml"},(0,s.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,s.kt)("semantics",{parentName:"math"},(0,s.kt)("mrow",{parentName:"semantics"},(0,s.kt)("msub",{parentName:"mrow"},(0,s.kt)("mi",{parentName:"msub"},"Q"),(0,s.kt)("mrow",{parentName:"msub"},(0,s.kt)("mi",{parentName:"mrow"},"S"),(0,s.kt)("mi",{parentName:"mrow"},"P"),(0,s.kt)("mi",{parentName:"mrow"},"A"),(0,s.kt)("mi",{parentName:"mrow"},"R"),(0,s.kt)("mi",{parentName:"mrow"},"Q"),(0,s.kt)("mi",{parentName:"mrow"},"L")))),(0,s.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"Q_{SPARQL}")))),(0,s.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,s.kt)("span",{parentName:"span",className:"base"},(0,s.kt)("span",{parentName:"span",className:"strut",style:{height:"0.9694em",verticalAlign:"-0.2861em"}}),(0,s.kt)("span",{parentName:"span",className:"mord"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal"},"Q"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.13889em"}},"SP"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"A"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"RQ"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.2861em"}},(0,s.kt)("span",{parentName:"span"})))))))))),", instead of SQL, for the same ",(0,s.kt)("span",{parentName:"li",className:"math 
math-inline"},(0,s.kt)("span",{parentName:"span",className:"katex"},(0,s.kt)("span",{parentName:"span",className:"katex-mathml"},(0,s.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,s.kt)("semantics",{parentName:"math"},(0,s.kt)("mrow",{parentName:"semantics"},(0,s.kt)("msub",{parentName:"mrow"},(0,s.kt)("mi",{parentName:"msub"},"Q"),(0,s.kt)("mrow",{parentName:"msub"},(0,s.kt)("mi",{parentName:"mrow"},"N"),(0,s.kt)("mi",{parentName:"mrow"},"L")))),(0,s.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"Q_{NL}")))),(0,s.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,s.kt)("span",{parentName:"span",className:"base"},(0,s.kt)("span",{parentName:"span",className:"strut",style:{height:"0.8778em",verticalAlign:"-0.1944em"}}),(0,s.kt)("span",{parentName:"span",className:"mord"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal"},"Q"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.10903em"}},"N"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,s.kt)("span",{parentName:"span"})))))))))),". The full prompt, again copy-pasted\nfrom the paper with some simplifications, looks like this:",(0,s.kt)("pre",{parentName:"li"},(0,s.kt)("code",{parentName:"pre"},"Given the OWL model described in the following TTL file:\n\nWrite a SPARQL query that answers the question. Do not explain the query. 
return just the query, so it can be run verbatim from your response.\nHere\u2019s the question:\n\n")),"As a last step, the authors have a direct mapping from ",(0,s.kt)("span",{parentName:"li",className:"math math-inline"},(0,s.kt)("span",{parentName:"span",className:"katex"},(0,s.kt)("span",{parentName:"span",className:"katex-mathml"},(0,s.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,s.kt)("semantics",{parentName:"math"},(0,s.kt)("mrow",{parentName:"semantics"},(0,s.kt)("msub",{parentName:"mrow"},(0,s.kt)("mi",{parentName:"msub"},"Q"),(0,s.kt)("mrow",{parentName:"msub"},(0,s.kt)("mi",{parentName:"mrow"},"S"),(0,s.kt)("mi",{parentName:"mrow"},"P"),(0,s.kt)("mi",{parentName:"mrow"},"A"),(0,s.kt)("mi",{parentName:"mrow"},"R"),(0,s.kt)("mi",{parentName:"mrow"},"Q"),(0,s.kt)("mi",{parentName:"mrow"},"L")))),(0,s.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"Q_{SPARQL}")))),(0,s.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,s.kt)("span",{parentName:"span",className:"base"},(0,s.kt)("span",{parentName:"span",className:"strut",style:{height:"0.9694em",verticalAlign:"-0.2861em"}}),(0,s.kt)("span",{parentName:"span",className:"mord"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal"},"Q"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.13889em"}},"SP"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"A"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"RQ"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.2861em"}},(0,s.kt)("span",{parentName:"span"}))))))))))," to a SQL query ",(0,s.kt)("span",{parentName:"li",className:"math math-inline"},(0,s.kt)("span",{parentName:"span",className:"katex"},(0,s.kt)("span",{parentName:"span",className:"katex-mathml"},(0,s.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,s.kt)("semantics",{parentName:"math"},(0,s.kt)("mrow",{parentName:"semantics"},(0,s.kt)("msub",{parentName:"mrow"},(0,s.kt)("mi",{parentName:"msub"},"Q"),(0,s.kt)("mrow",{parentName:"msub"},(0,s.kt)("mi",{parentName:"mrow"},"S"),(0,s.kt)("mi",{parentName:"mrow"},"Q"),(0,s.kt)("mi",{parentName:"mrow"},"L")))),(0,s.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"Q_{SQL}")))),(0,s.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,s.kt)("span",{parentName:"span",className:"base"},(0,s.kt)("span",{parentName:"span",className:"strut",style:{height:"0.9694em",verticalAlign:"-0.2861em"}}),(0,s.kt)("span",{parentName:"span",className:"mord"},(0,s.kt)("span",{parentName:"span",className:"mord 
mathnormal"},"Q"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"SQ"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.2861em"}},(0,s.kt)("span",{parentName:"span"})))))))))),". This is a quite straigh-forward step\nas the modeling as an ontology vs relational schema have direct translations from classes and properties to tables and columns.")),(0,s.kt)("p",null,"An interesting comparison. There is some intuition for why one would be interested in the effectiveness of\nquery generation through an ontology because one of the well-known\npre-LLM text-to-SQL papers ",(0,s.kt)("a",{parentName:"p",href:"https://www.vldb.org/pvldb/vol9/p1209-saha.pdf"},"ATHENA")," did something similar.\nInstead of SPARQL they had another query language over an ontology called Ontology Query Language, which\nwas then mapped to SQL. "),(0,s.kt)("p",null,"The results are even more interesting. The authors categorize their 43 questions into\n4 quadrants based on 2 dimensions: "),(0,s.kt)("ul",null,(0,s.kt)("li",{parentName:"ul"},"Low vs high ",(0,s.kt)("strong",{parentName:"li"},"question")," complexity: Questions that require only simple projections\nare low complexity. Those that require aggregations or math functions are high complexity."),(0,s.kt)("li",{parentName:"ul"},"Low vs high ",(0,s.kt)("strong",{parentName:"li"},"schema")," complexity: Questions whose SQL queries require up to 4 tables are low schema complexity. Those that\nrequire 5 or more joins are high schema complexity. ")),(0,s.kt)("p",null,'The accuracy results are shown below. Accuracy here is "execution accuracy" meaning that only the answers of the queries\nare checked against the ground truth answer. 
That is, even if the SQL query GPT-4 generated was actually not correct but by luck computed the correct answers, the paper takes it as correct (apparently this happens very rarely in this study).

| Overall: 16.7% vs 54.2% | Low Schema Complexity | High Schema Complexity |
| --- | --- | --- |
| **Low Question Complexity** | 37.4% vs 66.9% | 0% vs 38.7% |
| **High Question Complexity** | 25.5% vs 71.1% | 0% vs 35.7% |

Overall, the indirect SQL generation method through SPARQL is much more effective in this zero-shot setting. Not surprisingly, questions that require 5 or more joins are harder regardless of the method used, and direct SQL generation cannot get any of those questions right. These are interesting results for an initial study on the effects of data modeling on LLMs' accuracy in generating database queries. They should give many researchers and practitioners ideas about how to replicate and validate/invalidate similar results under different settings, e.g., with few-shot examples and over different databases.

**That said, one should ask: why?** In fact, we should all be suspicious that merely modeling the same set of records with a different abstraction should have any visible effect. After all, by modeling the same records differently, one does not gain or lose information. So if and when LLMs are smart enough, they shouldn't care how the data was modeled. But for now, if a pound sign can make a difference, we should not be surprised that modeling choices can have large impacts. As such, it is healthy to be suspicious and ask why. This motivates a few important questions I think are worth studying. My premise is that if the differences are this large, it must be that the task got simpler for GPT-4 when it was asked to generate a SPARQL query. I can hypothesize about a few possible reasons for this:

1. *Some queries require fewer tokens to write in SPARQL*: One difference the query languages of GDBMSs often have is that certain equality conditions are implicit in the syntax, which means their `WHERE` clauses are simpler for some queries. For example, if you wanted to return the name of the catastrophe of the claim with ID `Claim1`, in SPARQL you can write:

   ```
   SELECT ?name
   WHERE { in:Claim1 in:hasCatastrophe ?catastrophe .
           ?catastrophe in:catastropheName ?name }
   ```

   In SQL you would write:

   ```
   SELECT Catastrophe_Name
   FROM Claim, Catastrophe
   WHERE Claim.Catastrophe_Identifier = Catastrophe.Catastrophe_Identifier AND
         Claim.Claim_Identifier = Claim1
   ```

   Note that the `Claim.Claim_Identifier = Claim1` equality condition is implicit in the `in:Claim1 in:hasCatastrophe ?catastrophe` triple, and the `Claim.Catastrophe_Identifier = Catastrophe.Catastrophe_Identifier` condition is implicit in the fact that `?catastrophe` appears in both the first and second triples of the SPARQL query. Such implicit equality conditions are common in graph query languages, especially when expressing joins. For example, in Cypher you can omit all join conditions in `WHERE` clauses as long as those joins have been pre-defined to the system as relationships; instead, you join records through the `(a)-[e]->(b)` syntax. It's unclear how much this could matter, but it is an immediate advantage of SPARQL that can explain why complex join queries are easier to generate in SPARQL than in SQL.

   **Side note**: On the flip side, SPARQL can be more verbose in projections. For example, if you wanted to return the number, open date, and close date of every claim, you'd write the following SQL query:

   ```
   SELECT Claim_Number, Claim_Open_Date, Claim_Close_Date
   FROM Claim
   ```

   In SPARQL, you'd have to write both the name of each property you want to project and give it an additional variable, as follows:

   ```
   SELECT ?number ?open_date ?close_date
   WHERE { ?claim in:claimNumber ?number .
           ?claim in:claimOpenDate ?open_date .
           ?claim in:claimCloseDate ?close_date }
   ```

2. *Graph modeling gives explicit names to foreign keys:* There is a reason that database courses teach data modeling to students using graph-based models, such as Entity-Relationship or UML models: humans think of the world as objects/entities and their relationships. In some sense, these are higher-level models, where relationships between objects are denoted explicitly, with explicit names (instead of as less explicit foreign key constraints). For example, the implicit connection between claims and catastrophes through the `FOREIGN KEY (Catastrophe_Identifier) REFERENCES Catastrophe(Catastrophe_Identifier)` constraint was given an explicit English name in the ontology: `hasCatastrophe`. This explicitness may make it easier for LLMs to understand the schema and generate SPARQL queries.
Both of these are qualitative hypotheses. However, there is a more immediate reason the authors of this paper may have obtained such major differences between the two approaches they tried. Intentionally or unintentionally, their ontology is simplified significantly compared to their relational schema. For example, the Claim relation has `Claim_Reopen_Date` and `Claim_Status_Code` properties that are removed from the ontology. Many such properties seem to have been removed from the relations, and the ontology overall looks simpler. There are also several differences between the ontology and the relational schema that are confusing. For example, the [ontology](https://github.com/datadotworld/cwd-benchmark-data/blob/main/ACME_Insurance/ontology/insurance.ttl) has a class `Agent`, and `Policy` objects are `in:soldByAgent` by some `Agent` objects (see lines 20 and 92). I cannot see corresponding relations or columns in the [relational schema](https://github.com/datadotworld/cwd-benchmark-data/blob/main/ACME_Insurance/DDL/ACME_small.ddl). Unless I am missing something about how the prompts were given, these differences are also likely to have important effects on the results, and someone should fix them and obtain new results in a fairer comparison.

Let me next raise several high-level questions that I think are important:

*Important Future Work 2: Rules of thumb in data modeling to make LLM-generated queries more accurate.* I think the higher-level question of studying the effects of data modeling in more depth is a very good direction. As LLMs get smarter, I would expect the presence/absence of a pound sign or the style of English to matter less. These look more like syntactic differences that can be automatically detected over time. Modeling choices are more fundamental and relate to the clarity and understandability of the records that will be queried by the LLM. So identifying some rules of thumb here looks like the promising path forward. Let me list a few immediate questions one can study:

*Important Future Work 2.1: Effects of normalization/denormalization.* If the shortcoming of GPT-4 is generating queries with many joins, one way to address this is to denormalize the relations into fewer tables and study the effects. Again, I'm thinking of the same records, just modeled differently with fewer tables. What happens if we reduce all the data into a single table with dozens of columns and many repeated values? Now all possible joins would have been performed, so we'd force the LLM to write a join-less query with filters, distincts, and aggregations, as in the sketch below. What happens if we instead normalize the tables step by step until we get to a well-known form, such as [Boyce-Codd Normal Form](https://en.wikipedia.org/wiki/Boyce%E2%80%93Codd_normal_form)? Do we consistently get better or worse accuracy?
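As a toy illustration of the single-table extreme, using the insurance schema from above: `Claim_Flat` is a hypothetical wide table that pre-joins `Claim` with `Catastrophe`, and the catastrophe name is an illustrative filter value.

```sql
-- Normalized schema: the LLM must generate a join.
SELECT c.Claim_Identifier
FROM Claim c, Catastrophe cat
WHERE c.Catastrophe_Identifier = cat.Catastrophe_Identifier
  AND cat.Catastrophe_Name = 'Some Hurricane';

-- Fully denormalized: the same question becomes a join-less filter,
-- at the cost of repeating catastrophe values in every claim row.
SELECT Claim_Identifier
FROM Claim_Flat
WHERE Catastrophe_Name = 'Some Hurricane';
```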
*Important Future Work 2.2: Use of views.* In relational modeling, views are an effective way to obtain a higher-level and simpler model of your records. Similar to the $Q_{NL}$ -[LLM]-> $Q_{SPARQL}$ -[Direct Mapping]-> $Q_{SQL}$ pipeline, one can test the effectiveness of a $Q_{NL}$ -[LLM]-> $Q_{SQL-over-Views}$ -[Direct Mapping]-> $Q_{SQL}$ pipeline.
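As a sketch of what that pipeline could look like (again with hypothetical schema names), the view is defined once as part of data modeling, the LLM targets the view, and the "Direct Mapping" step is then simply the DBMS's standard view expansion:

```sql
-- Defined once by the data modeler over the normalized tables (hypothetical schema):
CREATE VIEW customer_claims AS
SELECT c.customer_id, c.customer_name, cl.claim_status, cl.claim_amount
FROM customers AS c
JOIN claims AS cl ON cl.customer_id = c.customer_id;

-- The LLM only has to generate a join-less query over the view;
-- the DBMS expands the view into the underlying join automatically:
SELECT customer_name
FROM customer_claims
WHERE claim_status = 'OPEN';
```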
*Important Future Work 3: Use of Cypher as an intermediate query language to translate to SQL.* One reason to experiment with Cypher in addition to SPARQL is that Cypher is, arguably, more similar to SQL than SPARQL but has the advantage that (common) join conditions are implicit in the `(a)-[e]->(b)` node-arrow syntax. Yet Cypher does not have the verbosity of the SPARQL projections I mentioned above (so you project properties the same way you project columns in SQL). In my world, all high-level query languages look very similar to SQL, so eventually, when LLMs are smart enough, or even today, I think these language differences should have minor effects. However, graph query languages will likely continue to have major advantages when writing recursive queries, as they have specialized syntax to do so (e.g., Cypher has the Kleene star syntax). For those queries, expressing the query first in Cypher and then mapping it to SQL could lead to an advantage.
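To see the gap concretely, compare a reachability question in Cypher's Kleene-star syntax with its SQL counterpart; the `User`/`Follows` schema below is hypothetical, and I bound the recursion at three hops since some Cypher dialects require an explicit upper bound:

```cypher
// Users reachable from Alice via 1 to 3 Follows edges (hypothetical schema);
// the Kleene star expresses the recursion directly:
MATCH (a:User {name: 'Alice'})-[:Follows*1..3]->(b:User)
RETURN DISTINCT b.name;
```

```sql
-- The same question in SQL requires a recursive common table expression:
WITH RECURSIVE reachable(user_id, depth) AS (
  SELECT f.followee_id, 1
  FROM follows AS f
  JOIN users AS u ON u.user_id = f.follower_id
  WHERE u.name = 'Alice'
  UNION ALL
  SELECT f.followee_id, r.depth + 1
  FROM follows AS f
  JOIN reachable AS r ON f.follower_id = r.user_id
  WHERE r.depth < 3
)
SELECT DISTINCT u.name
FROM users AS u
JOIN reachable AS r ON u.user_id = r.user_id;
```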
## Final Words

Needless to say, in the next few years, the field will be flooded with work on how to use LLMs to solve the text-to-high-level-query problem. Many rules of thumb will emerge about how to prompt them correctly. The questions one can ask in this space are endless. I can speculate about it a lot, but I think it's plausible that many of these rules of thumb, specifically the syntactic differences in prompting, will become obsolete very quickly as newer and more advanced LLMs that are better at speaking high-level database languages emerge. For example, it's plausible that people will stop showing LLMs example (question, query) pairs each time they ask them to generate SQL once LLMs are better at speaking SQL.

However, the harder question of how to model the data so that its meaning is clear and the required queries are simpler is more likely to remain a challenge for a longer time. I would not be too optimistic that very clear answers to this question can emerge. How to model your data is part art and part science. Yet some studiable questions, such as the effects of normalization, the use of views, or generating Cypher for recursive queries, can yield important best practices that are useful to developers building these systems.

In the next post, I will cover what I learned about RAG over unstructured data. Graphs and knowledge graphs are playing a more interesting role in that space. Until then, happy new year to all!

[^1]: SPARQL syntax is different, but a similar advantage exists by omitting type constraints.
\ No newline at end of file
diff --git a/docusaurus/assets/js/beadeff4.63d91510.js b/docusaurus/assets/js/beadeff4.63d91510.js
deleted file mode 100644
index a5055121c..000000000
--- a/docusaurus/assets/js/beadeff4.63d91510.js
+++ /dev/null
@@ -1 +0,0 @@
---
slug: llms-graphs-part-1
authors: [semih]
tags: [use-case]
---

# RAG Using Structured Data: Overview & Important Questions
During the holiday season, I did some reading on LLMs and specifically on the techniques that use LLMs together with graph databases and knowledge graphs. If you are new to the area like me, the amount of activity on this topic on social media as well as in research publications may have intimidated you. If so, you're exactly my target audience for this new blog post series I am starting. My goals are two-fold:

1. *Overview the area*: I want to present what I learned with a simple and consistent terminology, and at a more technical depth than you might find in other blog posts. I am aiming for a depth similar to what I aim for when preparing a lecture. I will link to many high-quality and technically satisfying pieces of content (mainly papers, since the area is very researchy).
2. *Overview important future work*: I want to cover several important future works in the space. I don't necessarily mean work for research contributions but also simple approaches to experiment with if you are building question answering (Q&A) applications using LLMs and graph technology.

## Killer App: Retrieval Augmented Generation

Let's review the killer application of LLMs in enterprises. The application is ultimately Q&A over private enterprise data.
Think of a chatbot to which you can ask natural language questions ($Q_{NL}$), such as: "Who is our top paying customer from Waterloo?" or "What are data privacy regulations in Canada we need to comply with?", and get back natural language answers ($A_{NL}$). LLMs, out of the box, cannot answer these questions because they have a *knowledge gap*. For example, LLMs never had any access to your sales records when they were trained. Therefore, they need to retrieve, or be provided with, extra information from the private data sources of the enterprise.

### A note on the term RAG

There seems to be tremendous interest in building systems that combine a traditional information retrieval component, e.g., one that looks up some documents from an index, with a natural language generator component, such as an LLM. The term for such systems is *Retrieval Augmented Generation* (RAG). The term was coined in [this paper](https://arxiv.org/pdf/2005.11401.pdf) to refer to the method of fine-tuning an LLM with additional information, i.e., using this additional data to train a new variant of the LLM. The original usage form in the paper is "RAG models". Nowadays the term is used in a variety of ways, such as "RAG system", "RAG-based system", "RAG does X", or "Building RAG with Y". RAG often does not refer to fine-tuning LLMs anymore.
Instead, it refers to providing LLMs with private data along with the question to fix the knowledge gap. Even systems that simply use an LLM to convert a $Q_{NL}$ to a SQL or Cypher query and return the results of the query are called "RAG systems" in some documentation. I will use the term in this broader sense.

You can build RAG-based Q&A systems by using structured and/or unstructured data. The high-level views of these systems look like this:

<div class="img-center"><img src="./qa-over-enterprise-data.png" width="600"/></div>

### Summary of this post

This post covers RAG using structured data. Then, in a follow-up post, I will cover RAG using unstructured data, where I will also mention a few ways people are building RAG-based Q&A systems that use both structured and unstructured data.

:::tip TL;DR: The key takeaways from this post are:
- **RAG overview**: RAG is a technique to fill the knowledge gap of LLMs using private data. RAG systems use private structured records stored in a database and/or unstructured data in text files.
- **Impressive simplicity and effectiveness of developing a natural language interface over your database using LLMs**: In the pre-LLM era, the amount of engineering effort needed to deliver a natural language interface over your database was *immense*. The hard problem was to teach a model to *speak* SQL, Cypher, or SPARQL. This contrasts sharply with the simplicity of developing similar pipelines now, because LLMs already "speak" these languages. The hard task now is for *developers to learn how to prompt LLMs* to get correct database queries. Furthermore, there is evidence that LLMs, if prompted correctly, will generate a decent proportion of queries with impressive accuracy.
- **Lack of work that studies LLMs' ability to generate Cypher or SPARQL**: Most technically deep work on understanding LLMs' ability to generate accurate high-level query languages is on SQL. We need more work understanding the behavior of LLMs on the query languages of GDBMSs (like Cypher or SPARQL), specifically on recursive and union-of-join queries.
- **Studying the effects of data modeling (normalization, views, graph modeling) on the accuracy of LLM-generated queries is important**: Many people are studying heuristics for prompting LLMs to increase their effectiveness, focusing on the syntax and structure used for providing the schema and on the selection of examples in the prompt. An important and under-studied problem is the effect of data modeling choices on the accuracy of the queries generated by LLMs. I point to [one interesting paper](https://arxiv.org/pdf/2311.07509.pdf) in this space and raise several questions related to normalization and the use of views in relational modeling, and comparisons with graph modeling approaches.
:::

## RAG Using Structured Data: Text-to-High-level-Query

*Note: If you are familiar with how to develop RAG systems with LangChain and LlamaIndex, you can directly skip to the "[How Good Are LLMs in Generating High-Level Queries?](#how-good-are-llms-in-generating-high-level-queries)" part, which reflects on the reading I did on RAG using structured data.*

### Overview

Many blog posts and several papers concern Q&A systems that simply convert $Q_{NL}$ to a high-level query language, such as SQL, Cypher, or SPARQL, using an LLM. The figure below describes the overall approach:

<div class="img-center"><img src="./rag-using-structured-data.png" width="600"/></div>

$Q_{NL}$, the schema of a database, and optionally some example (natural language question, high-level query) pairs are given to the LLM as a prompt. The terms "no shot", "one shot", or "few shot" refer to the number of examples provided in the prompt. Depending on the underlying database, the schema may contain the columns of relational tables and their descriptions, or the labels of nodes and edges of a graph database. Using $Q_{NL}$, the database schema, and optionally some examples, the LLM generates a database query, such as SQL or Cypher. The system runs this query against the DBMS and returns the query result, or, using the LLM again, converts the query result back to a natural language answer $A_{NL}$.
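For concreteness, here is a minimal sketch of this whole loop over a relational database. It assumes a SQLite file and the `openai` v1 Python client; the function names, prompt wording, and model choice are illustrative, not from any specific system:

```python
import sqlite3
from openai import OpenAI  # assumes the openai v1 Python client

client = OpenAI()

def llm(prompt: str) -> str:
    # Single chat-completion call; temperature 0 to keep generated queries deterministic.
    response = client.chat.completions.create(
        model="gpt-4",
        temperature=0,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content

def answer(db_path: str, schema: str, question: str) -> str:
    # Step 1: Q_NL + schema (+ optionally few-shot examples) -> database query.
    sql = llm(f"Generate a SQL query for the question below.\n"
              f"Schema:\n{schema}\n\nQuestion: {question}\n"
              f"Return only the SQL query.")
    # Step 2: run the generated query against the DBMS.
    rows = sqlite3.connect(db_path).execute(sql).fetchall()
    # Step 3 (optional): a second LLM call converts the raw result into A_NL.
    return llm(f"Question: {question}\nQuery result: {rows}\n"
               f"Answer the question in one sentence.")
```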
"),(0,s.kt)("p",null,(0,s.kt)("strong",{parentName:"p"},"Let us pause here to appreciate one thing:")," For many decades, the database community has studied the problem\nof converting ",(0,s.kt)("span",{parentName:"p",className:"math math-inline"},(0,s.kt)("span",{parentName:"span",className:"katex"},(0,s.kt)("span",{parentName:"span",className:"katex-mathml"},(0,s.kt)("math",{parentName:"span",xmlns:"http://www.w3.org/1998/Math/MathML"},(0,s.kt)("semantics",{parentName:"math"},(0,s.kt)("mrow",{parentName:"semantics"},(0,s.kt)("msub",{parentName:"mrow"},(0,s.kt)("mi",{parentName:"msub"},"Q"),(0,s.kt)("mrow",{parentName:"msub"},(0,s.kt)("mi",{parentName:"mrow"},"N"),(0,s.kt)("mi",{parentName:"mrow"},"L")))),(0,s.kt)("annotation",{parentName:"semantics",encoding:"application/x-tex"},"Q_{NL}")))),(0,s.kt)("span",{parentName:"span",className:"katex-html","aria-hidden":"true"},(0,s.kt)("span",{parentName:"span",className:"base"},(0,s.kt)("span",{parentName:"span",className:"strut",style:{height:"0.8778em",verticalAlign:"-0.1944em"}}),(0,s.kt)("span",{parentName:"span",className:"mord"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal"},"Q"),(0,s.kt)("span",{parentName:"span",className:"msupsub"},(0,s.kt)("span",{parentName:"span",className:"vlist-t vlist-t2"},(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.3283em"}},(0,s.kt)("span",{parentName:"span",style:{top:"-2.55em",marginLeft:"0em",marginRight:"0.05em"}},(0,s.kt)("span",{parentName:"span",className:"pstrut",style:{height:"2.7em"}}),(0,s.kt)("span",{parentName:"span",className:"sizing reset-size6 size3 mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mtight"},(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight",style:{marginRight:"0.10903em"}},"N"),(0,s.kt)("span",{parentName:"span",className:"mord mathnormal mtight"},"L"))))),(0,s.kt)("span",{parentName:"span",className:"vlist-s"},"\u200b")),(0,s.kt)("span",{parentName:"span",className:"vlist-r"},(0,s.kt)("span",{parentName:"span",className:"vlist",style:{height:"0.15em"}},(0,s.kt)("span",{parentName:"span"})))))))))),' to SQL (aka "text-to-SQL"). Here is a good recent ',(0,s.kt)("a",{parentName:"p",href:"https://link.springer.com/article/10.1007/s00778-022-00776-8"},"survey paper"),"\nthat covers only the deep network-based approaches and ",(0,s.kt)("a",{parentName:"p",href:"https://www.nowpublishers.com/article/Details/DBS-078"},"a more extensive survey/book"),"\non the broader topic of natural language interfaces to databases.\nNeither of these surveys cover any work that directly uses LLMs such as GPT models,\nwhich are quite recent developments. 
Take any of the work covered in these surveys and you'll find an approach that requires significant engineering to build the pipeline shown in the above figure. There exist several pre-LLM text-to-SQL systems (e.g., [ATHENA](https://www.vldb.org/pvldb/vol9/p1209-saha.pdf) or [BELA](https://download.hrz.tu-darmstadt.de/pub/FB20/Dekanat/Publikationen/UKP/76500354.pdf)). For example, most of the pre-LLM approaches that use deep learning require hard work *to teach a model how to "speak" SQL* using large corpora of tables and (question, query) examples, such as [WikiSQL](https://arxiv.org/abs/1709.00103) or [Spider](https://github.com/taoyds/spider). People had to solve and glue together solutions to many technical problems, such as parsing the question, entity detection, synonym finding, and string similarity, among others. Post-LLM approaches require *none* of these efforts because LLMs, such as GPT-4, already speak SQL, Cypher, and SPARQL out of the box, having been exposed to them in their pretraining. Nowadays, the hard problem is for developers *to learn how to prompt LLMs* so that LLMs generate correct queries. I'll say more about this problem below. In contrast, building the above pipeline requires much less effort, as I'll show next.

### Simplicity of Developing RAG Systems: LangChain and LlamaIndex

If you have been following the developments in the LLM space, you will not be surprised to hear that nowadays people build Q&A systems that convert $Q_{NL}$ to a high-level query language using two common tools: (i) [LangChain](https://www.langchain.com/); and (ii) [LlamaIndex](https://www.llamaindex.ai/). The same tools also integrate with the underlying storage system to load and retrieve your data. To make this more concrete, let me review the [Kùzu-LangChain integration](https://python.langchain.com/docs/use_cases/graph/graph_kuzu_qa), which is similar to the integrations found in other GDBMSs. You as a programmer have very little to do: you prepare your Kùzu database `db`, load your data into it, wrap it in `KuzuGraph` and `KuzuQAChain` objects in Python, and you have a text-to-Cypher pipeline:

```python
import kuzu
from langchain.chains import KuzuQAChain
from langchain_community.chat_models import ChatOpenAI
from langchain_community.graphs import KuzuGraph

db = kuzu.Database("test_db")
# ... create your graph here if needed ...
graph = KuzuGraph(db)
chain = KuzuQAChain.from_llm(ChatOpenAI(temperature=0), graph=graph, verbose=True)
chain.run("Who played in The Godfather: Part II?")
```

I am following the example application in this [documentation](https://python.langchain.com/docs/use_cases/graph/graph_kuzu_qa), which uses a database of movies, actors, and directors.

```bash
Output:
> Entering new chain...
Generated Cypher:
MATCH (p:Person)-[:ActedIn]->(m:Movie {name: 'The Godfather: Part II'}) RETURN p.name
Full Context:
[{'p.name': 'Al Pacino'}, {'p.name': 'Robert De Niro'}]

> Finished chain.

'Al Pacino and Robert De Niro both played in The Godfather: Part II.'
```

The "chain" first generated a Cypher query using $Q_{NL}$. Behind the curtain, i.e., inside the `KuzuQAChain` code, a GPT model was given the following prompt:

```bash
Generate Cypher statement to query a graph database.
Instructions:
Use only the provided relationship types and properties in the schema.
Do not use any other relationship types or properties that are not provided.

Schema:
Node properties: [{'properties': [('name', 'STRING')], 'label': 'Movie'}, {'properties': [('name', 'STRING'), ('birthDate', 'STRING')], 'label': 'Person'}]
Relationships properties: [{'properties': [], 'label': 'ActedIn'}]
Relationships: ['(:Person)-[:ActedIn]->(:Movie)']

Note: Do not include any explanations or apologies in your responses.
Do not respond to any questions that might ask anything else than for you to construct a Cypher statement.
Do not include any text except the generated Cypher statement.

The question is:
Who played in The Godfather: Part II?
```

Indeed, if you copy this prompt and paste it into [ChatGPT's browser interface](https://chat.openai.com/), you will get the same or a very similar Cypher query. The important point is: that's all the coding you have to do to build a natural language interface that can query your database. You ultimately construct a string prompt that contains $Q_{NL}$, some instructions, and the schema of the database, and the LLM will generate a query for you. The `KuzuGraph` and `KuzuQAChain` objects are simple wrappers that do just that. If you want to play around with how well this works on other datasets, we have this pipeline implemented in Kùzu's browser frontend, [KùzuExplorer](https://kuzudb.com/docusaurus/kuzuexplorer/).
"),(0,s.kt)("p",null,'That is, for any database you have in K\xf9zu, you get a natural language interface over it in\nK\xf9zuExplorer (just click the "robot icon" on the left panel).\nYou can develop similar pipelines with other GDBMSs using similar interfaces (',(0,s.kt)("em",{parentName:"p"},"though I recommend using K\xf9zu as it will be the\nsimplest to get started")," \ud83d\ude09: ",(0,s.kt)("em",{parentName:"p"},"Unlike other GDBMSs, K\xf9zu is embeddable and requires no server set up"),").\nIf you instead want to build Q&A systems over your RDBMSs, you can use\nLangChain's ",(0,s.kt)("a",{parentName:"p",href:"https://python.langchain.com/docs/use_cases/qa_structured/sql#case-2-text-to-sql-query-and-execution"},"SQLDatabaseChain")," and\n",(0,s.kt)("a",{parentName:"p",href:"https://python.langchain.com/docs/use_cases/qa_structured/sql#case-3-sql-agents"},"SQLAgent")," or\nLlamaIndex's ",(0,s.kt)("a",{parentName:"p",href:"https://docs.llamaindex.ai/en/stable/examples/index_structs/struct_indices/SQLIndexDemo.html#part-1-text-to-sql-query-engine"},"NLSQLTableQueryEngine"),'. The level of simplicity is similar to the example I presented. In practice, it is unlikely that your chatbot or search engine will be as simple\nas the above example where the application interacts with the LLM only once. If you want\nto interact with the LLM multiple times and conditionally take one action over another action etc.,\nLangChain and LlamaIndex also provide ways to do that through their "Agents" (see ',(0,s.kt)("a",{parentName:"p",href:"https://python.langchain.com/docs/modules/agents/"},"LangChain Agents")," and ",(0,s.kt)("a",{parentName:"p",href:"https://docs.llamaindex.ai/en/stable/use_cases/agents.html"},"Llama Index Agents"),")."),(0,s.kt)("h3",{id:"how-good-are-llms-in-generating-high-level-queries"},"How Good Are LLMs in Generating High-Level Queries?"),(0,s.kt)("p",null,"Although building a text-to-high-level-query-language pipeline is now very simple with LLMs,\nsimplicity ",(0,s.kt)("strong",{parentName:"p"},"does not")," mean quality. Indeed, people building these systems are now faced with the following two important questions: "),(0,s.kt)("ol",null,(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},"How accurate are the high-level queries that LLMs generate?")),(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},"How, e.g., through what types of prompts or data modeling, can we increase the accuracy of the\nqueries generated by LLMs?"))),(0,s.kt)("p",null,"Here are several papers on this that I suggest reading:"),(0,s.kt)("ol",null,(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},(0,s.kt)("a",{parentName:"em",href:"https://arxiv.org/pdf/2303.13547.pdf"},"A comprehensive evaluation of ChatGPT\u2019s zero-shot Text-to-SQL capability"))," from Tsinghua University and University of Illinois at Chicago. 
"),(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},(0,s.kt)("a",{parentName:"em",href:"https://arxiv.org/pdf/2204.00498.pdf"},"Evaluating the Text-to-SQL Capabilities of Large Language Models"))," from researchers from Cambridge and universities and institutes from Montr\xe9al."),(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},(0,s.kt)("a",{parentName:"em",href:"https://arxiv.org/pdf/2308.15363.pdf"},"Text-to-SQL Empowered by Large Language Models: A Benchmark Evaluation"))," from Alibaba Group."),(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},(0,s.kt)("a",{parentName:"em",href:"https://arxiv.org/pdf/2305.12586.pdf"},"Enhancing Few-shot Text-to-SQL Capabilities of Large Language Models: A Study on Prompt Design Strategies"))," from Yale, Columbia, and Allen Institute for AI."),(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},(0,s.kt)("a",{parentName:"em",href:"https://arxiv.org/pdf/2305.11853.pdf"},"How to Prompt LLMs for Text-to-SQL: A Study in Zero-shot, Single-domain, and Cross-domain Settings"))," from Ohio State"),(0,s.kt)("li",{parentName:"ol"},(0,s.kt)("em",{parentName:"li"},(0,s.kt)("a",{parentName:"em",href:"https://arxiv.org/pdf/2311.07509.pdf"},"A Benchmark to Understand the Role of Knowledge Graphs on LLM's Accuracy for Q&A on Enterprise SQL Databases"))," from data.world.")),(0,s.kt)("p",null,"These papers are either entirely or ",(0,s.kt)("em",{parentName:"p"},"almost")," entirely evaluation-only papers that experiment with very detailed approaches of prompting LLMs\nto generate SQL queries. First, let me say that the general message these\npapers give (maybe except the last one) is that LLMs are pretty good. With right prompting (or even with basic prompting)\nthey do very well on these benchmarks. I see accuracy rates over 85% on the Spider benchmark in several papers. These are clearly\nbetter numbers than what pre-LLM state-of-the-art systems achieved. This should be impressive to many."),(0,s.kt)("p",null,'Second, the set of techniques are too detailed to cover here but some example heuristics\nthese papers experiment with include the following: (i) the syntax used for providing the schema\n(apparently putting "the pound sign ',(0,s.kt)("inlineCode",{parentName:"p"},"#")," to differentiate prompt from response in examples yields impressive performance gains\" \ud83d\ude00 go figure); (ii)\nthe number and selection of example (question, SQL) pairs, e.g., apparently there is a sweet spot in the number\nof examples to provide; or (iii) the effects of standardizing the text in the prompt, e.g., indenting and using all lower case letters consistently\n(apparently has minor but some effect). 
Yes, as interesting and important as it is to learn how to use LLMs better, I still can't escape the following thought before going to bed: somewhere out there, some advisor might be torturing some graduate student to check if the magical box produces better SQL with a pound sign vs. double slashes!

Most work I found is on generating SQL. In contrast, I found no papers that do a similar prompting study for the query languages of GDBMSs, though I ran into two papers that provide benchmarks for such query languages: (i) [SPARQL](https://arxiv.org/abs/2309.16248); and (ii) [Cypher](https://dl.acm.org/doi/pdf/10.1145/3511808.3557703). So a low-hanging fruit for future work is the following:

*Important Future Work 1: Similar prompting studies for query languages of graph DBMSs with a focus on recursive and union-of-join queries.* In contrast to SQL queries, here one should study various recursive queries, which the query languages of GDBMSs are particularly good at, and union-of-join queries, which are asked by omitting labels in the query languages of GDBMSs. For example, if you want to ask for all connections between your `User` nodes, and User can have many relationships, such as `Follows`, `SentMoneyTo`, or `SameFamily`, you would have to write 3 possible join queries in SQL and union them. Instead, you can write this query with a very simple syntax in Cypher as `MATCH (a:User)-[e]->(b:User)`, where the omission of the label on the relationship `e` indicates searching over all possible joins.[^1] The sketch below makes the contrast concrete.
As a side note: in the context of any query language, including SQL, questions that require sub-queries are of particular interest, as they are generally harder to write. Some of the papers I read had sections analyzing the performance of LLMs on nested queries, but the focus was not on these. In prior literature there are papers written solely on text-to-SQL generation for nested queries (e.g., see [the ATHENA++ paper](https://www.vldb.org/pvldb/vol13/p2747-sen.pdf)). I am certain someone somewhere is already focusing solely on nested queries, and that's a good idea.

## data.world Paper and Some Interesting Questions

In the remainder of the post I want to review [the benchmark paper](https://arxiv.org/pdf/2311.07509.pdf) from `data.world` that focuses on text-to-SQL using LLMs. Unlike other papers out there that study the effects of different prompting heuristics, this paper studies the *effects of data modeling on the accuracy of SQL queries generated by LLMs*, which is closely related to GDBMSs.

Specifically, this paper is an evaluation of the performance of GPT-4 in generating SQL using no examples, i.e., zero-shot, with basic prompting over a standardized insurance database schema called the [OMG Property and Casualty Data Model](https://www.omg.org/spec/PC/1.0/About-PC). See Figure 1 in the paper (omitted here) for the conceptual schema, which consists of classes such as Policy, Account, Claim, and Insurable Object, among others, and their relationships. The paper has a benchmark of 43 natural language questions and compares 2 approaches to generate the SQL query. The figure below shows an overview of these approaches for reference:

*Figure: overview of the two SQL generation approaches compared in the paper (image omitted).*

1. **Direct SQL generation:** In this approach, $Q_{NL}$ and the relational schema of the OMG database are given to GPT-4. The schema is given in terms of `CREATE TABLE` statements, such as:
   ```sql
   CREATE TABLE Claim(
     Claim_Identifier       int NOT NULL,
     Catastrophe_Identifier int NULL,
     ...
     Claim_Open_Date        datetime NULL,
     ...
     PRIMARY KEY (Claim_Identifier ASC),
     FOREIGN KEY (Catastrophe_Identifier) REFERENCES Catastrophe(Catastrophe_Identifier),
   ...)
   ```
   The full schema statements can be found [here](https://github.com/datadotworld/cwd-benchmark-data/blob/main/ACME_Insurance/DDL/ACME_small.ddl). GPT-4 is asked to generate a SQL query $Q_{SQL}$ to answer $Q_{NL}$. Copy-pasted from the paper, the prompt looks as follows (the DDL statements and the question are spliced in where indicated):
   ```
   Given the database described by the following DDL:
   <DDL>
   Write a SQL query that answers the following question. Do not explain the query.
   Return just the query, so it can be run verbatim from your response.
   Here's the question:
   <QUESTION>
   ```
2. **Indirect SQL generation via graph modeling/SPARQL:** In this approach, instead of the relational schema of the database, the same database is modeled as an *[OWL ontology](https://www.w3.org/OWL/)* (OWL is short for Web Ontology Language). "Ontology" is another term for a schema when modeling data as a graph of classes and the relationships between them. OWL is a W3C standard and part of the RDF technology stack, so OWL ontologies are expressed as a set of RDF triples, such as:
   ```
   ...
   in:Claim rdf:type owl:Class ;
       rdfs:isDefinedBy <...> ;
       rdfs:label "Claim" .
   in:claimOpenDate rdf:type owl:DatatypeProperty ;
       rdfs:domain in:Claim ;
       rdfs:range xsd:dateTime ;
       rdfs:isDefinedBy <...> ;
       rdfs:label "Claim Open Date" .
   in:hasCatastrophe rdf:type owl:ObjectProperty ;
       rdfs:domain in:Claim ;
       rdfs:range in:Catastrophe ;
       rdfs:isDefinedBy <...> ;
       rdfs:label "has catastrophe" .
   ...
   ```
   The full ontology can be found [here](https://github.com/datadotworld/cwd-benchmark-data/blob/main/ACME_Insurance/ontology/insurance.ttl). GPT-4 is then asked to generate a SPARQL query $Q_{SPARQL}$, instead of SQL, for the same $Q_{NL}$. The full prompt, again copy-pasted from the paper with some simplifications, looks like this:
   ```
   Given the OWL model described in the following TTL file:
   <TTL>
   Write a SPARQL query that answers the question. Do not explain the query.
   Return just the query, so it can be run verbatim from your response.
   Here's the question:
   <QUESTION>
   ```
   As a last step, the authors have a direct mapping from $Q_{SPARQL}$ to a SQL query $Q_{SQL}$. This is a quite straightforward step, as the ontology and the relational schema have direct translations between classes/properties and tables/columns.

This is an interesting comparison. There is some intuition for why one would be interested in the effectiveness of query generation through an ontology, because one of the well-known pre-LLM text-to-SQL papers, [ATHENA](https://www.vldb.org/pvldb/vol9/p1209-saha.pdf), did something similar. Instead of SPARQL, it used another query language over an ontology, called Ontology Query Language, which was then mapped to SQL.

The results are even more interesting. The authors categorize their 43 questions into 4 quadrants based on 2 dimensions:

- Low vs. high **question** complexity: questions that require only simple projections are low complexity; those that require aggregations or math functions are high complexity.
- Low vs. high **schema** complexity: questions whose SQL queries require up to 4 tables are low schema complexity; those that require 5 or more joins are high schema complexity.

The accuracy results are shown below; each cell reports direct SQL generation vs. indirect generation via SPARQL. Accuracy here is "execution accuracy", meaning that only the answers of the queries are checked against the ground truth answer. That is, even if the SQL query GPT-4 generated was actually not correct but by luck it computed the correct answers, the paper counts it as correct (apparently this happens very rarely in this study).

| Overall: 16.7% vs 54.2% | Low Schema Complexity | High Schema Complexity |
|---|---|---|
| **Low Question Complexity** | 37.4% vs 66.9% | 0% vs 38.7% |
| **High Question Complexity** | 25.5% vs 71.1% | 0% vs 35.7% |

Overall, the indirect SQL generation method through SPARQL is much more effective in this zero-shot setting. Not surprisingly, questions that require 5 or more joins are harder regardless of the method used, and direct SQL generation cannot get any of those questions right. These are interesting results for an initial study on the effects of data modeling on LLMs' accuracy in generating database queries. These results should give many researchers and practitioners ideas about how to replicate and validate/invalidate similar results under different settings, e.g., with few-shot examples and under different databases.

**That said, one should ask: why?** In fact, we should all be suspicious that merely modeling the same set of records with a different abstraction should have any visible effects. After all, by modeling the same records differently, one does not obtain or lose information. So if and when LLMs are smart enough, they shouldn't care how the data was modeled. But for now, if a pound sign can make a difference, we should not be surprised that modeling choices can have large impacts. As such, it is healthy to be suspicious and ask why. This motivates a few important questions I think are worth studying. My premise is that if the differences are this large, the task must somehow have gotten simpler for GPT-4 when it was asked to generate a SPARQL query. I can hypothesize about a few possible reasons for this (a Cypher rendering of the example follows these hypotheses):

1. *Some queries require fewer tokens to write in SPARQL*: One difference the query languages of GDBMSs often have is that certain equality conditions are implicit in the syntax, which means their `WHERE` clauses are simpler for some queries. For example, if you wanted to return the name of the Catastrophe of the Claim with ID Claim1, in SPARQL you can write:
   ```
   SELECT ?name
   WHERE { <Claim1> in:hasCatastrophe ?catastrophe .
           ?catastrophe in:catastropheName ?name }
   ```
   In SQL you would write:
   ```
   SELECT Catastrophe_Name
   FROM Claim, Catastrophe
   WHERE Claim.Catastrophe_Identifier = Catastrophe.Catastrophe_Identifier AND
         Claim.Claim_Identifier = Claim1
   ```
   Note that the `Claim.Claim_Identifier = Claim1` equality condition is implicit in the `<Claim1> in:hasCatastrophe ?catastrophe` triple, and the `Claim.Catastrophe_Identifier = Catastrophe.Catastrophe_Identifier` condition is implicit in the fact that `?catastrophe` appears in both the first and the second triple of the SPARQL query. Such implicit equality conditions are common in graph query languages, especially when expressing joins. For example, in Cypher you can omit all join conditions in `WHERE` clauses as long as those joins have been pre-defined to the system as relationships; instead, you join records through the `(a)-[e]->(b)` syntax. It's unclear how much this could matter, but it is an immediate advantage of SPARQL that can explain why complex join queries are easier to generate in SPARQL than in SQL.

   **Side note**: On the flip side, SPARQL can be more verbose in projections. For example, if you wanted to return the number, open date, and close date of every claim, you'd write the following SQL query:
   ```
   SELECT Claim_Number, Claim_Open_Date, Claim_Close_Date
   FROM Claim
   ```
   In SPARQL, you'd have to write the name of each property you want to project and give it an additional variable, as follows:
   ```
   SELECT ?number ?open_date ?close_date
   WHERE { ?claim in:claimNumber ?number .
           ?claim in:claimOpenDate ?open_date .
           ?claim in:claimCloseDate ?close_date }
   ```
2. *Graph modeling gives explicit names to foreign keys:* There is a reason that database courses teach database modeling to students using graph-based models, such as Entity-Relationship or UML models. Humans think of the world as objects/entities and their relationships. In some sense, these are higher-level models where relationships between objects are denoted explicitly, with explicit names (instead of as less explicit foreign key constraints). For example, the implicit connection between Claims and Catastrophes through the `FOREIGN KEY (Catastrophe_Identifier) REFERENCES Catastrophe(Catastrophe_Identifier)` constraint was given an explicit English name, `hasCatastrophe`, in the ontology. This explicitness may make it easier for LLMs to understand the schema and generate SPARQL queries.
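For comparison, here is how the same Claim1 question might look in Cypher over a hypothetical property-graph version of this schema (a `Claim` node table, a `Catastrophe` node table, and a `HasCatastrophe` relationship); both equality conditions from the SQL version are implicit in the arrow pattern:

```cypher
// The join condition lives in the relationship pattern, not in a WHERE clause
MATCH (c:Claim {claim_identifier: 1})-[:HasCatastrophe]->(cat:Catastrophe)
RETURN cat.catastrophe_name;
```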
Both of these are qualitative hypotheses. However, there is a more immediate reason the authors of this paper may have obtained such major differences between the two approaches they tried. Intentionally or unintentionally, their ontology is simplified significantly compared to their relational schema. For example, the Claim relation has `Claim_Reopen_Date` and `Claim_Status_Code` properties, which are removed from the ontology. Many such properties seem to have been removed from the relations, and the ontology overall looks simpler. There are also several differences between the ontology and the relational schema that are confusing. For example, the [ontology](https://github.com/datadotworld/cwd-benchmark-data/blob/main/ACME_Insurance/ontology/insurance.ttl) has a class `Agent`, and `Policy` objects are `in:soldByAgent` by some Agent objects (see lines 20 and 92). I cannot see corresponding relations or columns in the [relational schema](https://github.com/datadotworld/cwd-benchmark-data/blob/main/ACME_Insurance/DDL/ACME_small.ddl). Unless I am missing something about how the prompts were given, these differences are also likely to have important effects on the results, and someone should fix them and obtain new results in a fairer comparison.

Let me next raise several high-level questions that I think are important:

*Important Future Work 2: Rules of thumb in data modeling to make LLM-generated queries more accurate.* I think the higher-level question of studying the effects of data modeling in more depth is a very good direction. As LLMs get smarter, I would expect the presence/absence of a pound sign or the style of English to matter less. Those look like syntactic differences that can be automatically detected over time. Modeling choices are more fundamental and relate to the clarity and understandability of the records that will be queried by the LLM. So identifying some rules of thumb here looks like the promising path forward. Let me list a few immediate questions one can study:

*Important Future Work 2.1: Effects of normalization/denormalization.* If the shortcoming of GPT-4 is generating queries with many joins, one way to address it is to denormalize the relations into fewer tables and study the effects. Again, I'm thinking of the same records, just modeled differently with fewer tables. What happens if we reduce all data into a single table with dozens of columns and many value repetitions? Now all possible joins would already have been performed, so we'd force the LLM to write a join-less query with filters, distincts, and aggregations (sketched below). What happens if we normalize the tables step by step until we get to a well-known form, such as [Boyce-Codd Normal Form](https://en.wikipedia.org/wiki/Boyce%E2%80%93Codd_normal_form)? Do we consistently get better or worse accuracy?
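As a toy illustration of the fully denormalized extreme, with hypothetical column names: the earlier Claim-Catastrophe join disappears, at the cost of repeated values and a `DISTINCT`:

```sql
-- One wide table in which every claim row repeats its catastrophe's attributes
CREATE TABLE Claim_Flat(
  Claim_Identifier       int,
  Claim_Open_Date        datetime,
  Catastrophe_Identifier int,
  Catastrophe_Name       text
  -- ... dozens of further pre-joined columns with many value repetitions
);

-- The earlier two-table join becomes a join-less filter
SELECT DISTINCT Catastrophe_Name
FROM Claim_Flat
WHERE Claim_Identifier = 1;
```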
*Important Future Work 2.2: Use of views.* In relational modeling, views are an effective way to obtain a higher-level and simpler model of your records. Similar to the $Q_{NL}$ -[LLM]-> $Q_{SPARQL}$ -[Direct Mapping]-> $Q_{SQL}$ pipeline, one can test the effectiveness of a $Q_{NL}$ -[LLM]-> $Q_{SQL-over-Views}$ -[Direct Mapping]-> $Q_{SQL}$ pipeline. A minimal sketch of the idea follows.
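Here is one way such a view could look, with hypothetical names; the LLM is shown only the view, and the DBMS expands queries over it into the underlying join:

```sql
-- A view that pre-joins Claim and Catastrophe under an explicit, readable name
CREATE VIEW Claim_With_Catastrophe AS
SELECT c.Claim_Identifier, c.Claim_Open_Date, cat.Catastrophe_Name
FROM Claim c
JOIN Catastrophe cat ON c.Catastrophe_Identifier = cat.Catastrophe_Identifier;

-- The LLM-generated "SQL-over-views" query is then join-less:
SELECT Catastrophe_Name FROM Claim_With_Catastrophe WHERE Claim_Identifier = 1;
```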
*Important Future Work 3: Use of Cypher as an intermediate query language to translate to SQL.* One reason to experiment with Cypher in addition to SPARQL is that Cypher is, arguably, more similar to SQL than SPARQL, but it has the advantage that (common) join conditions are implicit in the `(a)-[e]->(b)` node-arrow syntax. Yet Cypher does not have the verbosity of the SPARQL projections I mentioned above (you project properties the same way you project columns in SQL). In my world, all high-level query languages look very similar to SQL, so eventually, when LLMs are smart enough, or even today, I think these language differences should have minor effects. However, graph query languages will likely continue to have major advantages when writing recursive queries, as they have specialized syntax to do so (e.g., Cypher has the Kleene star syntax, sketched below). For those queries, expressing the query first in Cypher and then mapping it to SQL could lead to an advantage.
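To see the gap on recursive queries, compare a Cypher Kleene-star pattern with a SQL recursive common table expression; the `User`/`Follows` schema is hypothetical:

```cypher
// All users reachable from Alice via 1 to 3 Follows hops
MATCH (a:User)-[:Follows*1..3]->(b:User)
WHERE a.name = 'Alice'
RETURN DISTINCT b.name;
```

A SQL formulation of the same question needs a recursive CTE:

```sql
WITH RECURSIVE reachable(id, depth) AS (
  SELECT f.to_id, 1
  FROM User u JOIN Follows f ON u.id = f.from_id
  WHERE u.name = 'Alice'
  UNION
  SELECT f.to_id, r.depth + 1
  FROM reachable r JOIN Follows f ON r.id = f.from_id
  WHERE r.depth < 3
)
SELECT DISTINCT u.name FROM reachable r JOIN User u ON r.id = u.id;
```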
## Final Words

Needless to say, in the next few years the field will be flooded with work on how to use LLMs to solve the text-to-high-level-query problem, and many rules of thumb will emerge about how to prompt them correctly. The questions one can ask in this space are endless. I can speculate about it a lot, but I think it's plausible that many of these rules of thumb, specifically the syntactic differences in prompting, will become obsolete very quickly as newer and more advanced LLMs that are better at speaking high-level database languages emerge. For example, it's plausible that people will stop showing LLMs example (question, query) pairs each time they ask them to generate SQL once LLMs are better at speaking SQL.

However, the harder question of how to model the data so that its meaning is clear, and the queries that need to be written are simpler, is more likely to remain a challenge for a longer time. I would not be too optimistic that very clear answers to this question can emerge: how to model your data is part-art and part-science. Yet some studiable questions, such as the effects of normalization, the use of views, or generating Cypher for recursive queries, can yield important best practices that will be useful to developers building these systems.

In the next post, I will cover what I learned about RAG over unstructured data. Graphs and knowledge graphs are playing a more interesting role in that space. Until then, happy new year to all!

[^1]: SPARQL syntax is different, but a similar advantage exists by omitting type constraints.
Think of a chatbot to which you \\ncan ask natural language questions ($Q_{NL}$), such as: \\"Who is our top paying customer from Waterloo?\\",\\nor \\"What are data privacy regulations in Canada we need to comply with?\\"\\nand get back natural language answers ($A_{NL}$).\\nLLMs, out of the box, cannot answer these questions because they have a *knowledge gap*.\\nFor example, LLMs never had any access to your sales records when they were trained. \\nTherefore, they need to retrieve or be provided with \\nextra information from private data sources of the enterprise.\\n\\n### A note on the term RAG\\nThere seems to be tremendous interest in building systems that combine a traditional \\ninformation retrieval component, e.g., one that looks up some documents from\\nan index, with a natural language generator component, such as an LLM. The term for such systems is \\n*Retrieval Augmented Generation* (RAG).\\nThe term is coined in [this paper](https://arxiv.org/pdf/2005.11401.pdf) to refer\\nto the method of fine-tuning an LLM with additional information, i.e.,\\nusing this additional data to train a new variant of the LLM. \\nThe original usage form in the paper is \\"RAG models\\". Nowadays it is used in a variety of ways, \\nsuch as, \\"RAG system\\", \\"RAG-based system\\", \\"RAG does X\\", or \\n\\"Building RAG with Y\\". RAG often does not refer to fine-tuning LLMs any more. Instead, it \\nrefers to providing LLMs with private data along with the question to fix the knowledge gap.\\nEven systems that simply use an LLM to convert a \\n$Q_{NL}$ to SQL or Cypher query and simply return the results of the query\\nare called \\"RAG systems\\" in some documentations. I will use the term in this broader sense.\\n\\nYou can build RAG-based Q&A systems by using structured and/or unstructured\\ndata. The high-level views of these systems look like this:\\n\\n
\n\n<img src={QAOverEnterpriseData} />\n\n
\n\n\x3c!---\nIn this blog post series I will cover the following approaches:\n1. RAG using structured data: Uses structured records in the enterprise, e.g.,\nrecords stored in relational or graph DBMSs. \n1. RAG using unstructured data: Uses text files, pdfs, or other unstructured documents, such as html pages.\n2. RAG using a mix of structured and unstructured data.\n---\x3e\n\n### Summary of this post\n\nThis post covers RAG using structured data. Then, in a follow-up post, I will cover RAG using unstructured data, where\nI will also mention a few ways people are building RAG-based Q&A\xa0systems that use both structured and unstructured data.\n\n:::tip TL;DR: The key takeaways from this post are:\n- **RAG overview**: RAG is a technique to fill the knowledge gap of LLMs using private data. RAG systems\n use private structured records stored in a database and/or unstructured data in text files. \n- **Impressive simplicity and effectiveness of developing a natural language interface over your database using LLMs**: In the pre-LLM era, the amount of engineering effort\n to develop a pipeline that delivered a natural language interface over your database was *immense*. The\n hard problem was to teach a model to *speak* SQL, Cypher, or SPARQL.\n This contrasts sharply with the simplicity of developing similar pipelines now because LLMs already \"speak\" these languages. \n The hard task now is for *developers to learn how to prompt LLMs* to get correct database queries. Furthermore, there is\n evidence that LLMs, if prompted correctly, will generate a decent proportion of queries with impressive accuracy. \n- **Lack of work that studies LLMs\' ability to generate Cypher or SPARQL:** Most technically-deep work on understanding\n LLMs\' ability to generate accurate high-level query languages is on SQL. We need more\n work understanding the behavior of LLMs on the query languages of GDBMSs (like Cypher or SPARQL), specifically on recursive and union-of-join queries.\n- **Studying the effects of data modeling (normalization, views, graph modeling) on the accuracy of LLM-generated queries is important:**\n Many people are studying heuristics for prompting LLMs to increase the accuracy of the generated queries, focusing on the syntax and the structure of providing\n the schema and selection of examples in the prompt. An important and under-studied\n problem is the effects of data modeling choices on the accuracy of the queries generated by LLMs. I point to [one interesting paper](https://arxiv.org/pdf/2311.07509.pdf) in this space and raise several questions related to\n normalization and the use of views in relational modeling and comparisons with graph modeling approaches. \n:::\n\n## RAG Using Structured Data: Text-to-High-level-Query\n*Note: If you are familiar with how to develop RAG systems with LangChain and LlamaIndex, you can directly skip\nto the \"[How Good are LLMs in Generating High-level Queries](#how-good-are-llms-in-generating-high-level-queries)\" part that \nreflects on the reading I did on RAG using structured data.*\n\n### Overview\nMany blog posts and several papers concern Q&A systems that simply convert\n$Q_{NL}$ to a high-level query language, such as SQL, Cypher, or SPARQL, using an LLM.\nThe figure below describes the overall approach:\n\n
\n\n<img src={RAGUsingStructuredData} />\n\n
\n\n$Q_{NL}$, the schema of a database, and optionally\nsome example natural language questions and their corresponding high-level queries, are given\nto the LLM as a prompt. \nThe terms \"zero shot\", \"one shot\", or \"few shot\" refer to the number of examples provided\nin the prompt. Depending on the underlying database, the schema may contain\ncolumns of relational tables and their descriptions, or labels of nodes and edges\nof a graph database. Using $Q_{NL}$, the database schema, and optionally\nsome examples, the LLM generates \na database query, such as SQL or Cypher. The system runs this query against the\nDBMS and returns the query result or, using the LLM again, converts \nthe query result back to a natural language answer $A_{NL}$. \n\n**Let us pause here to appreciate one thing:** For many decades, the database community has studied the problem\nof converting $Q_{NL}$ to SQL (aka \"text-to-SQL\"). Here is a good recent [survey paper](https://link.springer.com/article/10.1007/s00778-022-00776-8)\nthat covers only the deep network-based approaches and [a more extensive survey/book](https://www.nowpublishers.com/article/Details/DBS-078)\non the broader topic of natural language interfaces to databases.\nNeither of these surveys covers any work that directly uses LLMs such as GPT models, \nwhich are quite recent developments. Take any of the work covered in these surveys and \nyou\'ll find an approach that requires significant engineering to build the pipeline shown in the above figure. \nThere exist several pre-LLM text-to-SQL systems (e.g., [ATHENA](https://www.vldb.org/pvldb/vol9/p1209-saha.pdf)\nor [BELA](https://download.hrz.tu-darmstadt.de/pub/FB20/Dekanat/Publikationen/UKP/76500354.pdf)). \nFor example, most of the pre-LLM approaches that use deep learning require\nhard work *to teach a model how to \"speak\" SQL* using large \ncorpora of tables and (question, query) examples, such as [WikiSQL](https://arxiv.org/abs/1709.00103) or [Spider](https://github.com/taoyds/spider).\nPeople had to solve and glue together solutions to many technical problems, such as parsing the question,\nentity detection, synonym finding, string similarity, among others. \nPost-LLM approaches require *none* of these efforts because LLMs, such as GPT-4, already speak SQL, Cypher, and SPARQL out of the box, having been exposed to them in their pretraining. \nNowadays, the hard problem is for developers *to learn how to prompt LLMs* so that \nLLMs generate correct queries. I\'ll say more about this problem later. In contrast, building the above pipeline requires much less effort, as\nI\'ll show next.\n\n### Simplicity of Developing RAG Systems: LangChain and LlamaIndex\nIf you have been following the developments in the LLM space, you will not be surprised to hear that nowadays people build \nQ&A systems that convert $Q_{NL}$ to a high-level query language using two common tools:\n(i) [LangChain](https://www.langchain.com/); and (ii) [LlamaIndex](https://www.llamaindex.ai/).\nThe same tools also integrate with the underlying storage system to load and retrieve your data. To make this more concrete, let me review the [K\xf9zu-LangChain integration](https://python.langchain.com/docs/use_cases/graph/graph_kuzu_qa), similar to the integrations found in other GDBMSs. 
You as a programmer have very little to do: you prepare your K\xf9zu\ndatabase `db` and load your data into it, wrap it in `KuzuGraph` and `KuzuQAChain` objects in Python, and you have\na text-to-Cypher pipeline:\n\n```python\nimport kuzu\nfrom langchain.chains import KuzuQAChain\nfrom langchain_community.chat_models import ChatOpenAI\nfrom langchain_community.graphs import KuzuGraph\n\ndb = kuzu.Database(\"test_db\")\n# ... create your graph tables and load your data if needed\ngraph = KuzuGraph(db)\nchain = KuzuQAChain.from_llm(ChatOpenAI(temperature=0), graph=graph, verbose=True)\nchain.run(\"Who played in The Godfather: Part II?\")\n```\nI am following the example application in this [documentation](https://python.langchain.com/docs/use_cases/graph/graph_kuzu_qa), \nwhich uses a database of movies, actors, and directors. \n\n```bash\nOutput:\n> Entering new chain...\nGenerated Cypher:\nMATCH (p:Person)-[:ActedIn]->(m:Movie {name: \'The Godfather: Part II\'}) RETURN p.name\nFull Context:\n[{\'p.name\': \'Al Pacino\'}, {\'p.name\': \'Robert De Niro\'}]\n\n> Finished chain.\n\n\'Al Pacino and Robert De Niro both played in The Godfather: Part II.\'\n```\nThe \"chain\" first generated a Cypher query using $Q_{NL}$. \nBehind the curtain, i.e., inside the KuzuQAChain code, \na GPT model was given the following prompt:\n\n```bash\nGenerate Cypher statement to query a graph database.\nInstructions:\nUse only the provided relationship types and properties in the schema.\nDo not use any other relationship types or properties that are not provided.\n\nSchema:\nNode properties: [{\'properties\': [(\'name\', \'STRING\')], \'label\': \'Movie\'}, {\'properties\': [(\'name\', \'STRING\'), (\'birthDate\', \'STRING\')], \'label\': \'Person\'}]\nRelationships properties: [{\'properties\': [], \'label\': \'ActedIn\'}]\nRelationships: [\'(:Person)-[:ActedIn]->(:Movie)\']\n\nNote: Do not include any explanations or apologies in your responses.\nDo not respond to any questions that might ask anything else than for you to construct a Cypher statement.\nDo not include any text except the generated Cypher statement.\n\nThe question is:\nWho played in The Godfather: Part II?\n```\n\nIndeed, if you copy this prompt and paste it into [chatGPT\'s browser interface](https://chat.openai.com/), \nyou will get the same or a very similar Cypher query. The important point is: that\'s all\nthe coding you have to do to build a natural language interface that can query your database. \nYou ultimately construct a string prompt that contains $Q_{NL}$, some\ninstructions, and the schema of the database, and the LLM will generate a query for you. \nThe `KuzuGraph` and `KuzuQAChain` are simple wrappers to do just that.\nIf you want to play around with how well this works on other datasets,\nwe have this pipeline implemented in K\xf9zu\'s browser frontend [K\xf9zuExplorer](https://kuzudb.com/docusaurus/kuzuexplorer/). \n\nThat is, for any database you have in K\xf9zu, you get a natural language interface over it in\nK\xf9zuExplorer (just click the \"robot icon\" on the left panel). 
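\n\nTo see how little magic is involved, here is a minimal sketch of the same pipeline without LangChain: you assemble the prompt yourself and run whatever query comes back. This is an illustration, not the actual `KuzuQAChain` code; the `openai` client and the `gpt-4` model name here are my assumptions:\n\n```python\nimport kuzu\nfrom openai import OpenAI  # assumption: any chat-completion client would do\n\ndb = kuzu.Database(\"test_db\")\nconn = kuzu.Connection(db)\n\nquestion = \"Who played in The Godfather: Part II?\"\nschema = \"...\"  # serialize your node/rel tables here, as in the prompt above\n\n# The whole trick: one string prompt with instructions, schema, and question.\nprompt = f\"\"\"Generate Cypher statement to query a graph database.\nUse only the provided relationship types and properties in the schema.\nSchema:\n{schema}\nThe question is:\n{question}\"\"\"\n\nresponse = OpenAI().chat.completions.create(\n    model=\"gpt-4\", messages=[{\"role\": \"user\", \"content\": prompt}]\n)\ncypher = response.choices[0].message.content  # the generated Cypher query\nresults = conn.execute(cypher)  # run it against the database\n```\n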
\nYou can develop similar pipelines with other GDBMSs using similar interfaces (*though I recommend using K\xf9zu as it will be the\nsimplest to get started* \ud83d\ude09: *Unlike other GDBMSs, K\xf9zu is embeddable and requires no server setup*).\nIf you instead want to build Q&A systems over your RDBMSs, you can use\nLangChain\'s [SQLDatabaseChain](https://python.langchain.com/docs/use_cases/qa_structured/sql#case-2-text-to-sql-query-and-execution) and \n[SQLAgent](https://python.langchain.com/docs/use_cases/qa_structured/sql#case-3-sql-agents) or\nLlamaIndex\'s [NLSQLTableQueryEngine](https://docs.llamaindex.ai/en/stable/examples/index_structs/struct_indices/SQLIndexDemo.html#part-1-text-to-sql-query-engine). The level of simplicity is similar to the example I presented. In practice, it is unlikely that your chatbot or search engine will be as simple\nas the above example where the application interacts with the LLM only once. If you want\nto interact with the LLM multiple times and conditionally take one action over another, etc.,\nLangChain and LlamaIndex also provide ways to do that through their \"Agents\" (see [LangChain Agents](https://python.langchain.com/docs/modules/agents/) and [Llama Index Agents](https://docs.llamaindex.ai/en/stable/use_cases/agents.html)).\n\n\n### How Good Are LLMs in Generating High-Level Queries?\nAlthough building a text-to-high-level-query-language pipeline is now very simple with LLMs,\nsimplicity **does not** mean quality. Indeed, people building these systems are now faced with the following two important questions: \n\n1. *How accurate are the high-level queries that LLMs generate?*\n2. *How, e.g., through what types of prompts or data modeling, can we increase the accuracy of the\nqueries generated by LLMs?*\n\nHere are several papers on this that I suggest reading:\n1. *[A comprehensive evaluation of ChatGPT\u2019s zero-shot Text-to-SQL capability](https://arxiv.org/pdf/2303.13547.pdf)* from Tsinghua University and University of Illinois at Chicago. \n2. *[Evaluating the Text-to-SQL Capabilities of Large Language Models](https://arxiv.org/pdf/2204.00498.pdf)* from researchers from Cambridge and universities and institutes from Montr\xe9al.\n3. *[Text-to-SQL Empowered by Large Language Models: A Benchmark Evaluation](https://arxiv.org/pdf/2308.15363.pdf)* from Alibaba Group.\n4. *[Enhancing Few-shot Text-to-SQL Capabilities of Large Language Models: A Study on Prompt Design Strategies](https://arxiv.org/pdf/2305.12586.pdf)* from Yale, Columbia, and Allen Institute for AI.\n5. *[How to Prompt LLMs for Text-to-SQL: A Study in Zero-shot, Single-domain, and Cross-domain Settings](https://arxiv.org/pdf/2305.11853.pdf)* from Ohio State.\n6. *[A Benchmark to Understand the Role of Knowledge Graphs on LLM\'s Accuracy for Q&A on Enterprise SQL Databases](https://arxiv.org/pdf/2311.07509.pdf)* from data.world.\n\nThese papers are either entirely or *almost* entirely evaluation-only papers that experiment with very detailed approaches of prompting LLMs\nto generate SQL queries. First, let me say that the general message these\npapers give (maybe except the last one) is that LLMs are pretty good. With the right prompting (or even with basic prompting)\nthey do very well on these benchmarks. I see accuracy rates over 85% on the Spider benchmark in several papers. These are clearly\nbetter numbers than what pre-LLM state-of-the-art systems achieved. 
This should be impressive to many.\n\nSecond, the set of techniques is too detailed to cover here but some example heuristics \nthese papers experiment with include the following: (i) the syntax used for providing the schema \n(apparently putting \"the pound sign `#` to differentiate prompt from response in examples yields impressive performance gains\" \ud83d\ude00 go figure); (ii)\nthe number and selection of example (question, SQL) pairs, e.g., apparently there is a sweet spot in the number\nof examples to provide; or (iii) the effects of standardizing the text in the prompt, e.g., indenting and using all lower case letters consistently\n(apparently has a minor but real effect). Yes, as interesting and important as it is to learn how to use LLMs better, I still \ncan\'t escape the following thought before going to bed: somewhere out there, some advisor might be torturing some graduate student\nto check if the magical box produces better SQL with a pound sign vs double slashes!\n\nMost work I found is on generating SQL.\nIn contrast, I found no papers that do a similar prompting study for the query languages\nof GDBMSs, though I ran into two papers that provide benchmarks for query languages of GDBMSs: \n(i) [SPARQL](https://arxiv.org/abs/2309.16248); and (ii) [Cypher](https://dl.acm.org/doi/pdf/10.1145/3511808.3557703).\nSo one low-hanging fruit for future work is the following:\n\n*Important Future Work 1: Similar prompting studies for query languages of graph DBMSs with a focus on recursive and union-of-join queries.* \nIn contrast to SQL queries, here, one should study various recursive queries that the query languages of GDBMSs are particularly good\nat and union-of-join queries, which are asked by omitting labels in the query languages of GDBMSs. \nFor example, if you want to ask for all connections between\nyour `User` nodes, and `User` nodes can have many relationships, such as `Follows`, `SentMoneyTo`, or `SameFamily`,\nyou would have to write 3 possible join queries in SQL and union them. Instead, you can write this query\nwith a very simple syntax in Cypher as \n`MATCH (a:User)-[e]->(b:User)`, where the omission of the label on the relationship `e` indicates searching over\nall possible joins.[^1] \n\n[^1]: SPARQL syntax is different but a similar advantage exists by omitting type constraints.\n\nAs a side note: In the context of any query language, including SQL, questions that require sub-queries are of particular \ninterest as they are generally harder to write. Some of the papers I read had sections analyzing the performance of\nLLMs on nested queries but the focus was not on these. In prior literature there are papers written solely on text-to-SQL generation for\nnested queries (e.g., see [the ATHENA++ paper](https://www.vldb.org/pvldb/vol13/p2747-sen.pdf)). I am certain someone\nsomewhere is already focusing solely on nested queries and that\'s a good idea.\n\n## data.world Paper and Some Interesting Questions\nIn the remainder of the post I want to review [the benchmark paper](https://arxiv.org/pdf/2311.07509.pdf) from `data.world` that focuses on text-to-SQL using LLMs. Unlike other papers out there that \nstudy the effects of different prompting heuristics, this paper studies the *effects of data modeling \non the accuracy of SQL queries generated by LLMs*, which is closely related to GDBMSs. 
\n\nSpecifically, this paper is an evaluation of the performance of GPT-4 in generating SQL using no examples, i.e., zero-shot,\nwith basic prompting over a standardized insurance database schema \ncalled the [OMG Property and Casualty Data Model](https://www.omg.org/spec/PC/1.0/About-PC). \nSee Figure 1 in the paper (omitted here) for the conceptual schema, which consists of classes such as \nPolicy, Account, Claims, Insurable Object, among others, and their relationships.\nThe paper has a benchmark of 43 natural language questions and compares 2 approaches to generating the SQL query.\nThe figure below shows an overview of these approaches for reference:\n\n
\n\n<img src={TwoSQLGenerationApproaches} />\n\n
\n\n1. Direct SQL Generation: In this approach, $Q_{NL}$ and the relational schema of the OMG database are given\n to GPT-4. The schema is given in terms of `CREATE TABLE` statements, such as:\n ```sql\n CREATE TABLE Claim(\n Claim_Identifier int NOT NULL,\n Catastrophe_Identifier int NULL,\n ...\n Claim_Open_Date datetime NULL ,\n\t ...\n\t PRIMARY KEY (Claim_Identifier ASC),\n\t FOREIGN KEY (Catastrophe_Identifier) REFERENCES Catastrophe(Catastrophe_Identifier),\n ...)\n ```\n The full schema statements can be found [here](https://github.com/datadotworld/cwd-benchmark-data/blob/main/ACME_Insurance/DDL/ACME_small.ddl).\n GPT-4 is asked to generate a SQL query $Q_{SQL}$ to answer $Q_{NL}$.\n Copy-pasted from the paper, these prompts look as follows:\n ```\n Given the database described by the following DDL:\n \n Write a SQL query that answers the following question. Do not explain the query. return just the query, so it can be run\n verbatim from your response.\n Here\u2019s the question:\n \n ```\n2. Indirect SQL Generation via Graph Modeling/SPARQL: In this approach, instead of the relational schema of the database, the same\n database is modeled as an *[OWL ontology](https://www.w3.org/OWL/)* (OWL is short for Web Ontology Language).\n Ontology is another term for schema when modeling data as a graph of classes and relationships between them. OWL is a W3C standard\n and part of the RDF technology stack, so OWL ontologies are expressed as a set of RDF triples, such as:\n ```\n ...\n in:Claim rdf:type owl:Class ;\n rdfs:isDefinedBy ;\n rdfs:label \"Claim\" .\n in:claimOpenDate rdf:type owl:DatatypeProperty ;\n rdfs:domain in:Claim ;\n rdfs:range xsd:dateTime ;\n rdfs:isDefinedBy ;\n rdfs:label \"Claim Open Date\" .\n in:hasCatastrophe rdf:type owl:ObjectProperty ;\n rdfs:domain in:Claim ;\n rdfs:range in:Catastrophe ;\n rdfs:isDefinedBy ;\n rdfs:label \"has catastrophe\" .\n ...\n ```\n The full ontology can be found [here](https://github.com/datadotworld/cwd-benchmark-data/blob/main/ACME_Insurance/ontology/insurance.ttl).\n GPT-4 is then asked to generate a SPARQL query $Q_{SPARQL}$, instead of SQL, for the same $Q_{NL}$. The full prompt, again copy-pasted\n from the paper with some simplifications, looks like this:\n ```\n Given the OWL model described in the following TTL file:\n \n Write a SPARQL query that answers the question. Do not explain the query. return just the query, so it can be run verbatim from your response.\n Here\u2019s the question:\n \n ```\n As a last step, the authors have a direct mapping from $Q_{SPARQL}$ to a SQL query $Q_{SQL}$. This is a quite straightforward step,\n as the ontology and the relational schema have direct translations from classes and properties to tables and columns.\n\nThis is an interesting comparison. There is some intuition for why one would be interested in the effectiveness of\nquery generation through an ontology because one of the well-known \npre-LLM text-to-SQL papers [ATHENA](https://www.vldb.org/pvldb/vol9/p1209-saha.pdf) did something similar.\nInstead of SPARQL, they had another query language over an ontology called Ontology Query Language, which\nwas then mapped to SQL. \n\nThe results are even more interesting. The authors categorize their 43 questions into\n4 quadrants based on 2 dimensions: \n- Low vs high **question** complexity: Questions that require only simple projections\nare low complexity. 
Those that require aggregations or math functions are high complexity.\n- Low vs high **schema** complexity: Questions whose SQL queries require up to 4 tables are low schema complexity. Those that\n require 5 or more joins are high schema complexity. \n\nThe accuracy results are shown below, with the direct SQL generation number first and the indirect generation through SPARQL second. Accuracy here is \"execution accuracy\", meaning that only the answers of the queries\nare checked against the ground truth answer. That is, even if the SQL query GPT-4 generated was actually not correct \nbut by luck it computed the correct answers, the paper takes it as correct (this apparently happens very rarely in this study).\n\n| Overall: 16.7% vs 54.2% | Low Schema Complexity | High Schema Complexity |\n| -------- | -------- | -------- |\n| Low Question Complexity | 37.4% vs 66.9% | 0% vs 38.7% |\n| High Question Complexity | 25.5% vs 71.1% | 0% vs 35.7% |\n\nOverall, the indirect SQL generation method through SPARQL is much more effective in this zero-shot setting.\nNot surprisingly, questions that require 5 or more joins are harder regardless of the \nmethod used, and direct SQL cannot get any of those questions right. These are interesting\nresults for an initial study on the effects of data modeling on LLMs\' accuracy in generating database queries. \nThese results should give many researchers and practitioners ideas about how to replicate\nand validate/invalidate similar results under different settings, e.g., with few-shot\nexamples and under different databases.\n\n**That said, one should ask, why?** In fact, we should all be suspicious that merely modeling the\nsame set of records with a different abstraction should have any visible effects. After all, by modeling\nthe same records differently, one does not obtain or lose information. So if and when LLMs are smart enough,\nthey shouldn\'t care how the data was modeled. But for now, if a pound sign can make a difference,\nwe should not be surprised that modeling choices can have large impacts. As such, it is healthy to be suspicious\nand ask why. These observations motivate a few important questions I think are worth studying. My premise\nis that if the differences are this large, it must be that the task for GPT-4 got simpler when it was\nasked to generate a SPARQL query. I can hypothesize about a few possible reasons for this: \n- *Some queries require fewer tokens to write in SPARQL*: One difference the query languages\n of GDBMSs often have is that certain equality conditions are implicit in the syntax, which\n means their `WHERE` clauses are simpler for some queries. For example, if you wanted to return\n the name of the Catastrophe that the Claim with ID Claim1 has, in SPARQL you can write it as:\n ```\n SELECT ?name\n WHERE { in:hasCatastrophe ?catastrophe,\n ?catastrophe in:catastropheName ?name }\n ``` \n In SQL you would write:\n ```\n SELECT Catastrophe_Name\n FROM Claim, Catastrophe\n WHERE Claim.Catastrophe_Identifier = Catastrophe.Catastrophe_Identifier AND\n Claim.Claim_Identifier = Claim1\n ```\n Note that the `Claim.Claim_Identifier = Claim1` equality condition is implicit in the ` in:hasCatastrophe ?catastrophe` triple\n and the `Claim.Catastrophe_Identifier = Catastrophe.Catastrophe_Identifier` condition is implicit in the fact that `?catastrophe` appears\n in both the first and second triples in the SPARQL query. Such implicit equality conditions are common in\n graph query languages, especially when expressing joins. 
For example, in Cypher you can omit all join conditions in WHERE clauses as long\n as those joins have been pre-defined to the system as relationships. Instead, you join records through the `(a)-[e]->(b)` syntax.\n It\'s unclear how much this could matter but it is an immediate advantage of SPARQL that can explain why complex join queries are easier to generate\n in SPARQL than in SQL. \n\n **Side note**: On the flip side, SPARQL can be more verbose in projections. For example, if you wanted to return the number and the open and close\n dates of every claim, you\'d write the following SQL query:\n ```\n SELECT Claim_Number, Claim_Open_Date, Claim_Close_Date\n FROM Claim\n ```\n In SPARQL, you\'d have to write both the name of each property you want to project and give it an additional variable, as follows:\n ```\n SELECT ?number ?open_date ?close_date\n WHERE { ?claim in:claimNumber ?number .\n ?claim in:claimOpenDate ?open_date .\n ?claim in:claimCloseDate ?close_date }\n ```\n- *Graph modeling gives explicit names to foreign keys:* There is a reason that database courses teach database modeling to students\n using graph-based models, such as Entity-Relationship or UML models. First, humans think of the world\n as objects/entities and their relationships. In some sense, these are higher-level models where relationships\n between objects are denoted explicitly with explicit names (instead of as less explicit foreign key constraints).\n For example, the implicit connection between Claims and\n Catastrophes through the `FOREIGN KEY (Catastrophe_Identifier) REFERENCES Catastrophe(Catastrophe_Identifier)`\n constraint was given an explicit English name: `hasCatastrophe` in the ontology. This explicitness may make\n it easier for LLMs to understand the schema and generate SPARQL queries.\n\nBoth of these are qualitative hypotheses. However, there is a more immediate\nreason the authors of this paper may have obtained such major differences between the two approaches they tried.\nIntentionally or unintentionally, their ontology is simplified significantly compared to the relational schema they have.\nFor example, the Claim relation has `Claim_Reopen_Date` and `Claim_Status_Code` properties which are removed from the ontology.\nMany such properties from the relations seem to have been removed, and the ontology overall looks simpler.\nThere are also several differences between the ontology and the relational schema that are confusing. For example,\nthe [ontology](https://github.com/datadotworld/cwd-benchmark-data/blob/main/ACME_Insurance/ontology/insurance.ttl) \nhas a class `Agent`, and `Policy` objects are `in:soldByAgent` by some Agent objects (see lines 20 and 92). I cannot\nsee corresponding relations or columns in the [relational schema](https://github.com/datadotworld/cwd-benchmark-data/blob/main/ACME_Insurance/DDL/ACME_small.ddl). Unless I am missing something about how the prompts were given, \nthese are also likely to have important effects on the results, and someone should fix these issues and obtain new results\nin a fairer comparison.\n\nLet me next raise several high-level questions that I think are important:\n\n*Important Future Work 2: Rules of thumb in data modeling to make LLM-generated queries more accurate.* \nI think the higher-level question of studying the effects of data modeling in more depth is a very good direction. \nAs LLMs get smarter, I would expect that the presence/absence of a pound sign or the style of English \nshould matter less. 
These look more like syntactic differences that can be automatically detected over time. \nModeling choices are more fundamental and relate to the clarity and understandability of the records that will be queried by the LLM. \nSo identifying some rules of thumb here looks like a promising path forward. Let me list a few immediate questions one can study:\n\n*Important Future Work 2.1: Effects of normalization/denormalization.* If the shortcoming of GPT-4 is \ngenerating queries with many joins, one way to solve this is to denormalize the relations into fewer\ntables and study its effects. Again, I\'m thinking of the same records, just modeled differently with fewer\ntables. What happens if we reduce all data into a single table with dozens of columns and many value repetitions? \nNow all possible joins would have been performed, so we\'d force the LLM to write a join-less query with\nfilters, distincts, and aggregations. What happens if we normalize the tables step by step until we \nget to a well-known form, such as [Boyce-Codd Normal Form](https://en.wikipedia.org/wiki/Boyce%E2%80%93Codd_normal_form)? Do we consistently get better or worse accuracy?\n\n*Important Future Work 2.2: Use of views.* In relational modeling, views are an effective way to obtain a higher-level \nand simpler model of your records. Similar to the $Q_{NL}$ -[LLM]-> $Q_{SPARQL}$ -[Direct Mapping]-> $Q_{SQL}$ pipeline,\none can test the effectiveness of a $Q_{NL}$ -[LLM]-> $Q_{SQL-over-Views}$ -[Direct Mapping]-> $Q_{SQL}$ pipeline.\n\n*Important Future Work 3: Use of Cypher as an intermediate query language to translate to SQL.* One reason to experiment with Cypher \nin addition to SPARQL is that Cypher is, arguably, more similar to SQL than SPARQL is, but it has the advantage that (common) join\nconditions are implicit in the `(a)-[e]->(b)` node-arrow syntax. Yet Cypher does not have the verbosity of the SPARQL projections \nI mentioned above (so you project properties the same way you project columns in SQL). In my world, all high-level query languages\nlook very similar to SQL, so eventually, when LLMs are smart enough, or even today, I think these language differences\nshould have minor effects. However, graph query languages will likely continue to have major advantages when writing\nrecursive queries, as they have specialized syntax (e.g., Cypher has the Kleene star syntax) to do so. For those queries,\nexpressing the query first in Cypher and then mapping it to SQL could lead to an advantage. \n\n## Final Words\nNeedless to say, in the next few years, the field will be flooded with work on how to \nuse LLMs to solve the text-to-high-level-query problem. Many rules of thumb will emerge\nabout how to prompt them correctly. The questions one can ask in this space are endless.\nI can speculate about it a lot, but I think it\'s plausible that \nmany of these rules of thumb, specifically the syntactic\ndifferences in prompting, will become\nobsolete very quickly as newer and more advanced LLMs that are better at speaking high-level database languages emerge.\nFor example, it\'s plausible that people will stop showing LLMs example (question, query) pairs each time they ask them to generate\nSQL once LLMs are better at speaking SQL.\n\nHowever, the harder question of how to model the data so that its meaning is clear and the\nqueries that need to be written are simpler is more likely to remain a challenge for a longer time. I would not be too optimistic\nthat there can emerge very clear answers to this question. 
How to model your data is part art and part science. \nYet, some studiable questions, such as the effects of normalization, the use of views, or generating Cypher for recursive queries,\ncan yield some important best practices that can be useful to developers building these systems.\n\nIn the next post, I will cover what I learned about RAG over unstructured data. Graphs and knowledge graphs are playing\na more interesting role in that space. Until then, happy new year to all!"},{"id":"kuzu-0.1.0-release","metadata":{"permalink":"/docusaurus/blog/kuzu-0.1.0-release","source":"@site/blog/2023-11-19-kuzu-v-0.1.0.md","title":"K\xf9zu 0.1.0 Release","description":"We are very happy to release K\xf9zu 0.1.0 today! This is a major release with the following set of new features and improvements:","date":"2023-11-19T00:00:00.000Z","formattedDate":"November 19, 2023","tags":[{"label":"release","permalink":"/docusaurus/blog/tags/release"}],"readingTime":9.075,"hasTruncateMarker":true,"authors":[{"name":"K\xf9zu Team","url":"https://github.com/kuzudb/","imageURL":"https://kuzudb.com/img/blog/team.jpg","key":"team"}],"frontMatter":{"slug":"kuzu-0.1.0-release","authors":["team"],"tags":["release"]},"prevItem":{"title":"RAG Using Structured Data: Overview & Important Questions","permalink":"/docusaurus/blog/llms-graphs-part-1"},"nextItem":{"title":"K\xf9zu 0.0.12 Release","permalink":"/docusaurus/blog/kuzu-0.0.12-release"}},"content":"We are very happy to release K\xf9zu 0.1.0 today! This is a major release with the following set of new features and improvements:\n\n\x3c!--truncate--\x3e\n\n## NodeGroup-Based Storage\n\nWith this release, we have completed the major features of our NodeGroup-based storage design,\nwhich was outlined in this [issue](https://github.com/kuzudb/kuzu/issues/1474). The primary goal of this design was to have a\nstorage design that is conducive to implementing compression and the zone maps optimization.\nConceptually, a NodeGroup is equivalent to a [Parquet RowGroup](https://parquet.apache.org/docs/concepts/), which\nrepresents a horizontal partition of a table consisting of k many nodes (k=64x2048 for now). Each group of k nodes\' data is\nmanaged and compressed as a unit in the disk files. In release v0.0.7, we had completed the first part of this design and changed our\nnode table storage to use NodeGroups. In this release, we have completed the second part of this design, and now relationship\ntables are also stored as NodeGroups. That means we now compress the relationships of k many nodes together.\n\nWe also now store all column data in a single file `data.kz`, which has significantly reduced the number of database files we maintain.\n\n### String Compression\n\nWe have extended our compression to compress strings in the database using dictionary compression.\nFor each string \"column chunk\" (which is a partition of an entire column in a table\nstoring one NodeGroup\'s values), each string s is\nstored once in a dictionary, and for each record that has value s, we store a pointer to s.\nThis design applies when storing string properties on relationship tables.\nThis is done using 3 column chunks in total. Two column chunks store the dictionary as follows. One \"raw strings\" column chunk\nstores all the unique strings in the column chunk one after another, and another \"offsets\" column chunk identifies\nthe beginning indices of each string. 
Then, one additional \"index\" column chunk stores the pointers to the strings\nas indices into the \"offsets\" column to identify the strings.\nThe offset and index columns are bit-packed in the same manner as integer columns.\n\n**String Compression Benchmark**\n\nHere is a micro-benchmark using the Comment table in LDBC100. To compare the compression rate of each column individually,\nwe construct a new table Tx for each string column x in the Comment table, e.g., `Browser Used`. Tx consists of the\ncolumn x and a serial primary key, which allows us to avoid storing any materialized hash index. We report the size of the data.kz file\nand compare against a previous version v0.0.10 of K\xf9zu.\n\n| Column | Version 0.0.10 | Version 0.1.0 | Difference |\n|---------------|----------------|----------------|------------------|\n| Browser Used | 4.2 GB | 272 MB | -93.5% |\n| Content | 9.7 GB | 7.5 GB | -22.7% |\n| Location IP | 5 GB | 1.6 GB | -68.0% |\n\nWe also report the entire LDBC100 database size, including all database files (data.kz, indices, metadata, catalog), of v0.1.0\nand a slightly older version v0.0.8, which included compression of node tables. So this experiment reports\nboth the improvements that come from storing relationship tables in compressed form and those from\nstoring strings of both node and relationship tables in compressed form.\n\n| Database | Version 0.0.8 | Version 0.1.0 | Difference |\n|----------|----------------|--------------|----------------|\n| LDBC100 | 127 GB | 94 GB | -26.0% |\n\n\n### Data Ingestion Improvements\nMoving our relationship table storage to a NodeGroup-based one also improved our\ndata ingestion times. The following benchmark reports the loading time of the LDBC100 `likesComment.csv` relationship records.\nThe file contains 242M records and takes 13 GB in raw CSV format. Below we compare v0.1.0 against v0.0.10 using a machine with\n2 Intel Xeon Platinum 8175M CPUs, each of which has 48 physical CPU cores. We used 300 GB of the 380 GB total RAM during this test.\n\n| | Version 0.0.10 | Version 0.1.0 | Difference |\n|---------|----------------|----------------| ----------------|\n| 8 threads | 266.8 s | 229.8 s | -13.9% |\n| 4 threads | 312.5 s | 246.8 s | -21.0% |\n| 2 threads | 446.7 s | 335.6 s | -24.8% |\n| 1 thread | 700.8 s | 581.9 s | -17.0% |\n\n\n## New Features\n\n### Direct Scans of DataFrames\nWe now support scanning Pandas DataFrames directly. 
Consider the following `person` DataFrame\nthat contains two columns, `id` and `height` (only the latter will be used in the example):\n\n```\nimport numpy as np\nimport pandas as pd\n\nid = np.array([0, 2, 3, 5, 7, 11, 13], dtype=np.int64)\nheight_in_cm = np.array([167, 172, 183, 199, 149, 154, 165], dtype=np.uint32)\nperson = pd.DataFrame({\'id\': id, \'height\': height_in_cm})\n```\nThe query below finds all students who are taller than the average height of the records in the `person` DataFrame:\n```\n# assumes conn is an existing kuzu.Connection to your database\nquery = \'\'\'CALL READ_PANDAS(\"person\")\n    WITH avg(height / 2.54) as height_in_inch\n    MATCH (s:student)\n    WHERE s.height > height_in_inch\n    RETURN s\'\'\'\nresults = conn.execute(query)\n```\n\nDetails of this feature can be found [here](/cypher/query-clauses/call#read_pandas).\n\n### Copy\nThis release comes with several new features related to Cypher\'s `COPY` clause.\n\n#### Copy To Parquet Files\nQuery results can now be exported to Parquet files.\n```\nCOPY ( MATCH (a:Person) RETURN a.* ) TO \"person.parquet\";\n```\n\n#### Copy To CSV Files\nWe added several configuration options when exporting to CSV files.\n```\nCOPY ( MATCH (a:Person) RETURN a.* ) TO \"person.csv\" (delim = \'|\', header=true);\n```\n\nWe also improved the performance of the CSV writer. Below is a micro-benchmark of exporting the LDBC100 Comment table to CSV format.\n```\nCOPY (MATCH (p:Comment) RETURN p.*) TO \"comment.csv\";\n```\n\n| | Version 0.0.10 | Version 0.1.0 |\n|-------------|-----------|-----------|\n| Runtime | 1239.3 s | 104.56 s |\n\n\n#### Optional `column_names` Argument in Copy From Statements\nUsers can now load data into a subset of the columns in a table. Previously, we required that if\nusers are going to load an empty table T from a file F,\ne.g., a CSV or Parquet file, then F must contain: (1) as many columns as the columns in T; and (2) in the same order as\ntable T. Now users can optionally add a `column_names` argument in `COPY FROM` statements,\nwhich relaxes both of these restrictions: (1) F can now contain a subset of the columns; and (2) in arbitrary\norder, which needs to be specified in the `column_names` argument. Here is an example:\n```\nCREATE NODE TABLE Person (id INT64, name STRING, comment STRING, PRIMARY KEY(id));\nCOPY Person (name, id) FROM \"person.csv\";\n```\nThe code above first creates a `Person` table with 3 columns, and then loads two of its columns from a file\nthat contains the `name` and `id` values, in that order.\nThe third `comment` column in the table will be set to `NULL` for all imported records. The details\nof this feature can be found [here](/cypher/copy).\n\n### Updates\n\n#### Detach Delete\n\nK\xf9zu now supports Cypher\'s [DETACH DELETE](/cypher/data-manipulation-clauses/delete#detach-delete) clause,\nwhich deletes a node and all of its relationships together.\nPreviously, users could only use the `DELETE` command, which deleted nodes that had no relationships.\nFor example, the following query deletes the `User` node with `name` Adam and all of its edges.\n```\nMATCH (u:User) WHERE u.name = \'Adam\' DETACH DELETE u;\n```\n\n#### Return Deleted Rows\n\n`RETURN` clauses can now return variable bindings that were used in the `DELETE` command. 
For example,\nyou can return nodes that were deleted by a DELETE statement as follows:\n```\nMATCH (a:Person) DELETE a RETURN a;\n```\n\nDetails of this feature can be found [here](/cypher/data-manipulation-clauses/read-after-update).\n\n### Other Changes\n\n#### SQL-style Cast Function\n\nWe have implemented a SQL-style `cast` function `cast(input, target_type)` to cast values between different\ntypes. The cast function will convert the `input` argument to the `target_type` if\ncasting of the input value to the target type is defined. For example:\n```\nRETURN cast(\"[1,2,3]\", \"INT[]\");\n--------------------------\n| CAST([1,2,3], INT32[]) |\n--------------------------\n| [1,2,3] |\n--------------------------\n```\nAlong with this, we are deprecating our previous way of doing casts with separate functions, e.g., `STRING(1.2)` or `to_int64(\"32\")`.\nDetails of the `cast` function can be found [here](/cypher/expressions/casting).\n\n#### Recursive Relationship Node Filter\n\nSince v0.0.5 we have supported filtering the intermediate relationships that can bind to\nrecursive relationships, based on the properties of these intermediate relationships.\nWith the current release, we now support filtering the intermediate nodes that are bound to recursive relationships.\nAs we did for filtering intermediate relationships, we adopt Memgraph\'s syntax for this feature as follows:\n```\nMATCH p = (a:User)-[:Follows*1..2 (r, n | WHERE n.age > 21)]->(b:User)\nRETURN p;\n```\nThe first variable `r` inside the recursive relationship above binds to the intermediate relationships, while\nthe second variable `n` binds to the intermediate nodes. The `|` symbol can be followed by a `WHERE` clause\nin which these variables can be used to express a filtering expression. This query finds all 1- to 2-hop paths between\ntwo `User` nodes where the intermediate nodes of these paths have `age` properties greater than 21.\n\n#### Count Subquery\n\nWe have added support for count subqueries, which check the number of matches of a given pattern in the graph.\nThe output of this counting can be bound to a variable with aliasing. 
For example, the following query counts the\nnumber of followers of each user in the graph.\n```\nMATCH (a:User)\nRETURN a.name, COUNT { MATCH (a)<-[:Follows]-(b:User) } AS num_follower\nORDER BY num_follower;\n```\nThe details of count subqueries can be found [here](/cypher/subquery#count-subquery).\n\n\n#### New INT128 Data Type\n\nFinally, we now have support for 16-byte signed huge integers.\n\n## Development\n\n### Nightly Build\nWe have set up a nightly build pipeline for K\xf9zu users who want to access our latest feature set.\nHere is how you can use the latest nightly version of K\xf9zu:\n\n- For the Python API, the latest nightly version can be installed with `pip install --pre kuzu`.\n- For the Node.js API, the latest nightly version can be installed with `npm i kuzu@next`.\n- For the Rust API, the latest nightly version can be found at [crates.io](https://crates.io/crates/kuzu/versions).\n- For the CLI, C and C++ shared library, and the Java JAR, the latest nightly version can be downloaded from the latest run of [this GitHub Actions pipeline](https://github.com/kuzudb/kuzu/actions/workflows/build-and-deploy.yml).\n\n### Reduced Binary Size\nWith this release, we removed our Apache Arrow dependency, which significantly reduces our binary size.\nAdditionally, we now strip the shared library and CLI binaries of the symbols that are not needed by our\nclient APIs. This further reduces our binary sizes.\nFor example, on the macOS arm64 platform, these two improvements achieve the following cumulative binary size reductions:\n\n| | Version 0.0.10 | Version 0.1.0 |\n|-------------|-----------|-----------|\n| Binary Size | 27.2 MB | 10.3 MB |\n\nStripping of our other libraries (e.g., Python) is a work in progress.\n\n## Closing Remarks\nAs usual, we would like to thank everyone in the K\xf9zu engineering team, especially our interns, for making this release possible.\nWe look forward to your feedback!\n\nEnjoy K\xf9zu v0.1.0 and the upcoming holiday season, which in this part of the world \ud83c\udde8\ud83c\udde6\ud83c\udde8\ud83c\udde6 coincides with\nthe coming of the cold and cozy winter \ud83e\udd17\ud83e\udd17."},{"id":"kuzu-0.0.12-release","metadata":{"permalink":"/docusaurus/blog/kuzu-0.0.12-release","source":"@site/blog/2023-10-31-kuzu-v-0.0.12.md","title":"K\xf9zu 0.0.12 Release","description":"We release K\xf9zu 0.0.12, another minor release. This release fixes a bug that prevents the database from being opened in read-only mode on a read-only file system. It also adds support for INT128 data type.","date":"2023-10-31T00:00:00.000Z","formattedDate":"October 31, 2023","tags":[{"label":"release","permalink":"/docusaurus/blog/tags/release"}],"readingTime":0.24,"hasTruncateMarker":false,"authors":[{"name":"K\xf9zu Team","url":"https://github.com/kuzudb/","imageURL":"https://kuzudb.com/img/blog/team.jpg","key":"team"}],"frontMatter":{"slug":"kuzu-0.0.12-release","authors":["team"],"tags":["release"]},"prevItem":{"title":"K\xf9zu 0.1.0 Release","permalink":"/docusaurus/blog/kuzu-0.1.0-release"},"nextItem":{"title":"K\xf9zuExplorer: Visualizing Query Results and Schemas","permalink":"/docusaurus/blog/kuzuexplorer"}},"content":"We release K\xf9zu 0.0.12, another minor release. This release fixes a bug that prevents the database from being opened in read-only mode on a read-only file system. 
It also adds support for the INT128 data type.\n\nFor more detailed information about the changes in this release, please see [here](https://github.com/kuzudb/kuzu/releases/tag/v0.0.12)."},{"id":"kuzuexplorer","metadata":{"permalink":"/docusaurus/blog/kuzuexplorer","source":"@site/blog/2023-10-25-kuzuexplorer/index.md","title":"K\xf9zuExplorer: Visualizing Query Results and Schemas","description":"Today, we are happy to release K\xf9zuExplorer, which is K\xf9zu\'s browser-based frontend to","date":"2023-10-25T00:00:00.000Z","formattedDate":"October 25, 2023","tags":[{"label":"release","permalink":"/docusaurus/blog/tags/release"}],"readingTime":3.445,"hasTruncateMarker":true,"authors":[{"name":"Chang Liu","url":"https://www.linkedin.com/in/mewim/","imageURL":"https://kuzudb.com/img/blog/chang.gif","key":"chang"}],"frontMatter":{"slug":"kuzuexplorer","authors":["chang"],"tags":["release"]},"prevItem":{"title":"K\xf9zu 0.0.12 Release","permalink":"/docusaurus/blog/kuzu-0.0.12-release"},"nextItem":{"title":"K\xf9zu 0.0.11 Release","permalink":"/docusaurus/blog/kuzu-0.0.11-release"}},"content":"import DatasetsImage from \'./preexisting-datasets.png\';\nimport SchemaPanelImage from \'./schema-panel.png\';\nimport ShellPanelImage from \'./query-result-node-link-view.png\';\n\n\nToday, we are happy to release K\xf9zuExplorer, which is K\xf9zu\'s browser-based frontend to\nvisualize and explore database schemas and query results in the form of a graph, a table, or JSON.\nThis is a very useful tool for exploring databases and debugging applications during the prototyping\nphase. This post gives a brief overview of the main features of K\xf9zuExplorer with pointers to\n[K\xf9zuExplorer documentation](/kuzuexplorer) for details.\n\n\x3c!--truncate--\x3e\n\n## Launching K\xf9zuExplorer\n\nK\xf9zuExplorer is a web application that is launched from a deployed Docker image. Assuming you have Docker\ninstalled, you can launch K\xf9zuExplorer on an existing database you have or on an empty database.\nDetails about how to launch K\xf9zuExplorer can be found [here](/kuzuexplorer/#launching-k\xf9zuexplorer).\nFor example, to start K\xf9zuExplorer on an empty\ndatabase, you can simply run the following command on your shell, and then access K\xf9zuExplorer by going to\n`http://localhost:8000`\n\n```\ndocker run -p 8000:8000 --rm kuzudb/kuzu-ui:latest\n```\n\nK\xf9zuExplorer comes bundled with several pre-existing databases, one of which you can use to get started.\nTo load one of these databases, click the `Datasets` tab in the top right corner of your landing page\nand then the `Load Dataset` button as shown in the below figure.\n\n
\n\n<img src={DatasetsImage} />\n\n
\n\n## Schema Panel: Schema Exploring and Editing\n\nOne of the two main functionalities of K\xf9zuExplorer is to explore and modify the schema of your database.\nBy clicking the `Schema` tab in the top right corner, you\'ll get to a page that shows you the\nNode and Relationship tables in your database in a node-link view on the left. Using the right panel,\nyou can do several things to explore and modify your tables, such as adding new properties to your\nnode/rel tables, creating new node/rel tables, or dropping node/rel tables. These changes can all be done\ninteractively by clicking buttons, which automatically generate and run the corresponding Cypher queries\n(unless you have launched K\xf9zuExplorer [in read-only mode](/kuzuexplorer/#access-mode)).\n\n
\n\n<img src={SchemaPanelImage} />\n\n
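For reference, the statements these buttons generate are ordinary Cypher DDL. For instance, the panel\'s \"add property\" action corresponds to a statement like the following (a sketch; the `User` table and `fullName` property are made-up examples):\n```\nALTER TABLE User ADD fullName STRING;\n```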
\n\nMore details\nabout what can be done in the Schema panel can be found [here](/kuzuexplorer/schema-panel).\n\n## Shell Panel: Query Result Visualization\n\nUsing K\xf9zuExplorer, you can also issue Cypher queries similar to K\xf9zu\'s\n[command line interface](/installation#command-line), and\nvisualize the results of these queries.\nTo issue queries, go to the `Shell` tab in the top right corner, where you can type a Cypher query.\nAs you type your query, the K\xf9zuExplorer shell will suggest keyword completions, which can\nhelp you write your queries. You can then click the green \"play\" icon on the left-hand\nside of the shell panel, which will execute your queries and display the results. The\nresults can be displayed in three different modes: (i) a node-link graph view; (ii) a table; or (iii) JSON.\nAs an example, the below image presents the results of the following query, which retrieves all nodes and edges\nin the database in a node-link graph view:\n\n```\nMATCH (a)-[e]->(b)\nRETURN *\n```\n\n
\n\n<img src={ShellPanelImage} />\n\n
\n\nYou can inspect individual nodes and edges in the query results by clicking on them. More details\nabout what can be done in the Shell panel can be found [here](/kuzuexplorer/shell-panel).\n\n## Settings Panel: Configuring Visualizations\n\nThere is also a Settings tab in the top right corner, which can be used for several more advanced\nsetting changes, e.g., changing the colors or sizes of nodes of a certain type (e.g., `User` nodes) or\nthe maximum number of nodes to plot in the node-link graph visualizations when visualizing query results.\nDetails of these can be found [here](/kuzuexplorer/settings-panel).\n\n## Final Words\n\nK\xf9zuExplorer should be quite useful for exploration and debugging purposes, especially when developing your applications, e.g.,\nyou can interactively debug why your queries do not return the results you expect by exploring the\nactual nodes and relationships in your database with K\xf9zuExplorer.\n\nThis is our first version of K\xf9zuExplorer and we will be improving it over time.\nWe hope you enjoy using K\xf9zuExplorer and help us make it better! Please send us any feature or documentation requests or\nbug reports by opening an issue in [K\xf9zuExplorer\'s GitHub repo](https://github.com/kuzudb/explorer)!"},{"id":"kuzu-0.0.11-release","metadata":{"permalink":"/docusaurus/blog/kuzu-0.0.11-release","source":"@site/blog/2023-10-19-kuzu-v-0.0.11.md","title":"K\xf9zu 0.0.11 Release","description":"We release K\xf9zu 0.0.11, another minor release. The main new feature of this release is read-only access mode for the database on Linux. The read-only mode enables the upcoming K\xf9zu UI to optionally open a database in read-only mode while allowing other applications to access the same database concurrently.","date":"2023-10-19T00:00:00.000Z","formattedDate":"October 19, 2023","tags":[{"label":"release","permalink":"/docusaurus/blog/tags/release"}],"readingTime":0.31,"hasTruncateMarker":false,"authors":[{"name":"K\xf9zu Team","url":"https://github.com/kuzudb/","imageURL":"https://kuzudb.com/img/blog/team.jpg","key":"team"}],"frontMatter":{"slug":"kuzu-0.0.11-release","authors":["team"],"tags":["release"]},"prevItem":{"title":"K\xf9zuExplorer: Visualizing Query Results and Schemas","permalink":"/docusaurus/blog/kuzuexplorer"},"nextItem":{"title":"K\xf9zu 0.0.10 Release","permalink":"/docusaurus/blog/kuzu-0.0.10-release"}},"content":"We release K\xf9zu 0.0.11, another minor release. The main new feature of this release is read-only access mode for the database on Linux. 
The read-only mode enables the upcoming [K\xf9zu UI](https://github.com/kuzudb/kuzu-ui) to optionally open a database in read-only mode while allowing other applications to access the same database concurrently.\\n\\nFor more detailed information about the changes in this release, please see [here](https://github.com/kuzudb/kuzu/releases/tag/v0.0.11)."},{"id":"kuzu-0.0.10-release","metadata":{"permalink":"/docusaurus/blog/kuzu-0.0.10-release","source":"@site/blog/2023-10-14-kuzu-v-0.0.10.md","title":"K\xf9zu 0.0.10 Release","description":"We\'re happy to introduce K\xf9zu 0.0.10, which is a minor release with a bunch of bug fixes and improvements:","date":"2023-10-14T00:00:00.000Z","formattedDate":"October 14, 2023","tags":[{"label":"release","permalink":"/docusaurus/blog/tags/release"}],"readingTime":0.7,"hasTruncateMarker":false,"authors":[{"name":"K\xf9zu Team","url":"https://github.com/kuzudb/","imageURL":"https://kuzudb.com/img/blog/team.jpg","key":"team"}],"frontMatter":{"slug":"kuzu-0.0.10-release","authors":["team"],"tags":["release"]},"prevItem":{"title":"K\xf9zu 0.0.11 Release","permalink":"/docusaurus/blog/kuzu-0.0.11-release"},"nextItem":{"title":"K\xf9zu 0.0.9 Release","permalink":"/docusaurus/blog/kuzu-0.0.9-release"}},"content":"We\'re happy to introduce K\xf9zu 0.0.10, which is a minor release with a bunch of bug fixes and improvements:\\n- Added the frame of reference encoding for integers. [PR 2140](https://github.com/kuzudb/kuzu/pull/2140)\\n- Fixed slicing of UTF-8 string. [PR 2212](https://github.com/kuzudb/kuzu/pull/2212)\\n- Fixed copying of invalid UTF-8. [PR 2208](https://github.com/kuzudb/kuzu/pull/2208)\\n- Added more checks and better error messages during the binding phase. [PR 2206](https://github.com/kuzudb/kuzu/pull/2206)\\n- Fixed return list literal with null values. [PR 2187](https://github.com/kuzudb/kuzu/pull/2187)\\n- Fixed bugs in scan multi label rel tables. [PR 2149](https://github.com/kuzudb/kuzu/pull/2149)\\n- Deprecated all functions for getting the table names and properties from the client APIs and the CLI, instead, `CALL` is introduced for the same functionality. [PR 2199](https://github.com/kuzudb/kuzu/pull/2199), [2207](https://github.com/kuzudb/kuzu/pull/2207)\\n- Added missing data type support in client APIs. [PR 2183](https://github.com/kuzudb/kuzu/pull/2183), [PR 2176](https://github.com/kuzudb/kuzu/pull/2176), [PR 2193](https://github.com/kuzudb/kuzu/pull/2193), [PR 2172](https://github.com/kuzudb/kuzu/pull/2172)\\n\\nFor more detailed information about the changes in this release, please see [here](https://github.com/kuzudb/kuzu/releases/tag/v0.0.10)."},{"id":"kuzu-0.0.9-release","metadata":{"permalink":"/docusaurus/blog/kuzu-0.0.9-release","source":"@site/blog/2023-10-02-kuzu-v-0.0.9.md","title":"K\xf9zu 0.0.9 Release","description":"We are very happy to release K\xf9zu 0.0.9 today! 
This release comes with the following new main features and improvements:","date":"2023-10-02T00:00:00.000Z","formattedDate":"October 2, 2023","tags":[{"label":"release","permalink":"/docusaurus/blog/tags/release"}],"readingTime":7.545,"hasTruncateMarker":true,"authors":[{"name":"K\xf9zu Team","url":"https://github.com/kuzudb/","imageURL":"https://kuzudb.com/img/blog/team.jpg","key":"team"}],"frontMatter":{"slug":"kuzu-0.0.9-release","authors":["team"],"tags":["release"]},"prevItem":{"title":"K\xf9zu 0.0.10 Release","permalink":"/docusaurus/blog/kuzu-0.0.10-release"},"nextItem":{"title":"K\xf9zu 0.0.8 Release","permalink":"/docusaurus/blog/kuzu-0.0.8-release"}},"content":"We are very happy to release K\xf9zu 0.0.9 today! This release comes with the following new main features and improvements:\n\n\x3c!--truncate--\x3e\n\n## New Features\n\n### Load From\nK\xf9zu now supports loading directly from a file without importing into the database through the `LOAD FROM` clause. For instance, the following query counts the number of rows whose first column starts with \'Adam\'.\n\n```\nLOAD FROM \"user.csv\"\nWHERE column0 =~ \'Adam*\'\nRETURN COUNT(*)\n```\n`LOAD FROM` can also be used as the input source for a bulk update.\n```\nLOAD FROM \"user.csv\"\nCREATE (:Person {name: column0, age: to_int64(column1)});\n```\n\nDetails can be found in the [LOAD FROM documentation page](/cypher/query-clauses/load_from).\n\n#### Header Schema\nBy default, K\xf9zu will read the header of the file to detect column names and types. If no header is available, it will use auto-generated names and all columns will be strings. To manually specify the header, you can use `LOAD WITH HEADERS ... FROM ...`.\n\nFor example, the following query will load `name` as a string type for the first column and `age` as an INT64 type for the second column.\n```\nLOAD WITH HEADERS (name STRING, age INT64) FROM \"user.csv\"\nWHERE name =~ \'Adam*\'\nRETURN name, age;\n```\n\nIf a header is manually specified, K\xf9zu will try to cast to the given type and throw exceptions if casting fails. More information can be found [here](/cypher/query-clauses/load_from).\n\n### Transaction Statement\nThis release replaces the `beginReadTransaction()`, `beginWriteTransaction()`, `commit()` and `rollback()` APIs in all language bindings with explicit statements.\n```\nBEGIN TRANSACTION;\nCREATE (a:User {name: \'Alice\', age: 72});\nMATCH (a:User) RETURN *;\nCOMMIT;\n```\nThe above sequence of statements starts a write transaction, adds a new node, and, within the same transaction, also reads all of the tuples in the `User` table before committing the transaction. More info on the new transaction statement can be found [here](/cypher/transaction).\n\n### Comment on Table\nYou can now add comments to a table using the `COMMENT ON TABLE` statement. The following query adds a comment to the `User` table.\n```\nCOMMENT ON TABLE User IS \'User information\';\n```\nComments can be extracted through the new `SHOW_TABLES()` function.\n```\nCALL SHOW_TABLES() RETURN *;\n----------------------------------\n| name | type | comment |\n----------------------------------\n| User | NODE | User information |\n----------------------------------\n| City | NODE | |\n----------------------------------\n```\n\n### Recursive Relationship Projection\nThis release expands recursive relationship patterns and enables projection on intermediate nodes and relationships. 
Previously, K\xf9zu only supported returning all node and relationship properties on the path.\\n```\\nMATCH (a:User)-[e:Follows*1..2 (r, n | WHERE r.since > 2020)]->(b:User)\\nRETURN nodes(e), rels(e);\\n```\\nThis incurs a significant computational overhead when a user is only interested in a subset of properties on the path. Also, returning all properties makes the result harder to interpret.\\n\\nK\xf9zu now allows projection inside recursive relationship patterns using a list-comprehension-like syntax.\\n```\\nMATCH (a:User)-[e:Follows*1..2 (r, n | WHERE r.since > 2020 | {r.since}, {n.name})]->(b:User)\\nRETURN nodes(e), rels(e);\\n```\\nThe query above finds all paths between two users that are between 1 and 2 hops long, and where the `Follows` relationships on the path started after 2020. The query returns the `since` property of any `Follows` relationships and the names of any intermediate users.\\n\\nFor more information, check out [the new documentation](/cypher/query-clauses/match#project-intermediate-nodes-and-rels).\\n\\nThe performance improvements are shown in the [Performance Improvements](#performance-improvements) section.\\n\\n### CREATE REL TABLE GROUP[^1]\\n\\nWe have received a lot of feedback regarding the limitation that a relationship can only be defined over a single pair of node tables. This release introduces a `CREATE REL TABLE GROUP` statement, which has a similar syntax to `CREATE REL TABLE`, but allows multiple `FROM ... TO ...` clauses. This statement will create a relationship table for each pair internally. When querying, a relationship group is simply syntactic sugar for any of the relationships in the group.\\n\\nFor example, the following statement creates a group containing a Knows_User_User relationship and a Knows_User_City relationship.\\n```\\nCREATE REL TABLE GROUP Knows (FROM User TO User, FROM User TO City, year INT64);\\n```\\nTo query with the group, simply treat it as any other relationship, so:\\n```\\nMATCH (a:User)-[:Knows]->(b) RETURN *;\\n```\\nThe query above is equivalent to\\n```\\nMATCH (a:User)-[:Knows_User_User|:Knows_User_City]->(b) RETURN *;\\n```\\n**Note**\\n- For `COPY FROM` and `CREATE`, we currently don\'t support using a relationship group, so you need to explicitly specify a single relationship table.\\n\\nSee [Create Table](/cypher/data-definition/create-table) for more information.\\n\\n### Data Types & Functions\\nWe introduced a few more numerical data types:\\n- INT8: 1 byte signed integer\\n- UINT64: 8 byte unsigned integer\\n- UINT32: 4 byte unsigned integer\\n- UINT16: 2 byte unsigned integer\\n- UINT8: 1 byte unsigned integer\\n\\nWe have also added several casting and list functions. See [functions](/cypher/expressions/) for more information.\\n\\n## Performance Improvements\\n\\n### New CSV and Parquet Reader\\nIn this release, we have started replacing Arrow\'s CSV and Parquet readers with our own lightweight and customized implementations.\\n\\nFollowing DuckDB\'s implementation, we\'ve replaced Arrow\'s streaming CSV reader with a parallel one. The parallel CSV reader assumes there are no multi-line strings and provides a large performance boost on multi-threaded machines.\\n\\nIf multi-line strings are present, the CSV reading will fail, and you will need to fall back to single-threaded mode by setting `parallel=false`. 
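As a minimal sketch of this fallback (the file name here is hypothetical; `parallel` is the CSV reading option mentioned above):\\n```\\nLOAD FROM \\"multiline_strings.csv\\" (parallel = false)\\nRETURN COUNT(*);\\n```\\n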
See [Data Import from CSV Files](/data-import/csv-import).\\n\\nWe demonstrate the performance of our parallel CSV reader through the new [LOAD FROM](#load-from) feature as follows.\\n```\\nLOAD FROM \\"ldbc-100/comment_0_0.csv\\" (header = true, delim = \'|\') RETURN COUNT(*);\\n```\\n\\n| # Threads | 1 | 2 | 4 | 8 | 16 |\\n| --------- | ----- | ----- | ----- | ----- | ----- |\\n| Time (s) | 297.19 | 170.71 (1.7x) | 109.38 (2.7x) | 69.01 (4.3x) | 53.28 (5.6x) |\\n\\n### Bitpacking Compression\\nWith this release, we have implemented our first compression algorithm! We are introducing the bitpacking compression algorithm for integers. It is useful when using a large integer type (e.g., INT32 or INT64) for storing small integers, which can be encoded more compactly with fewer bits. This helps both storage and query processing times.\\n\\nTo show the difference, we take the `length` column from the LDBC `Comment` table as an example, which is of type `INT32` and whose values range from 2 to 1998.\\nTogether with an auto-increment `ID` column as the primary key, we create a node table `length(ID INT64, length INT32, PRIMARY KEY(ID))`. The data file size and loading time are listed in the table below. The data file size is reduced from 2.6GB to 1.1GB (2.4x), while the loading time stays roughly the same (75.69s vs. 75.84s).\\n\\nReduced data file size also helps reduce disk I/O operations, which can improve query scan performance. We show this with a query that sums all the lengths.\\n```\\nMATCH (l:length) RETURN sum(l.length);\\n```\\nThe query time improved from 1.64s to 0.45s (3.6x)!\\n\\n| | Data size | Loading time | Query time |\\n| --------------- | --------- | -------------- | ------------ |\\n| Without compression | 2.6GB | 75.69s | 1.64s |\\n| With compression | **1.1GB (2.4x)** | **75.84s** | **0.45s (3.6x)** |\\n\\nMore compression schemes for integers, floats, and strings are coming soon. Please stay tuned!\\n\\nNote: The compression is currently only done on node tables. It will be adapted to rel tables in our next release. By default, we turn on compression for all node tables. To disable it, we provide an option when starting the database. 
For example, starting our CLI with the `--nocompress` option disables compression for all write statements to node tables.\\n\\n### General Data Loading Improvement\\nData loading time is improved due to the following changes:\\n- Parallel CSV reader.\\n- Compression means we write less data to disk.\\n- Removed line counting when copying rel tables.\\n- Dedicated casting functions to avoid string copy.\\n- Reduced hash index file size.\\n\\n| Files | # Lines | CSV file size | v0.0.8 | v0.0.9 |\\n| ---------------- | ----------- | ------------- | ----------- | ----------- |\\n| comment.csv | 220M | 22.49 GB | 187.76s | **131.48s** |\\n| person.csv | 0.45M | 43.6M | 1.16s | **0.78s** |\\n| likesComment.csv | 242M | 13 GB | 250.64s | **210.72s** |\\n| knows.csv | 20M | 1.1 GB | 24.40s | **19.54s** |\\n\\n\\n### Projection Pushdown for Recursive Joins\\nThe following two queries both compute paths along the Knows relationship with 1 to 3 hops from a single starting point, and then return the firstName of all nodes along the path.[^2]\\n\\nWithout Projection:\\n```\\nMATCH (a:Person)-[e:Knows*1..3]->(b:Person)\\nWHERE a.ID = 933\\nRETURN properties(nodes(e), \'firstName\');\\n```\\n\\nWith Projection:\\n```\\nMATCH (a:Person)-[e:Knows*1..3 (r, n | {}, {n.firstName})]->(b:Person)\\nWHERE a.ID = 933\\nRETURN properties(nodes(e), \'firstName\');\\n```\\n\\n| With projection | Without projection |\\n|---------------------- | ----------------------- |\\n| **471.9** ms | 3412.8 ms |\\n\\nWith projection, the optimizer can completely avoid materializing a hash table for relationship properties, which is a major bottleneck in the computation.\\n\\n[^1]: This is an experimental feature and might be changed in the future.\\n[^2]: This experiment was carried out on an M1 MacBook Pro with 16GB of memory and 8 threads. Sideways information passing is disabled."},{"id":"kuzu-0.0.8-release","metadata":{"permalink":"/docusaurus/blog/kuzu-0.0.8-release","source":"@site/blog/2023-08-28-kuzu-v-0.0.8.md","title":"K\xf9zu 0.0.8 Release","description":"We\'re here to introduce K\xf9zu 0.0.8, which is a minor bug-fix release with some performance optimizations:","date":"2023-08-28T00:00:00.000Z","formattedDate":"August 28, 2023","tags":[{"label":"release","permalink":"/docusaurus/blog/tags/release"}],"readingTime":0.64,"hasTruncateMarker":false,"authors":[{"name":"K\xf9zu Team","url":"https://github.com/kuzudb/","imageURL":"https://kuzudb.com/img/blog/team.jpg","key":"team"}],"frontMatter":{"slug":"kuzu-0.0.8-release","authors":["team"],"tags":["release"]},"prevItem":{"title":"K\xf9zu 0.0.9 Release","permalink":"/docusaurus/blog/kuzu-0.0.9-release"},"nextItem":{"title":"K\xf9zu 0.0.7 Release","permalink":"/docusaurus/blog/kuzu-0.0.7-release"}},"content":"We\'re here to introduce K\xf9zu 0.0.8, which is a minor bug-fix release with some performance optimizations:\\n- Fixed a major bug in COPY on large datasets. [PR 1963](https://github.com/kuzudb/kuzu/pull/1963)\\n- Implemented the [TopK optimization](https://github.com/kuzudb/kuzu/pull/1949), significantly enhancing the performance of queries that involve ORDER BY and LIMIT clauses. We will delve deeper into this optimization in a blog post. [PR 1949](https://github.com/kuzudb/kuzu/pull/1949)\\n- Added a WITH clause (CTE) rewriter, which avoids evaluating nodes and rels in a CTE projection when they are not needed for further processing. 
[PR 1956](https://github.com/kuzudb/kuzu/pull/1956)\\n- Updated our Rust docs with an example of converting query results to Arrow arrays.\\n- Fixed the size allocated for boolean values to match the size of the bit-packed data. [PR 1953](https://github.com/kuzudb/kuzu/pull/1953/files)\\n\\nFor more detailed information about the changes in this release, please see [here](https://github.com/kuzudb/kuzu/releases/tag/v0.0.8)."},{"id":"kuzu-0.0.7-release","metadata":{"permalink":"/docusaurus/blog/kuzu-0.0.7-release","source":"@site/blog/2023-08-16-kuzu-v-0.0.7.md","title":"K\xf9zu 0.0.7 Release","description":"We are very happy to release K\xf9zu 0.0.7 today! This release comes with the following new main features and improvements:","date":"2023-08-16T00:00:00.000Z","formattedDate":"August 16, 2023","tags":[{"label":"release","permalink":"/docusaurus/blog/tags/release"}],"readingTime":7.53,"hasTruncateMarker":true,"authors":[{"name":"K\xf9zu Team","url":"https://github.com/kuzudb/","imageURL":"https://kuzudb.com/img/blog/team.jpg","key":"team"}],"frontMatter":{"slug":"kuzu-0.0.7-release","authors":["team"],"tags":["release"]},"prevItem":{"title":"K\xf9zu 0.0.8 Release","permalink":"/docusaurus/blog/kuzu-0.0.8-release"},"nextItem":{"title":"IAMGraphViz: Visualizing AWS IAM Permissions with K\xf9zu","permalink":"/docusaurus/blog/iamgraphviz"}},"content":"We are very happy to release K\xf9zu 0.0.7 today! This release comes with the following new main features and improvements: \\n\\n- [Macro and UDF](2023-08-16-kuzu-v-0.0.7.md#macro-and-udf)\\n - [Create Macro Statements](2023-08-16-kuzu-v-0.0.7.md#create-macro-statements)\\n - [C++ UDFs](2023-08-16-kuzu-v-0.0.7.md#c-udfs)\\n- [Data Update and Return Clauses](2023-08-16-kuzu-v-0.0.7.md#data-update-and-return-clauses)\\n - [Merge Clause](2023-08-16-kuzu-v-0.0.7.md#merge-clause)\\n - [Multi-label Set/Delete](2023-08-16-kuzu-v-0.0.7.md#multi-label-setdelete)\\n - [Return After Update](2023-08-16-kuzu-v-0.0.7.md#return-after-update)\\n - [Return with .\\\\*](2023-08-16-kuzu-v-0.0.7.md#return-with-)\\n- [Data Export](2023-08-16-kuzu-v-0.0.7.md#data-export)\\n- [New Data Types and APIs](2023-08-16-kuzu-v-0.0.7.md#new-data-types-and-apis)\\n - [MAP](2023-08-16-kuzu-v-0.0.7.md#map)\\n - [UNION](2023-08-16-kuzu-v-0.0.7.md#union)\\n - [Converting Query Results to Arrow](2023-08-16-kuzu-v-0.0.7.md#converting-query-results-to-arrow)\\n- [NodeGroup Based Node Table Storage](2023-08-16-kuzu-v-0.0.7.md#nodegroup-based-node-table-storage)\\n- [Unnesting Arbitrary Subqueries](2023-08-16-kuzu-v-0.0.7.md#unnesting-arbitrary-subqueries)\\n\\n\x3c!--truncate--\x3e\\n\\nFor installing the new version, \\nplease visit the [download section of our website](https://kuzudb.com/#download) \\nand [getting started guide](https://kuzudb.com/docusaurus/getting-started/). The full\\n[release notes are here](https://github.com/kuzudb/kuzu/releases). \\n\\n## Macro and UDF\\n### Create Macro Statements\\nIn this release, we\'ve added support for the `CREATE MACRO` statement to define customized scalar functions, i.e., those that return only a single value, through Cypher.\\n\\nHere is an example of defining a macro that adds two input parameters. The second parameter `b:=3` is an example of how to provide a default value for a parameter in case the parameter is absent.\\n```Cypher\\n// Create a macro which adds two parameters. 
If the second parameter b is not provided, the default value of 3 will be used instead.\\ncreate macro addWithDefault(a,b:=3) as a + b;\\n// Execute the macro without the second parameter; the default value 3 is used.\\nreturn addWithDefault(2); // returns 5 (2 + 3)\\n// Execute the macro with the second parameter provided; the given value 7 overrides the default.\\nreturn addWithDefault(4, 7); // returns 11 (4 + 7)\\n```\\nSee more details on supported macro expression types [here](./../cypher/macro).\\n\\n### C++ UDFs\\nWe are also introducing two C++ interfaces, `createScalarFunction` and `createVectorizedFunction` in the `Connection` class of the [C++ API](https://kuzudb.com/docusaurus/getting-started/cpp) to define both scalar and vectorized [UDFs](./../client-apis/cpp-api/udf).\\n\\n`createScalarFunction` provides a way for users to define scalar functions in C++ and use them in K\xf9zu as if they were built-in functions.\\nHere is an example of a unary scalar function that increments the input value by 5:\\n```cpp\\nstatic int32_t addFiveScalar(int32_t x) {\\n return x + 5;\\n}\\n// Register the unary scalar function using the createScalarFunction API.\\nconn->createScalarFunction(\\"addFiveScalar\\", &addFiveScalar);\\n// Issue a query using the UDF.\\nconn->query(\\"MATCH (p:person) return addFiveScalar(to_int32(p.age))\\");\\n```\\n\\nUsers familiar with the internals of our intermediate result representation can make use of `createVectorizedFunction` to create vectorized functions over our ValueVectors to achieve better performance.\\nSee [our doc here](./../client-apis/cpp-api/udf) for more details.\\n\\n## Data Update and Return Clauses\\n### Merge Clause\\nThis release implements the `MERGE` clause, which is an updating clause that will first try to match the given pattern and, if not found, create the pattern. At a high level, `MERGE` can be interpreted as `IF MATCH THEN RETURN ELSE CREATE`. Additionally, one can further specify `SET` operations based on whether the pattern is found or not through `ON CREATE` and `ON MATCH`.\\n\\nFor example, the following query tries to merge a user node with name \\"Adam\\". Suppose a node with name \\"Adam\\" exists in the database already. In this case, we update the same node\'s `age` property and return the node (so no new node gets inserted).\\n```\\nMERGE (n:User {name : \'Adam\'}) ON MATCH SET n.age = 35 RETURN n.*;\\n------------------\\n| n.name | n.age |\\n------------------\\n| Adam | 35 |\\n------------------\\n```\\nHere is another example where we try to merge a `Follows` edge with `since` property equal to 2022 between `Adam` and `Karissa`. Suppose no such edge exists in the database; then the statement creates the edge and sets the `since` property to 1999.\\n```\\nMATCH (a:User), (b:User) \\nWHERE a.name = \'Adam\' AND b.name = \'Karissa\' \\nMERGE (a)-[e:Follows {since:2022}]->(b) \\nON CREATE SET e.since = 1999\\nRETURN e;\\n---------------------------------------------------------\\n| e |\\n---------------------------------------------------------\\n| (0:0)-{_LABEL: Follows, _ID: 0:5, since: 1999}->(0:1) |\\n---------------------------------------------------------\\n```\\nSee [our doc here](./../cypher/data-manipulation-clauses/merge) for more details.\\n\\n### Multi-label Set/Delete\\n\\nK\xf9zu now allows set/delete on nodes and relationship variables that can bind to multiple labels. 
For example,\\nto delete all nodes in the database (assuming all edges have been deleted):\\n```\\nMATCH (n) DELETE n;\\n```\\nSimilarly, to set the `since` property of all relationships in the database:\\n```\\nMATCH ()-[f]->() SET f.since = 2023\\n```\\nNote that when evaluating this query, tuples in tables that don\'t have a `since` property will be ignored.\\n\\nSee our docs in [Set](./../cypher/data-manipulation-clauses/set) and [Delete](./../cypher/data-manipulation-clauses/delete) for more details.\\n\\n### Return After Update\\n\\nStarting from this release, we also enable RETURN after updating clauses. That is, updated values can be returned by the queries that update them. Here are some examples:\\n\\n```\\nMATCH (u:User)\\nWHERE u.name = \'Adam\' SET u.age = NULL\\nRETURN u.*;\\n------------------\\n| u.name | u.age |\\n------------------\\n| Adam | |\\n------------------\\n\\nMATCH (u1:User), (u2:User)\\nWHERE u1.name = \'Adam\' AND u2.name = \'Noura\' \\nCREATE (u1)-[e:Follows {since: 2011}]->(u2)\\nRETURN e;\\n---------------------------------------------------------\\n| e |\\n---------------------------------------------------------\\n| (0:0)-{_LABEL: Follows, _ID: 0:5, since: 2011}->(0:3) |\\n---------------------------------------------------------\\n```\\n\\nSee our docs in [Set](./../cypher/data-manipulation-clauses/set) and [Delete](./../cypher/data-manipulation-clauses/delete) for more examples.\\n\\n### Return with .*\\nAs syntactic sugar, K\xf9zu now supports returning all properties of a node or rel with `.*`.\\n```\\nMATCH (a:User) RETURN a.*;\\n-------------------\\n| a.name | a.age |\\n-------------------\\n| Adam | 30 |\\n-------------------\\n| Karissa | 40 |\\n-------------------\\n| Zhang | 50 |\\n-------------------\\n| Noura | 25 |\\n-------------------\\n```\\n\\nSee [our doc here](./../cypher/query-clauses/return#returning-node-and-relationship-properties) for more details.\\n\\n## Data Export\\nK\xf9zu now supports exporting query results to CSV files using the `COPY TO` command. For example, the following\\n`COPY TO` statement produces the CSV file below.\\n```\\nCOPY (MATCH (u:User) RETURN u.*) TO \'user.csv\';\\n```\\nCSV file:\\n```\\nu.name,u.age\\n\\"Adam\\",30\\n\\"Karissa\\",40\\n\\"Zhang\\",50\\n\\"Noura\\",25\\n```\\nSee [Data Export](../data-export/) for more information.\\n\\n## New Data Types and APIs\\n### MAP\\nA `MAP` is a dictionary of key-value pairs where all keys have the same type and all values have the same type. Different from `STRUCT`, `MAP` doesn\'t require the same key to be present in each row. Therefore, `MAP` is more suitable when the schema is not predetermined.\\n\\n```\\nRETURN map([1, 2], [\'a\', \'b\']) AS m;\\n--------------\\n| m |\\n--------------\\n| {1=a, 2=b} |\\n--------------\\n```\\n\\nSee [map](../cypher/data-types/map) for more information.\\n\\n### UNION\\nK\xf9zu\'s `UNION` is implemented by taking DuckDB\'s `UNION` type as a reference. Similar to C++ `std::variant`, `UNION` is a nested data type that is capable of holding multiple alternative values with different types. 
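As a minimal sketch (this assumes the `union_value` function listed in the union functions documentation; the `num` field name is just for illustration):\\n```\\nRETURN union_value(num := 2) AS u;\\n```\\n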
The value under the \\"tag\\" key is considered the value currently held by the `UNION`.\\n\\nSee [union](../cypher/data-types/union) for more information.\\n\\n### Converting Query Results to Arrow\\nIn previous releases, we supported converting query results to Arrow tables in our [Python API](https://kuzudb.com/api-docs/python/kuzu/query_result.html#QueryResult.get_as_arrow).\\nIn this release, converting to Arrow arrays is now also available in the Rust, [C](https://kuzudb.com/api-docs/c/kuzu_8h.html) (see `kuzu_query_result_get_arrow_schema` and `kuzu_query_result_get_next_arrow_chunk`), and [C++](https://kuzudb.com/api-docs/cpp/classkuzu_1_1main_1_1_query_result.html) (see `getArrowSchema` and `getNextArrowChunk`) APIs.\\n\\n## NodeGroup Based Node Table Storage\\nThis release changes the storage layout of node tables.\\nBefore this release, we used to store each column in a node table contiguously in separate files.\\nEach column contains one data file (e.g., `n-1.col`) and one null file (e.g., `n-1.null`) if the column may contain null values.\\nThis design posed two problems: 1) it requires maintaining many files in the database directory, which may lead to a `too many open files` error; 2) it is not suitable for data compression. Although we don\'t implement compression yet (this will wait until the next few releases), this design would force us to adopt a single compression technique for the entire column. \\n\\nInstead, partitioning each column into multiple chunks can offer more flexibility as each column chunk can be compressed and decompressed independently.\\nIn this release, we introduced the concept of a [NodeGroup](https://github.com/kuzudb/kuzu/issues/1474), which is equivalent to a [RowGroup](https://parquet.apache.org/docs/concepts/) and represents a horizontal partition of a table.[^1] \\nWith the node group-based storage design, we also store the data of all columns in a single file, `data.kz`.[^2]\\nThis will enable more powerful compression schemes, e.g., constant compression, bit-packing, and dictionary compression, in the coming releases.\\nFor details on our new design, please visit [this issue](https://github.com/kuzudb/kuzu/issues/1474).\\n\\n[^1]: We use the term NodeGroup mainly because we also partition rel tables based on their src/dst nodes, instead of the number of rows.\\n[^2]: Primary key index files are still kept separately, but eventually they will also be merged into the `data.kz` file.\\n\\n## Unnesting Arbitrary Subqueries\\n\\nConsider the following query that finds the names of users `a` who follow at least 1 user `b` who is younger than `a`:\\n```\\nMATCH (a:User) \\nWHERE EXISTS { MATCH (a)-[:Follows]->(b:User) WHERE a.age > b.age} \\nRETURN a.name;\\n```\\nThe query inside `EXISTS` is a correlated subquery and very expensive to evaluate because the inner subquery needs to be evaluated for each `a` with a nested loop join operator (which is often an inefficient way to evaluate joins). In this release, we implemented an optimization that unnests correlated subqueries based on the techniques adopted from this paper [Unnesting Arbitrary Queries](https://cs.emis.de/LNI/Proceedings/Proceedings241/383.pdf) by Neumann and Kemper. This allows us to use hash joins instead of nested loop joins and execute these queries much faster. 
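To see the intuition, note that for this particular query the `EXISTS` check behaves like a semi-join, so it could be hand-rewritten into a plain join; here is a conceptual sketch of that rewrite (not the literal plan we generate):\\n```\\nMATCH (a:User)-[:Follows]->(b:User)\\nWHERE a.age > b.age\\nRETURN DISTINCT a.name;\\n```\\n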
More details will come in a separate blog post on both this technique and the gains we obtain."},{"id":"iamgraphviz","metadata":{"permalink":"/docusaurus/blog/iamgraphviz","source":"@site/blog/2023-07-19-iamgraphviz/index.md","title":"IAMGraphViz: Visualizing AWS IAM Permissions with K\xf9zu","description":"IAMGraphViz Overview","date":"2023-07-19T00:00:00.000Z","formattedDate":"July 19, 2023","tags":[{"label":"use-case","permalink":"/docusaurus/blog/tags/use-case"}],"readingTime":6.03,"hasTruncateMarker":true,"authors":[{"name":"Chris Norman","title":"Common Fate","url":"https://www.linkedin.com/in/chrnorm/?originalSubdomain=uk","image_url":"https://www.commonfate.io/_next/image?url=%2Fheadshots%2Fchris.jpg&w=3840&q=75","imageURL":"https://www.commonfate.io/_next/image?url=%2Fheadshots%2Fchris.jpg&w=3840&q=75"},{"name":"Chang Liu","url":"https://www.linkedin.com/in/mewim/","imageURL":"https://kuzudb.com/img/blog/chang.gif","key":"chang"}],"frontMatter":{"slug":"iamgraphviz","authors":[{"name":"Chris Norman","title":"Common Fate","url":"https://www.linkedin.com/in/chrnorm/?originalSubdomain=uk","image_url":"https://www.commonfate.io/_next/image?url=%2Fheadshots%2Fchris.jpg&w=3840&q=75","imageURL":"https://www.commonfate.io/_next/image?url=%2Fheadshots%2Fchris.jpg&w=3840&q=75"},"chang"],"tags":["use-case"]},"prevItem":{"title":"K\xf9zu 0.0.7 Release","permalink":"/docusaurus/blog/kuzu-0.0.7-release"},"nextItem":{"title":"K\xf9zu 0.0.6 Release","permalink":"/docusaurus/blog/kuzu-0.0.6-release"}},"content":"import SchemaImage from \'./schema.png\';\\nimport ReadOnlyVizImage from \'./readonlyviz.png\';\\nimport AdminVizImage from \'./adminviz.png\';\\n\\n\\n## IAMGraphViz Overview\\n\\n[Common Fate](https://www.commonfate.io/) is a framework for managing complex cloud permissions. They provide tools to simplify access at scale to AWS, Azure, and Google Cloud accounts. You can learn about what you can do with Common Fate on [their website](https://www.commonfate.io/). Here, we will talk about a recent proof-of-concept graph visualization tool called IAMGraphViz that [Chang Liu](https://www.linkedin.com/in/mewim/) (who is coauthoring this post) and I developed using K\xf9zu! IAMGraphViz is intended for infrastructure engineers to dig deep into the permission assignments in AWS IAM Identity Center using graph visualization. Using IAMGraphViz, one can easily visualize who has what type of access to different accounts on AWS as well as how they have access to these accounts. This is all done by analyzing the paths from users to accounts in a graph visualization, where the nodes and edges model users, accounts, groups, group memberships, permission sets and other entities in the AWS IAM Identity Center system.\\n\\n\x3c!--truncate--\x3e\\n\\nThe IAMGraphViz project is designed and implemented as a web application using a graph DBMS (GDBMS) to store and retrieve data. Before landing on K\xf9zu, we surveyed several other GDBMSs, such as Neo4j, but they were all harder to use. Neo4j, for example, requires hosting a separate database. We then discovered K\xf9zu, which only required a `pip install` and an import statement, and we could simply embed it into our application. In this project, our datasets could fit entirely on a single compute node, and so K\xf9zu was far simpler for us to work with than the alternatives. 
K\xf9zu is also far cheaper and more serverless-friendly than running a separate database.\\n\\nThis post follows the [Colab](https://colab.research.google.com/drive/1fotlNnOj1FGad6skBG7MRrHVdHd3jIl6) that Chang Liu created after we discussed this use case together.\\n\\nSo let\'s get to it!\\n\\n## Quick AWS IAM Overview\\n\\nWe will use the data model shown in the figure below that faithfully (but partially) models the\\ncore concepts of AWS IAM permission management. As background, let\'s first review these concepts, all\\nof which will be modeled as nodes in K\xf9zu.\\nWe will keep the definitions as simple as we can to keep the post short, and provide links\\nto the relevant AWS IAM documentation: \\n\\n
\\n\\n*(Figure: the IAMGraphViz graph schema; see schema.png in the original post.)*\\n\\n
\\n\\n1. **[User](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_users.html)** represents a \\nuser, e.g., an actual human user, who can get access to AWS accounts (and through accounts to AWS resources).\\n\\n2. **[Group](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_groups.html)** is a collection of IAM users and lets you specify permissions for multiple users at a time (e.g., you could have a user group called Admins with typical administrator permissions).\\nTo follow the APIs we use, instead of linking Users to Groups through a direct edge, we will do this (a bit redundantly) through a GroupMembership node.\\n\\n3. **[Account](https://docs.aws.amazon.com/organizations/latest/userguide/orgs_getting-started_concepts.html#account)**: An AWS account is the basic container for your AWS resources, such as S3 buckets,\\nAmazon Relational Database Service (Amazon RDS) databases, or Amazon Elastic Compute Cloud instances.\\nUsing multiple AWS accounts is a common practice for many reasons, e.g., providing a natural billing boundary for costs or isolating resources for security. Common Fate customers have hundreds of \\naccounts, which is not extreme.\\n\\n4. **[IAM Policy](https://docs.aws.amazon.com/IAM/latest/UserGuide/access_policies.html)**, and **[ManagedPolicy](https://docs.aws.amazon.com/IAM/latest/UserGuide/access_policies_managed-vs-inline.html#aws-managed-policies)**: An IAM policy contains permissions for using some AWS resources. An AWS managed policy is a policy with a unique Amazon Resource Name (ARN), e.g., `arn:aws:iam::aws:policy/IAMReadOnlyAccess`, that is administered by AWS. Managed policies are common policies used by many enterprises. Managed policies are simpler to use than writing your own custom policies. \\nFor simplicity, we will only model AWS managed policies in this post.\\n\\n5. **[PermissionSet](https://docs.aws.amazon.com/singlesignon/latest/userguide/permissionsetsconcept.html)** is a set of policies that can be attached to users or groups (through AccountAssignments, which we explain momentarily). For example, you can create a Database Admin permission set that includes policies for administering Amazon RDS, DynamoDB, and Aurora services, and use that single permission set to grant access to a list of target AWS accounts. Similar to GroupMembership nodes, to follow the APIs we use, instead of linking ManagedPolicy nodes to PermissionSet nodes through a direct edge, we will link them through a ManagedPolicyAttachment node.\\n\\n6. **[Account Assignment](https://aws.amazon.com/about-aws/whats-new/2020/09/aws-single-sign-on-adds-account-assignment-apis-and-aws-cloudformation-support-to-automate-multi-account-access-management/)**: We will connect users and/or groups to AWS accounts with a specific permission set through an `AccountAssignment` node (see the schema above). \\n\\n## Example Visualizations\\n\\n### Data Generation\\nIn the attached [Colab notebook](https://colab.research.google.com/drive/1fotlNnOj1FGad6skBG7MRrHVdHd3jIl6), we first generate some test data\\ncontaining Users, Groups, ManagedPolicies, PermissionSets, etc. For simplicity, we assume that there are three fixed groups: \\"Admins\\", \\"Developers\\", and \\"Auditors\\" and three ManagedPolicies: \\"AdministratorAccess\\", \\"PowerUserAccess\\", and \\"ReadOnlyAccess\\". 
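To make this concrete, a hypothetical fragment of such a schema in K\xf9zu\'s DDL could look as follows (table, property, and relationship names here are illustrative sketches, not the exact schema used in the Colab):\\n```\\nCREATE NODE TABLE User(name STRING, PRIMARY KEY (name));\\nCREATE NODE TABLE Account(sid STRING, PRIMARY KEY (sid));\\nCREATE NODE TABLE AccountAssignment(id STRING, PRIMARY KEY (id));\\nCREATE REL TABLE Assigned(FROM AccountAssignment TO Account);\\n```\\n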
Users, Accounts, \\nAccountAssignments, and PermissionSets are randomly generated, and we randomly link different nodes to\\nother nodes according to our schema.\\n\\n### Visualization 1: Draw all users with direct or indirect `ReadOnlyAccess` access to an account\\n\\nIn our first query, we are given a particular account we would like to investigate and find\\nall users who have `ReadOnlyAccess` to the resources of this account. Let\'s assume\\nthe account\'s name is \\"account-db2071\\".\\n \\n``` cypher\\nMATCH (u:User)<-[l*1..3]-(aa:AccountAssignment)-[l5]-(a:Account),\\n(aa:AccountAssignment)-[aaps]->(ps:PermissionSet)<-[psmpa]-(mpa:ManagedPolicyAttachment)-[mpap]->(p:ManagedPolicy)\\nWHERE p.id = \\"arn:aws:iam::aws:policy/ReadOnlyAccess\\" AND a.sid = \\"account-db2071\\"\\nRETURN *;\\n```\\n\\nIn the actual IAMGraphViz implementation, we template this query with two parameters, one for the \\naccount ID, and one for the managed policy, which users pick interactively by selecting from\\na dropdown menu.\\nNote also that the `[l*1..3]` binding is a variable-length path because we want to find\\nboth the direct connections from a `User` to an `AccountAssignment` (that is further connected to\\n`ManagedPolicy`) as well as \\nindirect connections through a `Group` node. The visualization we generate is shown below:\\n\\n
\\n\\n*(Figure: users with direct or indirect ReadOnlyAccess to the account; see readonlyviz.png in the original post.)*\\n\\n
\\n\\nNote the presence of both directly and indirectly connected users to the account.\\nThe visualization in both the actual implementation and the [Colab notebook](https://colab.research.google.com/drive/1fotlNnOj1FGad6skBG7MRrHVdHd3jIl6) is generated simply \\nby converting the results of the query into the node and link objects of the graph visualization library,\\ne.g., pyvis in the case of the Colab notebook.\\n\\n### Visualization 2: Draw all accounts a user has `AdministratorAccess` to\\n\\nIn our second query, we are given a particular user we would like to investigate and find all accounts that the user has `AdministratorAccess` to. Let\'s assume the user\'s name is \\"Steven Rose\\". \\n\\nTo retrieve the accounts, we define a Cypher query very similar to the previous one. The only difference is that, instead of using the account as the query predicate, we now use the user. The query is as follows:\\n\\n``` cypher\\nMATCH (u:User)<-[l*1..3]-(aa:AccountAssignment)-[l5]-(a:Account),\\n(aa:AccountAssignment)-[aaps]->(ps:PermissionSet)<-[psmpa]-(mpa:ManagedPolicyAttachment)-[mpap]->(p:ManagedPolicy)\\nWHERE p.id = \\"arn:aws:iam::aws:policy/AdministratorAccess\\" AND u.name = \\"Steven Rose\\"\\nRETURN *;\\n```\\n\\nThe visualization we generate is shown below:\\n\\n
\\n\\n*(Figure: accounts the user has AdministratorAccess to; see adminviz.png in the original post.)*\\n\\n
\\n\\n## Closing Words\\nMany other graph visualizations can be helpful for infrastructure engineers to analyze the \\nIAM network of an enterprise. For example, to find inconsistent privileges given to users,\\nwe might want to *find and plot multiple paths from a user to an account with different privileges*.\\nOr we might want to extend our model with more fine-grained resources that are connected to accounts\\nand analyze paths from users to these resources (see the [PMapper](https://github.com/nccgroup/PMapper) project that models the IAM data in a more detailed way). The key takeaway is this: graph visualizations can be very powerful for analyzing cloud permission data, and embedding K\xf9zu into your applications\\nto develop tools like IAMGraphViz is extremely easy and fun \ud83e\udd73\ud83d\ude4c\ud83d\udcaa!"},{"id":"kuzu-0.0.6-release","metadata":{"permalink":"/docusaurus/blog/kuzu-0.0.6-release","source":"@site/blog/2023-07-17-kuzu-v-0.0.6.md","title":"K\xf9zu 0.0.6 Release","description":"We are thrilled to announce the release of K\xf9zu 0.0.6, which focuses on addressing bugs reported by our users. We addressed the following issues in this bug-fix release:","date":"2023-07-17T00:00:00.000Z","formattedDate":"July 17, 2023","tags":[{"label":"release","permalink":"/docusaurus/blog/tags/release"}],"readingTime":0.575,"hasTruncateMarker":false,"authors":[{"name":"K\xf9zu Team","url":"https://github.com/kuzudb/","imageURL":"https://kuzudb.com/img/blog/team.jpg","key":"team"}],"frontMatter":{"slug":"kuzu-0.0.6-release","authors":["team"],"tags":["release"]},"prevItem":{"title":"IAMGraphViz: Visualizing AWS IAM Permissions with K\xf9zu","permalink":"/docusaurus/blog/iamgraphviz"},"nextItem":{"title":"K\xf9zu 0.0.5 Release","permalink":"/docusaurus/blog/kuzu-0.0.5-release"}},"content":"We are thrilled to announce the release of K\xf9zu 0.0.6, which focuses on addressing bugs reported by our users. We addressed the following issues in this bug-fix release:\\n\\n1. Resolved a segmentation fault occurring while loading overflow data types with parallelism.\\n2. Fixed an out-of-bounds read of the LIST vector null buffer.\\n3. Implemented several missing data types in the C, Java, Rust, and Python API bindings.\\n\\nFor more detailed information about the changes in this release, please visit [this link](https://github.com/kuzudb/kuzu/releases/tag/v0.0.6). \\n\\nWe extend our sincere gratitude to all our users who reported these bugs, as well as to everyone who supported us throughout this process. Your feedback is instrumental in making K\xf9zu better!"},{"id":"kuzu-0.0.5-release","metadata":{"permalink":"/docusaurus/blog/kuzu-0.0.5-release","source":"@site/blog/2023-07-10-kuzu-v-0.0.5.md","title":"K\xf9zu 0.0.5 Release","description":"We are very happy to release K\xf9zu 0.0.5 today! 
This release comes with the following new main features and improvements:","date":"2023-07-10T00:00:00.000Z","formattedDate":"July 10, 2023","tags":[{"label":"release","permalink":"/docusaurus/blog/tags/release"}],"readingTime":4.14,"hasTruncateMarker":true,"authors":[{"name":"K\xf9zu Team","url":"https://github.com/kuzudb/","imageURL":"https://kuzudb.com/img/blog/team.jpg","key":"team"}],"frontMatter":{"slug":"kuzu-0.0.5-release","authors":["team"],"tags":["release"]},"prevItem":{"title":"K\xf9zu 0.0.6 Release","permalink":"/docusaurus/blog/kuzu-0.0.6-release"},"nextItem":{"title":"K\xf9zu 0.0.4 Release","permalink":"/docusaurus/blog/kuzu-0.0.4-release"}},"content":"We are very happy to release K\xf9zu 0.0.5 today! This release comes with the following new main features and improvements: \\n\\n- [Cypher Features](2023-07-10-kuzu-v-0.0.5.md#cypher-features)\\n - [Named Path](2023-07-10-kuzu-v-0.0.5.md#named-path)\\n - [Filters of Relationships in Recursive Patterns](2023-07-10-kuzu-v-0.0.5.md#filters-of-relationships-in-recursive-patterns)\\n - [All Shortest Paths](2023-07-10-kuzu-v-0.0.5.md#all-shortest-paths)\\n - [`Call` Clause](2023-07-10-kuzu-v-0.0.5.md#call-clause)\\n- [Modifying Database Configuration](2023-07-10-kuzu-v-0.0.5.md#modifying-database-configurations)\\n- [Data Types](2023-07-10-kuzu-v-0.0.5.md#data-types)\\n - [`BLOB`](2023-07-10-kuzu-v-0.0.5.md#blob)\\n- [Client APIs: Rust and Java](2023-07-10-kuzu-v-0.0.5.md#client-apis-rust-and-java)\\n- [Development: Testing Framework](2023-07-10-kuzu-v-0.0.5.md#development-testing-framework)\\n\\n\x3c!--truncate--\x3e\\n\\n## Cypher Features\\n\\n### Named Paths\\nThis release introduces named paths. Users can now assign a named variable to a connected graph pattern. For example, the following query returns all paths between `Adam` and `Karissa`.\\n```\\nMATCH p = (a:User)-[:Follows]->(b:User) \\nWHERE a.name = \'Adam\' AND b.name = \'Karissa\' \\nRETURN p;\\n```\\nNamed paths can also be assigned to recursive graph patterns as follows:\\n```\\nMATCH p = (a:User)-[:Follows*1..2]->(:User)-[:LivesIn]->(:City) \\nWHERE a.name = \'Adam\' \\nRETURN p;\\n```\\nOne can also assign multiple named paths in a `MATCH` clause:\\n```\\nMATCH p1 = (a:User)-[:Follows]->(b:User), p2 = (b)-[:LivesIn]->(:City) \\nWHERE a.name = \'Adam\' \\nRETURN p1, p2;\\n```\\nInternally, a path is processed as a `STRUCT` with two fields: a nodes field with key `_NODES` and type `LIST[NODE]`, and a rels field with key `_RELS` and type `LIST[REL]`. See [`PATH`](https://kuzudb.com/docusaurus/cypher/data-types/path) for details. Users can access the nodes and rels fields with `nodes(p)` and `rels(p)` function calls as follows:\\n```\\nMATCH p = (a:User)-[:Follows*1..2]->(:User) \\nWHERE a.name = \'Adam\' \\nRETURN nodes(p), (rels(p)[1]).since;\\n```\\n\\n### Filters of Relationships in Recursive Patterns\\nUsers can now put predicates on the relationships that will be \\"traversed/joined\\" in recursive patterns.\\nFor example, the following query finds the names of users that are followed by Adam directly or indirectly within 2 hops, where *the following started before 2022 (the `r.since < 2022` predicate)*:\\n```\\nMATCH p = (a:User)-[:Follows*1..2 (r, _ | WHERE r.since < 2022)]->(b:User)\\nWHERE a.name = \'Adam\' \\nRETURN DISTINCT b.name;\\n```\\nOur filter grammar follows [Memgraph\'s syntax](https://memgraph.com/docs/memgraph/reference-guide/built-in-graph-algorithms). 
The first variable `r` in the `(r, _ | WHERE r.since < 2022)` predicate binds to the relationships in the recursive pattern and the `_` binds to the nodes. Since we currently don\'t allow filters on recursive nodes, the second variable must be `_` for now.\\n\\n### All Shortest Paths\\nK\xf9zu now supports all shortest paths semantics with the keyword `ALL SHORTEST`. The following query finds all shortest paths of up to length 3 between `Zhang` and `Waterloo` considering relationships of all labels (i.e., this is an unlabeled query and you can restrict the labels by adding them as `[:Follows* ALL SHORTEST 1..3]`).\\n```\\nMATCH p = (a)-[* ALL SHORTEST 1..3]-(b) \\nWHERE a.name = \'Zhang\' AND b.name = \'Waterloo\' \\nRETURN p;\\n```\\nSee [All Shortest Paths](https://kuzudb.com/docusaurus/cypher/query-clauses/match#all-shortest-path) on our documentation for more information.\\n\\n### `Call` Clause\\n\\nThis release introduces `Call` as a reading clause. Similar to [Neo4j](https://neo4j.com/docs/cypher-manual/current/clauses/call/), the `Call` clause is used to execute procedures. The release also contains a set of predefined procedures that can be used to query the database schema. For example, the following query returns all metadata of the `User` table:\\n```\\nCALL table_info(\'User\') RETURN *;\\n---------------------------------------------\\n| property id | name | type | primary key |\\n---------------------------------------------\\n| 0 | name | STRING | True |\\n---------------------------------------------\\n| 1 | age | INT64 | False |\\n---------------------------------------------\\n```\\n\\n`Call` can be used together with other clauses in the same way as a reading clause:\\n```\\nCALL table_info(\'User\') WITH * WHERE name STARTS WITH \'a\' RETURN name;\\n--------\\n| name |\\n--------\\n| age |\\n--------\\n```\\n\\nMore built-in procedures can be found [here](https://kuzudb.com/docusaurus/cypher/query-clauses/call).\\n\\n## Modifying Database Configurations\\n\\n`CALL` has another usage: you can now modify database configurations through a `Call param=x` pattern. For example, the following sets the maximum number of threads for query execution to 5:\\n```\\nCALL THREADS=5;\\n```\\n\\nMore configuration options can be found [here](https://kuzudb.com/docusaurus/cypher/configuration).\\n\\n## Data Types\\n\\n### `BLOB`\\n\\nWe have also added the `BLOB` type to store arbitrary binary objects. Here is an example query returning a blob:\\n\\n```\\nRETURN BLOB(\'\\\\\\\\xBC\\\\\\\\xBD\\\\\\\\xBA\\\\\\\\xAA\') as result;\\n---------------------------------------------\\n| result |\\n---------------------------------------------\\n| \\\\xBC\\\\xBD\\\\xBA\\\\xAA |\\n---------------------------------------------\\n```\\n\\nMore information on the blob data type can be found [here](https://kuzudb.com/docusaurus/cypher/data-types/blob).\\n\\n## Client APIs: Rust and Java\\nIn this release, we\'re expanding the accessibility of K\xf9zu, bridging the gap with some of the most popular programming languages in the developer community. 
Specifically, we now have [Rust](https://kuzudb.com/docusaurus/client-apis/rust) and [Java](https://kuzudb.com/docusaurus/client-apis/java) APIs.\\n\\n## Development: Testing Framework\\nStarting with this release, we\'re adding some development guidelines to encourage and facilitate outside contributions from the broader open source community.\\n\\nTesting is a crucial part of K\xf9zu development to ensure the correct functioning of the system.\\nIn this release, we\'ve implemented significant changes to our testing framework. Our approach to testing is rooted in the principle of end-to-end tests rather than individual unit tests.\\nWhenever possible, we route all tests end-to-end through Cypher statements. \\nTo this end, we\'ve designed a custom testing framework that enables thorough end-to-end testing via Cypher statements.\\n\\nOur testing framework draws inspiration from [SQLLogicTest](https://www.sqlite.org/sqllogictest/doc/trunk/about.wiki), albeit with customized syntax tailored to our needs.\\nFor a more detailed overview of our testing framework, please visit [here](https://kuzudb.com/docusaurus/development/testing-framework)."},{"id":"kuzu-0.0.4-release","metadata":{"permalink":"/docusaurus/blog/kuzu-0.0.4-release","source":"@site/blog/2023-06-05-kuzu-v-0.0.4.md","title":"K\xf9zu 0.0.4 Release","description":"We are very happy to release K\xf9zu 0.0.4 today! This release comes with the following new main features and improvements:","date":"2023-06-05T00:00:00.000Z","formattedDate":"June 5, 2023","tags":[{"label":"release","permalink":"/docusaurus/blog/tags/release"}],"readingTime":7.01,"hasTruncateMarker":true,"authors":[{"name":"K\xf9zu Team","url":"https://github.com/kuzudb/","imageURL":"https://kuzudb.com/img/blog/team.jpg","key":"team"}],"frontMatter":{"slug":"kuzu-0.0.4-release","authors":["team"],"tags":["release"]},"prevItem":{"title":"K\xf9zu 0.0.5 Release","permalink":"/docusaurus/blog/kuzu-0.0.5-release"},"nextItem":{"title":"Scaling Pytorch Geometric GNNs With K\xf9zu","permalink":"/docusaurus/blog/kuzu-pyg-remote-backend"}},"content":"We are very happy to release K\xf9zu 0.0.4 today! This release comes with the following new main features and improvements: \\n- [Data Ingestion Improvements](2023-06-05-kuzu-v-0.0.4.md#data-ingestion-improvements)\\n- [New Cypher Features](2023-06-05-kuzu-v-0.0.4.md#new-cypher-features)\\n - [Undirected Relationships in Queries](2023-06-05-kuzu-v-0.0.4.md#undirected-relationships-in-queries)\\n - [Recursive Queries: Shortest Path Queries and Improved Variable-length Queries](2023-06-05-kuzu-v-0.0.4.md#recursive-queries-shortest-path-queries-and-improved-variable-length-queries)\\n- [New Data Types](2023-06-05-kuzu-v-0.0.4.md#new-data-types)\\n - [`SERIAL`](2023-06-05-kuzu-v-0.0.4.md#serial)\\n - [`STRUCT`](2023-06-05-kuzu-v-0.0.4.md#struct)\\n- [Client APIs](2023-06-05-kuzu-v-0.0.4.md#client-apis)\\n - [Windows compatibility](2023-06-05-kuzu-v-0.0.4.md#windows-compatibility)\\n - [C](2023-06-05-kuzu-v-0.0.4.md#c)\\n - [Node.js](2023-06-05-kuzu-v-0.0.4.md#nodejs)\\n\x3c!--truncate--\x3e\\n\\n\\n## Data Ingestion Improvements\\nWe continue to improve our data ingestion in this release. 
\\nWe still rely on Apache Arrow to parse Parquet and CSV files.\\nSeveral bottlenecks in our earlier implementation have now been identified and optimized, including copying from Arrow arrays and constructing hash indexes.\\nWe now also store null bits separately, which simplifies our loading logic and makes it faster.\\n\\nHere are some benchmark numbers for loading two node and two rel tables that only contain primitive types or strings from the LDBC benchmark:\\n\\n- CPU: Apple M1 Max\\n- Disk: 2TB SSD\\n- System Memory: 32GB\\n- Dataset: LDBC-100\\n- Number of threads: 10\\n\\n| Files | # lines | File size | v0.0.3 | v0.0.4 |\\n| ----------- | ----------- | ----------- | ----------- | ----------- |\\n| comment.csv | 220M | 22.49 GB | 890s | **108s (8.2x)** |\\n| post.csv | 58M | 7.68 GB | 304s | **32s (9.5x)** |\\n| likesComment.csv | 242M | 13 GB | 772s | **142s (5.4x)** |\\n| knows.csv | 20M | 1.1 GB | 80s | **21s (3.8x)** |\\n\\nBesides performance improvements, we now also allow interrupting `COPY` statements in the shell.\\nYou can interrupt long-running `COPY` statements without crashing the shell.\\n\\nWe will continue to improve our data ingestion to make it more efficient and robust as we\'re moving to the [new storage design](https://github.com/kuzudb/kuzu/issues/1474) in the coming releases. Please stay tuned!\\n\\n## New Cypher Features\\n\\n### Undirected Relationships in Queries\\nK\xf9zu now supports undirected relationships in Cypher queries. An undirected relationship is the union of both incoming and outgoing relationships. This feature is mostly useful in the following two cases. \\n\\n**Case 1: Relationship is undirected by nature**\\nRelationships between nodes in K\xf9zu are currently directed (though we are internally debating adding a native undirected relationship type). A relationship file must contain `FROM` and `TO` columns, each of which refers to a primary key column of a node table. However, sometimes the nature of the relationship is undirected, e.g., an `isFriendOf` relationship in a social network. \\n\\nCurrently, you have two options: (1) you can either store each friendship twice, e.g., `Alice isFriendOf Bob` and `Bob isFriendOf Alice`. This is a bad choice because internally K\xf9zu will index each edge twice (in the forward and backward directions), so this one fact ends up getting stored 4 times. Or (2) you can store it once, say `Alice isFriendOf Bob`. \\n\\nThe advantage of option (1) was that in K\xf9zu v0.0.3, if you wanted to find all friends of `Alice`, you could simply ask this query:\\n```\\nMATCH (a:Person)-[:isFriendOf]->(b:Person)\\nWHERE a.name = \'Alice\' RETURN b;\\n```\\nInstead, if you chose option (2), you would have to ask two queries, one to `MATCH (a:Person)-[:isFriendOf]->(b:Person)` and the other to `MATCH (a:Person)<-[:isFriendOf]-(b:Person)`, and `UNION` them, which gets messy if you want to do more with those neighbors (e.g., find their neighbors etc.). \\n\\nWith undirected edge support, you can now choose option (2) and find `Alice`\'s friends with:\\n```\\nMATCH (a:Person)-[:isFriendOf]-(b:Person)\\nWHERE a.name = \'Alice\'\\nRETURN b;\\n```\\nSo if you do not specify a direction in your relationships, K\xf9zu will automatically query both the forward and backward relationships for you.\\n\\n*Note from K\xf9zu developers: As noted above, we are debating a native undirected relationship type. That would solve the problem of which fake direction an undirected relationship should be saved in: 
should it be `Alice-[isFriendOf]->Bob`, or vice versa? We are happy to hear your thoughts on this.*\\n\\n**Case 2: Relationship direction is not of interest**\\nAlthough relationships are stored in a directed way, their direction may not be of interest in the query. The following query tries to find all comments that have interacted with the comment authored by `K\xf9zu`. These comments could either be replying to or replied to by `K\xf9zu`\'s comment. The query can be asked naturally in an undirected way.\\n\\n```\\nMATCH (c:Comment)-[:replyOf]-(other:Comment)\\nWHERE c.author = \'K\xf9zu\'\\nRETURN other;\\n```\\n\\n### Recursive Queries: Shortest Path Queries and Improved Variable-length Queries\\nThis release brings in the beginnings of a series of major improvements we will make to recursive joins.\\nThe two major changes in this release are: \\n\\n**Multi-labeled and Undirected Variable-length Join Queries**\\nPrior to this release, we supported variable-length join queries only in the restricted case when the variable-length relationship could have a single relationship label and was directed. For example, you could write this query:\\n```\\nMATCH (a:Person)-[:knows*1..2]->(b:Person)\\nWHERE a.name = \'Alice\' \\nRETURN b\\n```\\nBut you couldn\'t ask for arbitrary labeled variable-length relationships between Persons `a` and `b` (though you\\ncould write the non-recursive version of that query: `MATCH (a:Person)-[:knows]->(b:Person) ...`). \\nSimilarly, we did not support the undirected version of the query: `MATCH (a:Person)-[:knows*1..2]-(b:Person)`.\\nK\xf9zu now supports multi-label as well as undirected variable-length relationships.\\nFor example, the following query finds all nodes that are reachable within 1 to 3 hops from `Alice`, irrespective\\nof the labels on the connections or destination `b` nodes:\\n```\\nMATCH (a:Person)-[e:*1..3]-(b)\\nWHERE a.name = \'Alice\'\\nRETURN b;\\n```\\n\\n**Shortest path**\\n\\nFinally, we got to implementing an initial version of shortest path queries. You can find (one of the) shortest paths between nodes by adding the `SHORTEST` keyword to a variable-length relationship. The following query asks for a shortest path between `Alice` and all active users that `Alice` follows within 10 hops, and returns these users, the paths, and the lengths of the paths.\\n\\n```\\nMATCH (a:User)-[p:Follows* SHORTEST 1..10]->(b:User)\\nWHERE a.name = \'Alice\' AND b.state = \'Active\'\\nRETURN b, p, length(p)\\n```\\n\\nThe `p` in the query binds to the sequence of relationship, node, relationship, node, etc. Currently we only return the internal IDs of the relationships and nodes (soon, we will return all their properties).\\n\\n## New Data Types\\n\\n### `SERIAL`\\nThis release introduces the `SERIAL` data type. 
Similar to `AUTO_INCREMENT` supported by many other databases, `SERIAL` is mainly used to create \\nan incrementing sequence of unique identifiers in a column, which can serve as the primary key column.\\n\\nExample:\\n\\n`person.csv`\\n```\\nAlice\\nBob\\nCarol\\n```\\n\\n```\\nCREATE NODE TABLE Person(ID SERIAL, name STRING, PRIMARY KEY(ID));\\nCOPY Person FROM \'person.csv\';\\nMATCH (a:Person) RETURN a;\\n```\\nOutput:\\n```\\n-------------------------------------------\\n| a |\\n-------------------------------------------\\n| (label:Person, 3:0, {ID:0, name:Alice}) |\\n-------------------------------------------\\n| (label:Person, 3:1, {ID:1, name:Bob}) |\\n-------------------------------------------\\n| (label:Person, 3:2, {ID:2, name:Carol}) |\\n-------------------------------------------\\n```\\n\\nWhen the primary keys of your node table are already consecutive integers starting from 0, you should omit the primary key column in the input file and make the primary key a SERIAL type. This will improve loading time significantly. Similarly, queries that need to scan the primary key will also get faster. That\'s because internally we store neither a hash index nor a primary key column, so any scan over the primary key will not trigger disk I/O.\\n\\n### `STRUCT`\\nK\xf9zu now supports the `STRUCT` data type, similar to [composite types](https://www.postgresql.org/docs/current/rowtypes.html) in Postgres. Here is an example:\\n\\n```\\nWITH {name:\'University of Waterloo\', province:\'ON\'} AS institution\\nRETURN institution.name AS name;\\n```\\nOutput:\\n```\\n--------------------------\\n| name |\\n--------------------------\\n| University of Waterloo |\\n--------------------------\\n```\\nWe support storing structs as node properties for now. For example, you can create: `CREATE NODE TABLE Foo(name STRING, exStruct STRUCT(x INT16, y STRUCT(z INT64, w STRING)), PRIMARY KEY (name))`. We will support storing structs on relationships soon. As shown in the `CREATE NODE` example above, you can store arbitrarily\\nnested structs, e.g., structs that contain structs as a field, on nodes. One missing feature for now is storing and processing `LIST`s of composite types. \\n\\n**Note**: Updating `STRUCT` columns with update statements is not supported in this release but will come soon.\\n\\n## Client APIs\\n\\n### Windows compatibility\\nDevelopers can now build K\xf9zu from scratch on the Windows platform! Together with this release, we also provide pre-built libraries and Python wheels on Windows.\\n\\n### C\\nWe provide an official C language binding in this release. Developers can now embed K\xf9zu with native C interfaces.\\n\\n### Node.js\\nWe provide an official Node.js language binding. With the Node.js API, developers can leverage K\xf9zu\'s analytical capabilities in their Node.js projects. 
We will\\nsoon follow this blog post with one (or a few) blog posts on developing some applications with Node.js."},{"id":"kuzu-pyg-remote-backend","metadata":{"permalink":"/docusaurus/blog/kuzu-pyg-remote-backend","source":"@site/blog/2023-05-10-kuzu-pyg-rb.md","title":"Scaling Pytorch Geometric GNNs With K\xf9zu","description":"In this post, we\'ll walk through how to use K\xf9zu as a Pytorch Geometric (PyG) Remote Backend to train a GNN model on very large graphs that do not fit on your machine\'s RAM.","date":"2023-05-10T00:00:00.000Z","formattedDate":"May 10, 2023","tags":[{"label":"use-case","permalink":"/docusaurus/blog/tags/use-case"}],"readingTime":12.39,"hasTruncateMarker":true,"authors":[{"name":"Chang Liu","url":"https://www.linkedin.com/in/mewim/","imageURL":"https://kuzudb.com/img/blog/chang.gif","key":"chang"},{"name":"Semih Saliho\u011flu","title":"CEO of K\xf9zu Inc & Associate Prof. at UWaterloo","url":"https://cs.uwaterloo.ca/~ssalihog/","imageURL":"https://kuzudb.com/img/blog/semih.jpg","key":"semih"}],"frontMatter":{"slug":"kuzu-pyg-remote-backend","authors":["chang","semih"],"tags":["use-case"]},"prevItem":{"title":"K\xf9zu 0.0.4 Release","permalink":"/docusaurus/blog/kuzu-0.0.4-release"},"nextItem":{"title":"K\xf9zu 0.0.3 Release","permalink":"/docusaurus/blog/kuzu-0.0.3-release"}},"content":"In this post, we\'ll walk through how to use K\xf9zu as a [Pytorch Geometric (PyG) _Remote Backend_](https://pytorch-geometric.readthedocs.io/en/latest/advanced/remote.html) to train a GNN model on very large graphs that do not fit on your machine\'s RAM. \\n\\n\\nLet\'s start with a quick overview of PyG Remote Backends: PyG Remote Backends are plug-in replacements for PyG\'s in-memory graph and feature stores, so they can be used seamlessly with the rest of the PyG interfaces to develop your GNN models. If a PyG Remote Backend is a disk-based storage system, such as K\xf9zu, PyG will fetch subgraphs from K\xf9zu, which stores and scans its data from disk, allowing you to train models on very large graphs for which PyG\'s in-memory storage would run out of memory and fail.\\n\\n\x3c!--truncate--\x3e\\n\\nAs you\'ll see, if you already have PyG models you have developed in Python, replacing PyG\'s default storage with K\xf9zu is extremely simple. ***It \\nconsists of loading your graph into K\xf9zu and then changing 1 line of code in your PyG model***. To demonstrate how simple this is and how it performs,\\nwe will follow this [Sample Code](https://github.com/pyg-team/pytorch_geometric/tree/master/examples/kuzu/papers_100M).\\nSo let\'s get to it!\\n\\n## Dataset, Predictive Task, and GNN Model\\n\\nLet\'s start by describing our graph dataset, our predictive task, and the GNN model we will use for the predictive task.\\n\\n**Dataset**: We will use the `ogbn-papers100M` dataset of ~100M nodes and ~2.5B edges from the [Open Graph Benchmark](https://ogb.stanford.edu/) (OGB). To find the dataset,\\nyou can search for \\"ogbn-papers100M\\" [here](https://ogb.stanford.edu/docs/nodeprop/). The dataset takes about 128GB of RAM when using PyG\'s default in-memory storage. 
The graph\'s nodes and edges model the following:\\n\\n_Nodes_ are papers that have these properties:\\n\\n- `ID`: an int64 node identifier\\n- `year`: the publication date of the paper (you can ignore this, as it will not be used in our example, but this property is part of the dataset)\\n- `x`: 128-dimensional node features (so 128-size float tensors)\\n- `y`: a numeric label indicating the category/field of the paper. These numbers indicate different [arXiv categories](https://arxiv.org/category_taxonomy) for\\n papers. Although the exact mapping is not important, you can think of these, for example, as 0 indicating \\"physics\\", 2 indicating \\"geometry\\", etc.\\n\\n_Edges/Relationships_ are citations between papers and do not contain any properties.\\n\\n**Predictive task:** Predict the `y` labels of nodes using the node features stored in the `x` properties.\\n\\n**GNN Model**: We will train a 3-layer GraphSage model that contains 5.6 million parameters to perform this predictive task. Our model is based on the implementation [here](https://github.com/mengyangniu/ogbn-papers100m-sage/tree/main). We picked this model because it was one of the better-performing models in the [PyG Leaderboard for the ogbn-papers100M dataset](https://ogb.stanford.edu/docs/leader_nodeprop/) (search \\"GraphSAGE_res_incep\\" under \\"Leaderboard for ogbn-papers100M\\") that we could develop using pre-existing layers in the PyG library (so we do not have to write any custom layers).\\n\\n## Step 1: Preliminaries and Loading ogbn-papers100M into K\xf9zu\\n\\nAs a preliminary, the [`prepare_data.py`](https://github.com/pyg-team/pytorch_geometric/blob/master/examples/kuzu/papers_100M/prepare_data.py) script in the [Sample Code](https://github.com/pyg-team/pytorch_geometric/tree/master/examples/kuzu/papers_100M) generates four numpy files, one for each property of the papers: (i) `./ids.npy`; (ii) `./node_feat.npy` (storing the `x` properties); (iii) `./node_year.npy`; and (iv) `./node_label.npy` (storing the `y` labels). In addition, it will generate an `./edge_index.csv` file that stores the citation relationships. In the code snippets below, we will assume you have gone through those steps.\\n\\nLet\'s start with how you load the `ogbn-papers100M` dataset into K\xf9zu. 
You will first need to define a `paper` NODE TABLE and a `cites` REL TABLE, whose schemas exactly follow the structure of the dataset, and then use `COPY FROM` statements in K\xf9zu\'s version of Cypher to ingest those numpy and csv files into your `paper` and `cites` tables:\\n\\n```\\n...\\nimport kuzu\\nimport numpy as np\\n...\\n\\nprint(\\"Creating an empty K\xf9zu database under the papers100M directory...\\")\\ndb = kuzu.Database(\'papers100M\')\\nconn = kuzu.Connection(db, num_threads=cpu_count())\\nprint(\\"Creating K\xf9zu tables...\\")\\nconn.execute(\\n \\"CREATE NODE TABLE paper(id INT64, x FLOAT[128], year INT64, y FLOAT, \\"\\n \\"PRIMARY KEY (id));\\")\\nconn.execute(\\"CREATE REL TABLE cites (FROM paper TO paper, MANY_MANY);\\")\\nprint(\\"Copying nodes to K\xf9zu tables...\\")\\nconn.execute(\'COPY paper FROM (\\"%s\\", \\"%s\\", \\"%s\\", \\"%s\\") BY COLUMN;\' %\\n (\'./ids.npy\', \'./node_feat.npy\', \'./node_year.npy\', \'./node_label.npy\'))\\nprint(\\"Copying edges to K\xf9zu tables...\\")\\nconn.execute(\'COPY cites FROM \\"%s\\";\' % (\'./edge_index.csv\'))\\nprint(\\"All done!\\")\\n```\\n\\nThe one important note here is that you should store your node features using [K\xf9zu\'s FIXED-LIST data type](https://kuzudb.com/docs/cypher/data-types/list.html) with the `FLOAT[128]` syntax (instead of the less efficient VAR-LIST data type, which uses the `FLOAT[]` syntax for lists that can have different lengths). FIXED-LIST is a data type that we specifically added to K\xf9zu to efficiently store node features and embeddings in graph ML applications.\\n\\n## Step 2: Get K\xf9zu Remote Backend by Calling `db.get_torch_geometric_remote_backend()`\\n\\nAfter loading your data into K\xf9zu, the only thing you have to do is call the `get_torch_geometric_remote_backend()` function on your Database object `db`:\\n\\n```\\nfeature_store, graph_store = db.get_torch_geometric_remote_backend(multiprocessing.cpu_count())\\n```\\n\\nThis function returns two objects that implement PyG\'s Remote Backend interfaces: (i) `feature_store` is an instance of [`torch_geometric.data.FeatureStore`](https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.data.FeatureStore.html#torch_geometric.data.FeatureStore); and (ii) `graph_store` is an instance of [`torch_geometric.data.GraphStore`](https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.data.GraphStore.html#torch_geometric.data.GraphStore). These two handles are your K\xf9zu Remote Backends that you can pass to your PyG models/subgraph samplers, and they will make your existing PyG models work seamlessly with K\xf9zu! That\'s all\\nyou really have to know about how to use K\xf9zu as a Remote Backend. ***There are no more K\xf9zu functions you have to call in the rest of the demonstration. You only have\\nto change 1 line of code in your regular PyG code.***\\nThe rest of the example contains standard code you normally write to develop your PyG models.\\n\\n## Step 3: Define & Pass K\xf9zu\'s `feature_store` and `graph_store` to your GNN Model\\n\\nFirst, we\'ll define the GraphSage model in PyG. We\'ll put `...`\'s here and there to shorten the example because, as we said above, this is your regular PyG code:\\n\\n```\\n# Define the model for training. 
The model is ported from\\n# https://github.com/mengyangniu/ogbn-papers100m-sage\\nclass SAGE(nn.Module):\\n def __init__(self, in_feats, n_hidden, n_classes, n_layers, activation,\\n dropout):\\n super().__init__()\\n self.n_layers = n_layers\\n ...\\n\\n def forward(self, edge_list, x):\\n ...\\n for layer_index, layer in enumerate(self.layers):\\n ...\\n return self.mlp(collect)\\n```\\n\\nNext, we will enable PyG to use K\xf9zu\'s Remote Backend when training. We create a [`torch_geometric.loader.NeighborLoader`](https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/loader/neighbor_loader.html), which is the subgraph sampler we will use, and pass the `feature_store` and `graph_store` we obtained from K\xf9zu to it. ***This is the 1-line change you have to make!***\\n\\n```\\n# Plug the graph store and feature store into the NeighborLoader\\nkuzu_sampler = NeighborLoader(\\n data=(feature_store, graph_store),\\n num_neighbors={(\'paper\', \'cites\', \'paper\'): [12, 12, 12]},\\n batch_size=LOADER_BATCH_SIZE,\\n input_nodes=(\'paper\', input_nodes),\\n num_workers=4,\\n filter_per_worker=False,\\n)\\n```\\n\\n**`data=(feature_store, graph_store)`** is the important line. When you use this sampler in training to construct mini-batches, it will perform subgraph sampling and load the required node features from K\xf9zu automatically and return a [`torch_geometric.data.HeteroData`](https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.data.HeteroData.html) object, which can be directly plugged into a GNN model. That training code looks like this (again abbreviated because this is all PyG code):\\n\\n```\\nmodel = SAGE(128, 1024, 172, 3, torch.nn.functional.relu, 0.2)\\n...\\noptimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)\\ncriterion = torch.nn.CrossEntropyLoss()\\n\\nfor epoch in range(NUM_EPOCHS):\\n i = 0\\n start_time = time.time()\\n # The below for loop is where we ask the sampler to\\n # sample a mini-batch\\n for b in kuzu_sampler:\\n x = b[\'paper\'][\'x\']\\n y = b[\'paper\'][\'y\']\\n edge_index = b[\'paper\', \'cites\', \'paper\'].edge_index\\n ...\\n model.train()\\n optimizer.zero_grad()\\n out = model(edge_index, x)\\n loss = criterion(out, y)\\n loss.backward()\\n optimizer.step()\\n ...\\n i += 1\\n```\\n\\n`for b in kuzu_sampler:` is the exact line where the sampler will end up calling on K\xf9zu to sample a subgraph and scan the features of the nodes in that subgraph. This all ends up using K\xf9zu\'s disk-based storage, allowing you to train GNNs on graphs that don\'t fit in your RAM. One distinct advantage of K\xf9zu is that, because it is an embeddable DBMS, \\nwe can do the conversion of scanned node features from K\xf9zu into PyG\'s tensors as a zero-copy operation. We simply write the scanned node features into a buffer array allocated in Python without any additional data transfer between the systems.\\n\\nCurrently, only the `feature_store` scans data from K\xf9zu\'s disk-based storage. For `graph_store`, our current implementation stores the entire graph topology in COO format in memory. 
This does limit how much you can scale, but in many models trained on large graphs, features take up more space than the graph topology, so scaling node features out of memory should still allow you to scale to very large graphs that won\'t fit in your RAM.\\n\\n### Adjusting K\xf9zu\'s Buffer Pool Size\\n\\nAs with most DBMSs, K\xf9zu has a Buffer Manager that maintains a buffer pool to keep parts of the database in memory. When you use K\xf9zu, you decide how much memory to allocate to it. The more memory you give to K\xf9zu, the less I/O it will perform on scans. So, in the context of this post, the larger the buffer manager size you set, the faster your training time will be when training on large graphs out of memory. You set K\xf9zu\'s buffer pool size when you construct your `Database` object, before you call the `get_torch_geometric_remote_backend()` function. For example, the code below sets the BM size to `40 * 1024**3` bytes, which is equal to 40GB. For performance, you should set it as high as possible without running out of memory.\\n\\n```\\nKUZU_BM_SIZE = 40 * 1024**3\\n# Create kuzu database\\ndb = kuzu.Database(\\"papers100M\\", KUZU_BM_SIZE)\\nfeature_store, graph_store = db.get_torch_geometric_remote_backend(\\n mp.cpu_count())\\n```\\n\\n## An Experiment Demonstrating Throughput Numbers With Different Buffer Pool Sizes\\n\\nLet\'s demonstrate what throughput numbers you can expect under different memory settings.\\nAs a baseline, we will first measure the throughput of training\\nas time/batch using PyG\'s default in-memory\\nstorage. This setting uses ~106GB of memory.\\nWe will then simulate limited-memory settings by training the same\\nmodel using the K\xf9zu Remote Backend and limiting K\xf9zu\'s buffer pool size to\\ndifferent levels.\\nHere are the important configurations for the experiment:\\n\\n- Available RAM in the machine: 384GB\\n- CPU: Two Xeon Platinum 8175M (48 cores/96 threads)\\n- GPU: RTX 4090 with 24GB GPU memory\\n- SSD in the system for disk storage: 2TB Kingston KC3000 NVMe SSD\\n- Mini-batch size: 1152. Recall the `kuzu_sampler = NeighborLoader(...)` that we defined above. There we gave the argument\\n `num_neighbors={(\'paper\', \'cites\', \'paper\'): [12, 12, 12]}` to the `NeighborLoader`, which means that the sampler will sample 3-degree neighbors of these 1152 nodes,\\n sampling 12 neighbors at each degree.\\n We picked 1152 as our mini-batch size because this is the size at which we generate batches that take a peak of 23GB of GPU memory; beyond this we would run out of GPU memory. [^1]\\n- \\\\# PyG Workers: 16 (we did a parameter sweep, and settings of 4, 8, and 16 perform very similarly)\\n- \\\\# K\xf9zu Query Processor Threads: 24 (48 and 96 also perform similarly)\\n\\nWe will run K\xf9zu with 60GB, 40GB, 20GB, and 10GB buffer pool sizes.\\nThe lower K\xf9zu\'s buffer pool size, the more\\ndisk I/O K\xf9zu will perform. Note however that in this experiment K\xf9zu will use more memory than\\nthese sizes for two reasons: (i) K\xf9zu always stores some parts of the database in memory,\\nthough this is not very important in this setting; (ii) as we said, the current\\nK\xf9zu Remote Backend uses in-memory storage for the graph topology (but not the node features!),\\nwhich takes ~48GB of RAM. 
So you can roughly think of K\xf9zu using 48GB + the BM size in these experiments.\\n\\nWe will do 500 batches of training and report the throughput number as the average end-to-end time/batch.\\nWe also report the time spent on the GPU for training as `Training Time (s)` and the\\ntime spent on copying data from CPU to GPU as `CPU-to-GPU Copying Time (s)`. For the\\nK\xf9zu configurations, you can roughly\\ninterpret `Per Batch Time (s) - Training Time (s) - CPU-to-GPU Copying Time (s)`\\nas the time spent scanning data from K\xf9zu into the CPU\'s memory. We expect that to increase\\nas we lower the BM size.\\n\\n| Configuration | Per Batch Time (s) | Training Time (s) | CPU-to-GPU Copying Time (s) | Time Scanning Data from K\xf9zu (s) | Memory Usage |\\n| ----------------------------- | ------------------ | ----------------- | --------------------------- | -------------------------------- | ------------ |\\n| PyG In-memory | 0.281 | 0.240 | 0.024 | --- | ~110 GB |\\n| K\xf9zu Remote Backend (bm=60GB) | 0.380 (1.35x) | 0.239 | 0.018 | 0.123 | ~110 GB |\\n| K\xf9zu Remote Backend (bm=40GB) | 0.513 (1.82x) | 0.239 | 0.022 | 0.251 | ~90 GB |\\n| K\xf9zu Remote Backend (bm=20GB) | 1.162 (4.88x) | 0.238 | 0.022 | 0.901 | ~70 GB |\\n| K\xf9zu Remote Backend (bm=10GB) | 1.190 (4.23x) | 0.238 | 0.022 | 0.930 | ~60 GB |\\n\\nSo, when we have enough memory, there is about a 1.35x slowdown (from 0.281s to 0.380s per batch)\\ncompared to using PyG\'s default storage. This\\nis the case when K\xf9zu has enough buffer memory (60GB) to store the features, but we still incur the cost of\\nscanning them through K\xf9zu\'s buffer manager. So no disk I/O happens (except the first time\\nthe features are scanned into the buffer manager). When we use a 40GB buffer pool and below, we start doing some I/O,\\nand the average time per batch degrades to 0.513s, 1.162s, and 1.190s when using 40GB, 20GB, and 10GB, respectively.\\nWe seem to stabilize around a 4x degradation at the 10GB and 20GB levels, where most of the feature scans\\nare now happening from disk. These numbers hopefully look good for many settings!\\n\\n## Next Steps\\n\\nWe will be making 2 immediate optimizations in the next few releases\\nrelated to K\xf9zu\'s PyG integration.\\nFirst, we will change our `graph_store` to use an in-DBMS subgraph sampler, so we can work at virtually any memory level.\\nSecond, an even earlier release had a more basic PyG integration feature, the\\n[`QueryResult.get_as_torch_geometric()`](https://kuzudb.com/docs/client-apis/python-api/query-result.html#query_result.QueryResult.get_as_torch_geometric) function.\\nThis is more of an ETL feature. It is designed for cases where you want to filter\\na subset of your nodes and edges and convert them directly into PyG `HeteroData` objects (i.e., use PyG\'s default in-memory storage)\\nas you build PyG pipelines using graph databases you store in K\xf9zu.\\nIf you are converting a large graph, this can be quite slow, and we will be improving it so that such ETL pipelines\\nare much faster!\\n\\nWe are excited to hear your feedback on K\xf9zu\'s PyG integration features and to get more ideas about\\nhow else we can help users who are building GNN pipelines. 
Please reach out to us over [K\xf9zu Slack](https://join.slack.com/t/kuzudb/shared_invite/zt-1w0thj6s7-0bLaU8Sb~4fDMKJ~oejG_g)\\nwith your questions and ideas!\\n\\n[^1]:\\n If you read our [v0.0.3 blog post](https://kuzudb.com/blog/kuzu-0.0.3-release.html#k%C3%B9zu-as-a-pyg-remote-backend),\\n which had a shorter section about the PyG interface, you will notice that we used a much larger batch size there (48000),\\n which was the size that saturated GPU memory. Although the example there was also on the `ogbn-papers100M` dataset, we used a much smaller model with ~200K parameters\\n and sampled subgraphs from 2-degree neighbors of these batches. Now we use a much larger model with 5.6 million parameters and sample from 3-degree neighbors."},{"id":"kuzu-0.0.3-release","metadata":{"permalink":"/docusaurus/blog/kuzu-0.0.3-release","source":"@site/blog/2023-04-06-kuzu-v-0.0.3.md","title":"K\xf9zu 0.0.3 Release","description":"We are happy to release K\xf9zu 0.0.3 today. This release comes with the following new main features and improvements:","date":"2023-04-06T00:00:00.000Z","formattedDate":"April 6, 2023","tags":[{"label":"release","permalink":"/docusaurus/blog/tags/release"}],"readingTime":10.44,"hasTruncateMarker":true,"authors":[{"name":"K\xf9zu Team","url":"https://github.com/kuzudb/","imageURL":"https://kuzudb.com/img/blog/team.jpg","key":"team"}],"frontMatter":{"slug":"kuzu-0.0.3-release","authors":["team"],"tags":["release"]},"prevItem":{"title":"Scaling Pytorch Geometric GNNs With K\xf9zu","permalink":"/docusaurus/blog/kuzu-pyg-remote-backend"},"nextItem":{"title":"Why (Graph) DBMSs Need New Join Algorithms: The Story of Worst-case Optimal Join Algorithms","permalink":"/docusaurus/blog/wcoj"}},"content":"We are happy to release K\xf9zu 0.0.3 today. This release comes with the following new main features and improvements:\\n- [K\xf9zu as a Pytorch Geometric (PyG) Remote Backend](2023-04-06-kuzu-v-0.0.3.md#k\xf9zu-as-a-pyg-remote-backend): You can now train PyG GNNs and other models directly using graphs (and node features) stored in K\xf9zu. See this [Colab notebook](https://colab.research.google.com/drive/12fOSqPm1HQTz_m9caRW7E_92vaeD9xq6)\\nfor a demonstrative example. \\n- [Data ingestion from multiple files and numpy files](2023-04-06-kuzu-v-0.0.3.md#data-ingestion-improvements): See below for details.\\n- [Query optimizer improvements](2023-04-06-kuzu-v-0.0.3.md#query-optimizer-improvements): See below for details.\\n- [New buffer manager](2023-04-06-kuzu-v-0.0.3.md#new-buffer-manager): A new state-of-the-art buffer manager based on [VMCache](https://www.cs.cit.tum.de/fileadmin/w00cfj/dis/_my_direct_uploads/vmcache.pdf).\\n- [INT32, INT16, FLOAT, and FIXED-LIST data types](2023-04-06-kuzu-v-0.0.3.md#new-data-types) (the latter is particularly suitable for storing node features in graph ML applications).\\n- [Query timeout mechanism and interrupting queries from the CLI](2023-04-06-kuzu-v-0.0.3.md#other-system-functionalities).\\n\\n\x3c!--truncate--\x3e\\n\\nFor installing the new version, \\nplease visit the [download section of our website](https://kuzudb.com/#download) \\nand the [getting started guide](https://kuzudb.com/docs/getting-started.html); the full\\n[release notes are here](https://github.com/kuzudb/kuzu/releases). Please also visit\\nthe [Colab Notebooks](https://kuzudb.com/docs/getting-started/colab-notebooks) section of our\\ndocumentation website to play with our notebooks.\\n\\n\\nEnjoy! 
Please give us a try, [a GitHub \u2b50](https://github.com/kuzudb/kuzu), and send us your feedback and feature requests! Also follow\\nus on [Twitter](https://twitter.com/kuzudb)!\\n\\n## K\xf9zu as a PyG Remote Backend\\nK\xf9zu now implements PyG\'s Remote Backend interface, so you can directly \\ntrain GNNs using K\xf9zu as your backend storage. Quoting the [PyG documentation\'s](https://pytorch-geometric.readthedocs.io/en/latest/advanced/remote.html) description\\nof the Remote Backend feature:\\n\\n> ...[this feature enables] users to train GNNs on graphs far larger than the size of their\\nmachine\u2019s available memory. It does so by introducing simple, easy-to-use, and extensible abstractions of a `torch_geometric.data.FeatureStore` and a `torch_geometric.data.GraphStore` that plug directly into existing familiar PyG interfaces.\\n\\nWith our current release, once you store your graph and features in K\xf9zu,\\nPyG\'s samplers work seamlessly using K\xf9zu\'s implementation of the `FeatureStore` and `GraphStore` interfaces. For example, \\nthis enables your existing GNN models to work seamlessly by fetching both subgraph samples and node features\\nfrom K\xf9zu instead of PyG\'s in-memory storage. \\nTherefore you can train on graphs that do not\\nfit into your memory, since K\xf9zu, as a DBMS, stores its data on disk. Try this demonstrative [Colab notebook](https://colab.research.google.com/drive/12fOSqPm1HQTz_m9caRW7E_92vaeD9xq6) to \\nsee an example of how to do this. The current release comes with the limitation that we only truly implement the `FeatureStore` interface.\\nInside `GraphStore` we still store the graph topology in memory. \\nSo in reality only the features are stored and scanned from disk. We plan to address this limitation later on.\\n\\nHere is also a demonstrative experiment (but certainly not a comprehensive study) showing the type of training performance \\nvs memory usage tradeoff you can expect. \\nWe trained a simple 3-layer Graph Convolutional Network (GCN) model on the [ogbn-papers100M](https://ogb.stanford.edu/docs/nodeprop/#ogbn-papers100M) dataset, which contains about 111 million nodes\\nwith 128-dimensional node features and about 1.6 billion edges. \\nStoring the graph topology takes around 48GB[^1] and the features take 53GB. Given our current limitation,\\nit is this 53GB that we can reduce to something much smaller (we will limit it to as low as 10GB).\\nWe used a machine with one RTX 4090 GPU with 24 GB of memory, two Xeon Platinum 8175M CPUs, and 384 GB RAM, which \\nis enough for PyG\'s in-memory store to store the entire graph and all features in memory.\\n\\nDuring training, we use the `NeighborLoader` of PyG with a batch size of 48000 and set `num_neighbors` to `[30] * 2`, which means that at each batch, the neighbors of the 48000 nodes will be sampled at 2 hops, 30 neighbors at each hop, from the `GraphStore`, and the features of those nodes will be scanned\\nfrom K\xf9zu\'s storage. We picked this sample size because it gives us a peak GPU memory usage of approximately 22 GB, i.e.,\\nwe can saturate the GPU memory. We used 16 cores[^2] during the sampling process. 
We ran each experiment in a Docker instance\\nand limited the memory systematically from 110GB, which is enough for PyG to run completely in memory, down to 90, 70, and 60GB.\\nAt each memory level we ran the same experiment using K\xf9zu as a Remote Backend, where we \\nhave to use about 48GB to store the topology and give the remaining memory to K\xf9zu\'s buffer manager.\\nFor example, when the memory is 60GB, we can only give ~10GB to K\xf9zu.\\n\\n| Configuration | End to End Time (s) | Per Batch Time (s) | Time Spent on Training (s) | Time Spent on Copying to GPU (s) | Docker Memory | \\n|-------------------------------|-----------------|-----------------|------------------------|------------------------------|-------------|\\n| PyG In-memory | 140.17 | 1.4 | 6.62 | 31.25 | 110 GB |\\n| K\xf9zu Remote Backend (bm=60GB) | 392.6 | 3.93 | 6.29 | 34.18 | 110 GB | \\n| K\xf9zu Remote Backend (bm=40GB) | 589.0 | 5.89 | 6.8 | 32.6 | 90 GB | \\n| K\xf9zu Remote Backend (bm=20GB) | 1156.1 | 11.5 | 6.0 | 36 | 70 GB | \\n| K\xf9zu Remote Backend (bm=10GB) | 1121.92 | 11.21 | 6.88 | 35.03 | 60 GB | \\n\\nSo, when we have enough memory, there is about a 2.8x slowdown (from 1.4s to 3.93s per batch). This\\nis the case when K\xf9zu has enough buffer memory (60GB) to store the 53GB of features, but we still incur the cost of \\nscanning them through K\xf9zu\'s buffer manager. So no or very little disk I/O happens (except the first time\\nthe features are scanned into the buffer manager). Then, as we lower the memory, K\xf9zu can hold only part \\nof the node features in its buffer manager, so\\nwe force K\xf9zu to do more and more I/O. The per-batch time increases to 5.89s at a 40GB buffer manager size, \\nthen seems to stabilize around 11s (so around an 8.2x slowdown). \\n\\nThe slowdown is smaller if you use smaller batch sizes, but for end-to-end training time, you\\nshould probably still prefer larger batch sizes. This is a place where we would need to\\ndo more research to see how much performance is on the table with further optimizations.\\n\\nBut in summary, if you have \\nlarge datasets that don\'t fit in your current system\'s memory and would like to easily train your PyG models \\noff of disk (plus get all the usability features of a GDBMS as you prepare your datasets for training), \\nthis feature can be very useful for you!\\n\\n## Data Ingestion Improvements\\n\\n**Ingest from multiple files**: You can now load data from multiple files of the same type into a node/rel table in two ways:\\n - **file list**: `[\\"vPerson0.csv\\", \\"vPerson1.csv\\", \\"vPerson2.csv\\"]`\\n - **glob pattern matching**: Similar to Linux [Glob](https://man7.org/linux/man-pages/man7/glob.7.html), this will load the files that match the glob pattern.\\n\\n**Ingest from npy files**: We have started exploring how to enable data ingestion in a column-by-column fashion. 
Consider a `Paper` table defined in the following DDL.\\n```\\nCREATE NODE TABLE Paper(id INT64, feat FLOAT[768], year INT64, label DOUBLE, PRIMARY KEY(id));\\n```\\nSuppose your raw data is stored in npy format, where each column is represented as a numpy array on disk:\\n\\"node_id.npy\\", \\"node_feat_f32.npy\\", \\"node_year.npy\\", \\"node_label.npy\\".\\nYou can now directly copy from npy files, where each file is loaded into a column of the `Paper` table, as follows:\\n```\\nCOPY Paper FROM (\\"node_id.npy\\", \\"node_feat_f32.npy\\", \\"node_year.npy\\", \\"node_label.npy\\") BY COLUMN;\\n```\\n\\n**Reduced memory consumption when ingesting data into node tables:**\\nThis release further optimizes the memory consumption of data ingestion into node tables.\\nWe no longer keep the whole node table in memory before flushing it to disk as a whole. Instead, we process a chunk of a file\\nand flush its corresponding pages, so we incur only the memory cost of ingesting a chunk (or as many chunks as there are threads running).\\nThis greatly reduces memory usage when the node table is very large.\\n\\n## Query Optimizer Improvements\\n\\n**Projection push-down for sink operators**:\\nWe now push projections down to the first sink operator \\nabove the last point in a query plan where they are needed.\\nConsider the following query:\\n```\\nMATCH (a:person) WHERE a.age > 35 RETURN a.salary AS s ORDER BY s;\\n```\\nThis query\'s (simplified) plan is: `Scan->Filter->OrderBy->ResultCollector`, where both \\n`ORDER BY` and the final `ResultCollector` are sink operators. \\n`ResultCollector` is where we accumulate the expressions in the `RETURN` clause. \\nThis is simplified because `ORDER BY` actually consists of several physical operators. \\nBoth the `age` and `salary` columns are scanned initially, but only `salary` is needed in the `ResultCollector`. \\n`age`, which is needed by `Filter`, is projected out in the `ResultCollector`. We now push the projection of `age`\\ndown to `ORDER BY`, so `ORDER BY` does not have to materialize it.\\n\\n**Other optimizations:** We implemented several other optimizations: we reorder filter expressions so that equality conditions\\nare evaluated first, we made several improvements to the cardinality estimator, and we improved sideways information passing for joins. For the latter, \\nin our core join operator, which we called ASP-Joins in our [CIDR paper](https://www.cidrdb.org/cidr2023/papers/p48-jin.pdf), we would blindly\\nperform sideways information passing (sip) from build to probe (or vice versa; \\nsee [our paper](https://www.cidrdb.org/cidr2023/papers/p48-jin.pdf) for details). Sometimes, if there are no \\nfilters on the probe and build sides, this is just overhead, as it won\'t decrease the number of scans on either side. \\nIn cases where we think sip won\'t help reduce scans, we now do vanilla hash joins.\\n\\n## New Buffer Manager\\n\\nBefore this release, we had two internal buffer pools with 2 different frame sizes of 4KB and 256KB,\\nso operators could only grab buffers of these two sizes. Plus, when you loaded your DB and wanted to allocate,\\nsay, a 10GB buffer pool, we manually gave a fixed percentage to the 4KB pool and the rest to the 256KB pool. \\nThis gave no flexibility for storing large objects and complicated the code that manages \\nbuffers when operators need them. Terrible design; \\njust don\'t do this!\\n\\nWe bit the bullet and decided to read the literature and pick a state-of-the-art buffer manager design that is\\nalso practical. 
We switched to the mmap-based approach described in the VMCache design from [this recent paper](https://www.cs.cit.tum.de/fileadmin/w00cfj/dis/_my_direct_uploads/vmcache.pdf) by Leis et al. \\nThis is a very nice design: \\nit makes supporting multiple buffer sizes easy and uses only hardware locks (we used \\nsoftware locks in our previous buffer manager). It also supports optimistic reading,\\nwhich we verified improves our query performance a lot.\\n\\n## New Data Types\\n\\nWe now support several additional data types that were missing.\\n\\n**[FIXED-LIST](https://kuzudb.com/docs/cypher/data-types/list.html) data type:** This is important if you\'re doing graph ML and storing node features\\nin K\xf9zu. It is an efficient way to store fixed-length vectors. Here\'s a summary of how\\nto declare a node or rel property in your schemas using the fixed-list data type.\\n\\n| Data Type | Description | DDL definition |\\n| --- | --- | --- | \\n| FIXED-LIST | a list with a fixed number of values of the same numeric type | INT64[8] |\\n\\nWhen you know the size of your lists/vectors, use FIXED-LIST instead of the regular [VAR-LIST](https://kuzudb.com/docs/cypher/data-types/list.html) data type\\nwhenever possible. It\'s much more efficient.\\n\\nNote that FIXED-LIST is an experimental feature. Currently, only bulk loading (e.g., via the `COPY` statement) and reading are supported.\\n\\n**INT32, INT16, FLOAT data types:** The release also comes with support for the following data types:\\n\\n| Data Type | Size | Description |\\n| --- | --- | --- |\\n| INT32| 4 bytes | signed four-byte integer |\\n| INT16| 2 bytes | signed two-byte integer |\\n| FLOAT | 4 bytes | single precision floating-point number |\\n\\nFor our next release, our focus on data types will be on complex ones, STRUCT and MAP. So stay tuned for those!\\n\\n## Other System Functionalities\\n\\n**Query timeout**: We will now automatically stop any query that exceeds a specified timeout value (if one is set). \\nThe default query timeout value is -1, which means the query timeout feature is disabled by default. \\nYou can activate the query timeout by configuring a positive timeout value through:\\n - C++ API: `Connection::setQueryTimeOut(uint64_t timeoutInMS)`\\n - CLI: `:timeout [timeoutValue]`\\n\\n**Interrupt:** You can also manually interrupt and stop your long-running queries. There\\nare two ways to do this:\\n - C++ API: `Connection::interrupt()`: interrupts all running queries within the current connection.\\n - CLI: interrupt through `CTRL + C`\\n\\nNote: The Interruption and Query Timeout features are not applicable to `COPY` commands in this release.\\n\\n[^1]: Internally, PyG converts the edge list to CSC format for sampling, which duplicates the graph structures in memory. When you download the graph topology, it actually takes about 24GB.\\n[^2]: We set `num_workers` to 16 when running the PyG in-memory setup. 
Since K\xf9zu does not currently work with multiple workers in Python, we limit `num_workers` to 1 when sampling from K\xf9zu, but internally K\xf9zu scans in parallel with 16 threads."},{"id":"wcoj","metadata":{"permalink":"/docusaurus/blog/wcoj","source":"@site/blog/2023-02-22-wcoj/index.md","title":"Why (Graph) DBMSs Need New Join Algorithms: The Story of Worst-case Optimal Join Algorithms","description":"Joins of sets of records are objectively the most expensive operation in DBMSs.","date":"2023-02-22T00:00:00.000Z","formattedDate":"February 22, 2023","tags":[{"label":"internals","permalink":"/docusaurus/blog/tags/internals"}],"readingTime":20.76,"hasTruncateMarker":true,"authors":[{"name":"Semih Saliho\u011flu","title":"CEO of K\xf9zu Inc & Associate Prof. at UWaterloo","url":"https://cs.uwaterloo.ca/~ssalihog/","imageURL":"https://kuzudb.com/img/blog/semih.jpg","key":"semih"}],"frontMatter":{"slug":"wcoj","authors":["semih"],"tags":["internals"]},"prevItem":{"title":"K\xf9zu 0.0.3 Release","permalink":"/docusaurus/blog/kuzu-0.0.3-release"},"nextItem":{"title":"K\xf9zu 0.0.2 Release","permalink":"/docusaurus/blog/kuzu-0.0.2-release"}},"content":"import WcojRunningExDataImage from \'./wcoj-running-ex-data.png\';\\nimport WcojEdgeCoversImage from \'./wcoj-edge-covers.png\';\\nimport WcojGjSimulationImage from \'./wcoj-gj-simulation.png\';\\nimport WcojKuzuMultiwayHashJoinImage from \'./wcoj-kuzu-multiway-hash-join.png\';\\nimport Wcoj4CliqueImage from \'./wcoj-4-clique.png\';\\n\\n\\nJoins of sets of records are objectively the most expensive operation in DBMSs.\\nIn my previous post on [factorization](../2023-01-20-factorization/index.md), I said that in the field of databases, once \\nin a while you run into a very simple idea that deviates from the norm and gets you very excited. \\nToday, I will discuss another such idea, worst-case optimal join (wcoj) algorithms. \\nWcoj algorithms and the theory around them, in one sentence, say this:\\n\\n - Queries involving complex \\"cyclic joins\\" over many-to-many relationships should be \\n evaluated column at a time instead of table at a time, which is the norm. \\n\\n\\nWcoj algorithms find their best applications when finding cyclic patterns on graphs, \\nsuch as cliques or cycles, which is common in the workloads of fraud detection and\\nrecommendation applications. As such, they should be integrated into every graph DBMS \\n(and possibly into RDBMSs) and I am convinced that they eventually will be.\\n\\n\x3c!--truncate--\x3e\\n\\n:::tip Tldr: The key takeaways are:\\n- **History of Wcoj Algorithms:** Research on wcoj algorithms started with a solution to an open question \\n about the maximum output sizes of join queries. This result made researchers realize this: the traditional \\n \\"binary join plans\\" paradigm of generating query plans that join 2 tables at a time\\n until all of the tables in the query are joined is provably\\n suboptimal for some queries. 
Specifically, when join queries are\\n cyclic, which in graph terms means when the searched graph pattern has cycles\\n in it, and the relationships between records are many-to-many, then this \\n paradigm can generate unnecessarily large amounts of intermediate results.\\n- **Core Algorithmic Step of Wcoj Algorithms:** Wcoj algorithms fix this sub-optimality by \\n performing the joins one column at a time (instead of 2 tables at a time) using multiway intersections.\\n- **How K\xf9zu Integrates Wcoj Algorithms:** K\xf9zu generates plans that seamlessly mix binary joins \\n and wcoj-style multiway intersections. Multiway intersections are performed by an operator called \\n \\"Multiway HashJoin\\", which has one or more build phases that create one or more hash tables storing\\n sorted adjacency lists, and a probe phase that performs multiway intersections using the sorted lists.\\n- **Yes, the Term \\"Worst-case Optimal\\" Is Confusing Even to Don Knuth:** I know, Don Knuth also found the term\\n \\"worst-case optimal\\" a bit confusing. See my [anecdote on this](#a-thank-you--an-anecdote-about-knuths-reaction-to-the-term-worst-case-optimal). \\n It basically means that the worst-case runtimes of these algorithms are asymptotically optimal.\\n:::\\n\\n## Joins, Running Example & Traditional Table-at-a-time Joins\\nJoins are objectively the most expensive and powerful operation in DBMSs.\\nIn SQL, you indicate them in the FROM clause by listing\\na set of table names; in Cypher, in the MATCH clause, where you draw a graph pattern\\nthat describes how to join node records with each other.\\nAs a running example, consider a simple social network of users and followers, \\nwhose node-link diagram is shown below. I am also showing the tables that contain these records: \\na `User` table (ignore the `name` property for now) and a `Follows` table.\\n\\n
*(Figure 1: the node-link diagram of the running example social network and its User and Follows tables; image: wcoj-running-ex-data.png)*\\n\\n
\\n\\nConsider finding triangles, which is one of the simplest \\nforms of cycles and cliques, in this network. The SQL and Cypher \\nversions of this query are shown below. \\n\\n```\\nSQL:\\nSELECT *\\nFROM Follows f1, Follows f2, Follows f3\\nWHERE f1.dst=f2.src AND f2.dst=f3.src AND\\n f3.dst = f1.src\\n\\nCypher:\\nMATCH (a:User)-[f1:Follows]->(b:User)-[f2:Follows]->(c:User)-[f3:Follows]->(a)\\nRETURN *\\n```\\nThat long MATCH clause \\"draws\\" a triangle, and for our case here, this is equivalent\\nto joining three copies of the Follows table. \\n\\nNow, ever since the System R days and [Patricia Selinger\'s 1979 seminal paper](https://courses.cs.duke.edu/compsci516/cps216/spring03/papers/selinger-etal-1979.pdf) that \\ndescribed how System R compiled and optimized SQL queries, there has been an \\nunchallenged dogma in DBMSs that the joins specified in the query would be \\nevaluated pairwise, table at a time. \\nHere\'s a blurb from Selinger\'s paper, where one can see this \\nassumption: \\n\\"*In System R a user need not know how the\\ntuples are physically stored ... Nor does a user \\nspecify in what order joins are to be performed. The System\\nR optimizer chooses both join order and ...*\\"\\nTo this day, this is the norm. DBMSs pick a \\"join order\\", which is the order in \\nwhich the tables should be joined iteratively, 2 at a time. \\nIn our triangle example, \\nthere are three possible join orders. One way to represent these orders is by \\nwriting different parenthesizations of the joins: \\n- (i) $((F1 \\\\bowtie F2) \\\\bowtie F3)$; (ii) $(F1 \\\\bowtie (F2 \\\\bowtie F3))$; \\n and (iii) $((F1 \\\\bowtie F3) \\\\bowtie F2)$. \\n\\nThe optimization problem for a system is of course more complex than just \\nordering tables, because the system also has to choose which\\nbinary join algorithm to use when joining each pair of tables, e.g., hash joins vs merge joins. \\nBut take any system you want, and it will follow the same paradigm of \\njoining 2 base or intermediate tables iteratively, until all tables are joined: \\nhence the term *binary joins* to describe the plans of existing systems.\\n\\n\\n## A Math Puzzle That Started it All \\n\\nSo, what\'s the problem with binary join plans? When join queries are cyclic\\nand the relationships are many-to-many, they can generate provably large amounts\\nof (and so, in a formal sense, unnecessary) intermediate results. First, cyclicity for\\njoin queries has formal (and a bit intimidating) definitions, but if you think of\\ngraph patterns, it simply means that the searched pattern\'s undirected version has\\ncycles. Why do binary joins generate unnecessarily large intermediate results? I\'ll\\nget to this below, but first a bit of history on the origins of this insight.\\nThe whole topic of \\"worst-case optimal joins\\" started with 2 papers, a [2007 SODA](https://arxiv.org/abs/1711.04506) \\nand a [2008 FOCS](https://arxiv.org/abs/1711.03860) \\npaper, which are top venues in algorithms and theory. In these papers,\\nseveral theoreticians solved a fundamental open question \\nabout join queries. Suppose I give you:\\n\\n1. An arbitrary natural join query, say of $m$ relations. In DBMS literature we denote such \\n queries as $Q=R_1(a_{11}, ..., a_{1r_1}) \\\\bowtie ... \\\\bowtie R_m(a_{m1}, ..., a_{mr_m})$.\\n2. Sizes of $R_1, ..., R_m$, e.g., for simplicity assume they all have $IN$ many tuples. \\n\\n\\"Natural\\" here means that the join predicates are equality predicates on identical column \\nnames. 
You, as the second person in this puzzle, are allowed to set the values inside these relations. \\n**The open question was: how large can you make the final output?** So for example, if I told you that there are\\n$IN$ many tuples in the `Follows` table, what is the maximum number of triangle outputs there can be?[^1]\\nEven more concretely for the triangle query, the question is: out of all possible graphs with $IN$ many edges, \\nwhat is the maximum number of triangles they contain?\\n\\n
*(Figure 2: two fractional edge covers of the triangle query, with weights [1, 1, 0] in Fig 2a and [1/2, 1/2, 1/2] in Fig 2b; image: wcoj-edge-covers.png)*\\n\\n
\\n\\nIt still surprises me that the answer to this question was not known until 2008.\\nIt just looks like a fundamental question someone in databases must have answered before. \\nNow excuse me for bombarding your brains with some necessary math definitions.\\nThese two papers showed that the answer is: $IN^{\\\\rho^*}$, where $\\\\rho^*$ is a property \\nof $Q$ called the *fractional edge cover number* of $Q$. \\nThis is the solution to\\nan optimization problem and best explained by thinking about the \\"join query graph\\",\\nwhich, for our purposes, is the triangle graph pattern (ignoring the edge directions), shown\\nin Fig 2a and 2b.\\n\\nThe optimization problem is this: \\nput a weight between [0, 1] on\\neach \\"query edge\\" such that each \\"query node\\" is \\"covered\\", i.e., the sum of\\nthe weights of the query edges touching each query node is at least 1. Each such solution is called an\\nedge cover. The problem is to find the edge cover whose total weight is minimum; that weight is \\ncalled the fractional edge cover number of the query. For the triangle query, \\none edge cover, shown in Fig 2a, is [1, 1, 0], which has\\na total weight of 1 + 1 + 0 = 2. \\nThe minimum-weight edge cover is [1/2, 1/2, 1/2], shown in Fig 2b, \\nwith a total weight of 1.5. Therefore, the fractional edge cover number $\\\\rho^*$\\nof the triangle query is 1.5.\\nIn general, each edge cover gives an upper bound, but the FOCS paper showed\\nthat the fractional edge cover number gives the tight upper bound.\\nSo the maximum number of triangles there can be on a graph with $IN$ edges is $\\\\Theta(IN^{1.5})$, \\nand this is tight, i.e., there are such graphs. Nice scientific progress!\\nNowadays, the quantity $IN^{\\\\rho^*}$ is known as the `AGM bound` of a query,\\nafter the first letters of the last names of the authors of the FOCS paper.\\n\\n\\n## Problem With Table-at-a-time/Binary Joins\\nNow this immediately made the same researchers realize that binary join plans are \\nprovably sub-optimal because they can generate polynomially more intermediate results\\nthan the AGM bound of the query. This happens because on cyclic queries, \\nthe strategy of joining tables\\n2 at a time may lead to unnecessarily computing some acyclic sub-joins. \\nFor example, in the triangle query, the plan\\n$((F1 \\\\bowtie F2) \\\\bowtie F3)$ first computes the $(F1 \\\\bowtie F2)$ sub-join,\\nwhich in graph terms computes the 2-paths in the graph.\\nThis is a problem because often there can be many more of these acyclic sub-joins\\nthan there can be outputs for the cyclic join. \\nFor this plan, there can\\nbe $IN^2$ many 2-paths (which is the AGM bound of 2-paths),\\nwhich is polynomially larger than $IN^{1.5}$. \\nIn our running example, there are 1000\\\\*1000 = 1M many 2-paths,\\nbut on a graph with 2001 edges there can be at most ~89.5K triangles (ours\\nhas only 3: because the triangle query we are using is symmetric, \\nthe sole triangle generates 3 outputs, one for each of its rotations).\\n \\nAny other plan in this case would have generated $IN^2$ many 2-paths, \\nso there is no good binary join plan here. I want to emphasize that this sub-optimality does not occur \\nwhen the queries are acyclic or when the dataset does not have \\nmany-to-many relationships. If the joins were primary-foreign key non-growing joins, \\nthen binary join plans will work just fine. 
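\\n\\nTo make the ~89.5K figure above concrete, here is the arithmetic behind it (a worked instance of the AGM bound, using the $IN = 2001$ edge count of our running example):\\n\\n$$IN^{\\\\rho^*} = 2001^{1.5} = 2001 \\\\cdot \\\\sqrt{2001} \\\\approx 2001 \\\\cdot 44.73 \\\\approx 89{,}500$$\\n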
\\n\\n## Solution: Column-at-a-time \\"Worst-case Optimal\\" Join Algorithms\\n\\nSo the immediate\\nnext question is: are there algorithms whose runtimes can be bounded by \\n$O(IN^{1.5})$? If so, how are they different? The answer to this question\\nis a bit anti-climactic. The core idea existed in the 2007 SODA and 2008 FOCS papers,\\nthough it was refined further ~4 years later in some theoretical papers\\nby [Hung Ngo](https://hung-q-ngo.github.io/), [Ely Porat](https://u.cs.biu.ac.il/~porat/), \\n[Chris R\xe9](https://cs.stanford.edu/~chrismre/), and [Atri Rudra](https://cse.buffalo.edu/faculty/atri/) \\nin the database venues [PODS](https://dl.acm.org/doi/10.1145/2213556.2213565) and \\n[SIGMOD Record](https://dl.acm.org/doi/10.1145/2590989.2590991). The answer is simply\\nto perform the join column at a time, using multiway \\nintersections. \\"Intersections of what?\\" you should be asking. \\nFor joins over arbitrary relations, we need special indices, but I want to\\nskip this detail.\\nGDBMSs already\\nhave join indices (aka adjacency list indices), and for the common joins\\nthey perform, these will be enough for our purposes.\\n\\nI will next demonstrate a wcoj \\nalgorithm known as \\"Generic Join\\" from the [SIGMOD Record paper](https://dl.acm.org/doi/10.1145/2590989.2590991). \\nIt can be seen as the simplest of all wcoj algorithms.\\nAs the \\"join order\\", we will pick a \\"column order\\"\\ninstead of a Selinger-style table order. So in our triangle query,\\nthe order could be a,b,c. Then we will build indices over each relation\\nthat are consistent with this order. In our case there are conceptually three (identical)\\nrelations: `Follows1(a, b)`, `Follows2(b, c)`, `Follows3(c, a)`. For `Follows1`,\\nwe need to be able to read all `b` values for a given `a` value (e.g., `a=5`).\\nIn graph terms, this just means that we need a \\"forward join index\\".\\nFor `Follows3`, because `a` comes earlier than `c`, we will want an index\\nthat gives us `c` values for a given `a` value. This is equivalent to a\\n\\"backward join index\\". In graphs, because joins happen through the\\nrelationship records, which can, for the purpose of the joins, \\nbe thought of as a binary relation (src, dst), 2 indices are enough\\nfor our purposes. On general relations, one may need many more indices.\\n\\n
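Before simulating the algorithm on our running example, here is a minimal sketch of Generic Join for the triangle query in Python (the toy edge list and the dict-based indices are our own illustrative stand-ins for K\xf9zu\'s join indices, not the system\'s actual implementation):\\n\\n```\\n# A toy Generic Join for the triangle query (a)->(b)->(c)->(a).\\n# fwd/bwd play the role of the forward/backward join indices.\\nfrom collections import defaultdict\\n\\nedges = [(0, 1), (1, 2), (2, 0)]  # hypothetical Follows edges\\nfwd, bwd = defaultdict(set), defaultdict(set)\\nfor src, dst in edges:\\n    fwd[src].add(dst)\\n    bwd[dst].add(src)\\n\\ntriangles = []\\nfor a in list(fwd):       # Step 1: candidate a values\\n    for b in fwd[a]:      # Step 2: extend a to ab via the forward index\\n        # Step 3: c must satisfy b->c and c->a, so intersect b\'s\\n        # forward list with a\'s backward list (a 2-way intersection).\\n        for c in fwd[b] & bwd[a]:\\n            triangles.append((a, b, c))\\n\\nprint(triangles)  # the 3 rotations of the single triangle\\n```\\nNote that no 2-path table is ever materialized: the intersection in Step 3 is exactly what replaces the growing intermediate results of binary join plans.\\n\\n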
*(Figure: a simulation of Generic Join on the running example, showing the intermediate a, ab, and abc tables; image: wcoj-gj-simulation.png)*\\n\\n
\\n\\n\\nWe will iteratively find: (i) all `a` values\\nthat can be in the final triangles; (ii) all `ab`\'s that be in the final\\ntriangles; and (iii) all `abc`\'s, which are the triangles. Let\'s simulate the computation:\\n - Step 1: Find all `a`\'s. Here we will just take\\nall nodes as possible a values. This is shown under \\"Step 1\\" in the above figure.\\n- Step 2: For each a value, e.g., a=1, we extend it to find all `ab`\'s that \\ncan be part of triangles: Here we use the forward index to look up all\\n`b` values for node with ID 1. So on and so forth. This will generate the \\nsecond intermediate relation.\\n- Step 3: For each `ab` value, e.g., the tuple (a=1 b=0), we will\\nintersect all `c`\'s with `a`=1, and all `c`\'s with `b`=0. That is, we will intersect\\nthe backward adjacency list of the node with ID 1, and forward adjacency list of \\nthe node with ID 0. If the intersection is non-empty, we produce some triangles.\\nIn this case, we will produce the triangle (`a`=1, `b`=0, `c`=1001)\\nThe result of this computation will produce the third and final \\noutput table in the figure.\\n\\n\\nNote that this process did not produce the 2-paths as an intermediate step, \\nwhich is how wcoj algorithms fix for the sub-optimality of binary join algorithms.\\nIf your query was more complex then a wcoj algorithm can do k-way intersections where k > 2. \\nFor example on the 4-clique query shown on the right, suppose the \\ncolumn order is abcd, then given abc triangles, we would do a 3-way intersection of\\nforward index of a\'s, backward index of b\'s, and forward index of c\'s, to complete\\nthe triangles to joins. This type of multiway intersections is the necessary \\nalgorithmic step to be efficient on cyclic queries.\\n\\n\\n## How K\xf9zu Performs Worst-case Optimal Join Algorithms:\\n\\nOur [CIDR paper](https://www.cidrdb.org/cidr2023/papers/p48-jin.pdf) describes this in detail, so I will be brief here. \\nFirst, K\xf9zu mixes binary joins and wcoj-like multiway intersections\\nfollowing some principles that my PhD student [Amine Mhedhbi](http://amine.io/)\\nhad worked quite hard on early in his PhD. I recommend these two papers, \\none by [Amine and me](https://www.vldb.org/pvldb/vol12/p1692-mhedhbi.pdf)\\nand one by the [Umbra group](https://db.in.tum.de/~freitag/papers/p1891-freitag.pdf) \\non several different ways people have proposed for mixing binary and wcoj algorithms in query plans. \\nOverall message of these studies is that, wcoj are critical when the query has a very cyclic component\\nand multiway intersections can help. If the query does not have this property, \\nsystems should just use binary joins. \\nSo wcoj-like computations should be seen as complementing binary join plans.\\n\\n
*(Figure: K\xf9zu\'s Multiway HashJoin operator evaluating the triangle query with a filter on a.name; image: wcoj-kuzu-multiway-hash-join.png)*\\n\\n
\\n\\n\\n\\nSecond, K\xf9zu performs multiway intersections in a *Multiway HashJoin* operator.\\nIn our CIDR paper we call this operator Multiway ASPJoin. It can\xa0be thought \\nof a modified hash-join operator where we use multiple hash tables and do \\nan intersection to produce outputs as I will simulate. \\nLet me change the query a little and add a filter on `a.name = Noura`,\\nwhere `name` is the primary key of `User` records. You can see from Fig 1a\\nthat Noura is the primary key of node with ID 1. In my simulation,\\nthe Multiway HashJoin operator will take `ab` tuples and extend them \\nto `abc` tuples through a 2-way intersection. In general multiway HashJoin\\nhas 3 phases: 1 accumulate phase, build phases to build k-2 hash tables, \\nand a probe phase. Here are the steps.\\n- Step 1 - Accumulate Phase: The operator receives the `ab` tuples which will be extended\\nto triangles. This allows the system to see exactly\\nthe forward/backward lists of which nodes will be intersected. Then, the operator passes \\nthis information sideways to only scan those lists. In this case,\\nbecause there is a primary key filter on Noura, the only `ab` tuple that will be read\\nis (a=1,b=0). This is stored in a temporary buffer that we call \\"Factorized Table\\" in the system.\\n- Step 2 - Build Phase 1: In the first build step, Multway HashJoin will pass a nodeID filter\\nto the `Scan Follows (a)<-(c)` operator with only 1=true for node ID 1, and 0 for every other node ID.\\nThe operator can do this because at this stage the operator knows exactly which backward\\nadjacency lists will be needed when we extend the tuple (in this case only node with ID 1\'s\\nbackward list is needed). The Scan operator uses this node ID filter to scan only this backward list, \\n{1001}, and avoids\\nscanning the rest of the file that stores the backwards Follows edges. This list is first sorted\\nbased on the IDs of the neighbor IDs and stored in a hash table, denoted as \\"Hash Table (a)<-(c)\\"\\nin the figure.\\n- Step 3 - Build Phase 2: This is similar to Build phase 1. Using a semijoin filter\\nwith node 0\'s ID, we scan only node 2\'s forward `Follows` list {1001, 1002, ..., 2000}, \\nsort it, and then store in a hash table \\"Hash Table (b)->(c)\\".\\n- Step 4 - Probe: We re-scan the accumulated `ab` tuples from the factorized table.\\nFor each tuple, we first probe \\"Hash Table (a)<-(c)\\" \\nand then \\"Hash Table (b)->(c)\\" to fetch two lists, intersect them, and produce outputs.\\nIn this case there is only one tuple (a=1, b=0), so we will fetch a=1\'s backward list and b=0\'s forward list,\\nintersect these lists, and produce the triangle (a=1, b=0, c=1001).\\n\\nThis performs quite well. Our [CIDR paper](https://www.cidrdb.org/cidr2023/papers/p48-jin.pdf) has some performance numbers\\ncomparing against other types of WCO joins implementations (see the experiments in Table 3). Since I did not cover other ways to implement\\nwco join algorithms inside DBMSs, these experiments would be difficult to explain here.\\nInstead, let me just demonstrate some simple comparisons between using binary joins and wco joins\\nin K\xf9zu on a simple triangle query. On larger cyclic queries, e.g., 4- or 5- cliques, \\nthe differences are much larger and often binary join plans do not finish on time.\\nYou can try this experiment too. \\n\\nHere is the configuration. 
The dataset I\'m using\\nis a popular web graph used in academic papers, called [web-BerkStan](https://snap.stanford.edu/data/web-BerkStan.html).\\nIt has 685K nodes and 7.6M edges.\\nI modeled these as simple `Page` nodes and `Links` edges.\\n\\nI start K\xf9zu on my own laptop, which is a MacBook Air 2020 with an Apple M1 chip, 16GB memory,\\nand a 512GB SSD, and run the following two queries (by default, K\xf9zu uses all threads available, which is 8 in this case):\\n\\n```\\n- Q1: K\xf9zu-WCO\\nMATCH (a:Page)-[e1:Links]->(b:Page)-[e2:Links]->(c:Page)-[e3:Links]->(a)\\nRETURN count(*)\\n```\\nThis will compile into a plan that uses a wco Multiway HashJoin operator. I will refer to this\\nplan as K\xf9zu-WCO below. I am also running the following query:\\n```\\n- Q2: K\xf9zu-BJ\\nMATCH (a:Page)-[e1:Links]->(b:Page)\\nWITH *\\nMATCH (b:Page)-[e2:Links]->(c:Page)\\nWITH *\\nMATCH (c)-[e3:Links]->(a)\\nRETURN count(*)\\n```\\n\\nCurrently K\xf9zu compiles each MATCH/WITH block separately, so this is a hack to force the system\\nto use a binary join plan. The plan will join `e1` `Links` with `e2` `Links` and then\\njoin the result of that with `e3` `Links`, all using the binary HashJoin operator. I will\\nrefer to this as K\xf9zu-BJ. Here are the results:\\n\\n| Configuration | Time |\\n|----------|:-------------:|\\n| K\xf9zu-WCO | 1.62s |\\n| K\xf9zu-BJ | 51.17s |\\n\\nThere are ~41M triangles in the output. We see a **31.6x** performance improvement in this simple query. \\nIn larger, densely cyclic queries, binary join plans just don\'t work.\\n\\nTo try this locally, you can download our prepared CSV files from [here](https://github.com/kuzudb/kuzudb.github.io/tree/main/data/web-berkstan), and compile from our [latest master](https://github.com/kuzudb/kuzu)[^2] (`make clean && make release NUM_THREADS=8`).\\nThen start K\xf9zu\'s shell, and load the data into K\xf9zu:\\n```\\n./build/release/tools/shell/kuzu_shell -i web.db\\nkuzu> CREATE NODE TABLE Page (id INT64, PRIMARY KEY(id));\\nkuzu> CREATE REL TABLE Links (FROM Page TO Page, MANY_MANY);\\nkuzu> COPY Page FROM \'web-node.csv\';\\nkuzu> COPY Links FROM \'web-edge.csv\';\\n```\\nNow, run those two queries (K\xf9zu-WCO and K\xf9zu-BJ) to see the difference!\\n\\n## A Thank You & an Anecdote About Knuth\'s Reaction to the Term \\"Worst-case Optimal\\"\\n \\nBefore wrapping up, I want to say thank you to [Chris R\xe9](https://cs.stanford.edu/~chrismre/), who is a\\nco-inventor of the earliest wcoj algorithms. \\nIn the 5th year of my PhD, Chris introduced me to this area and \\nwe wrote a paper together on the topic in the context of evaluating\\njoins in distributed systems, such as MapReduce and Spark. I ended up working on\\nthese algorithms and trying to make them performant in actual systems\\nfor many more years than I initially predicted. \\nI also want to say thank you to [Hung Ngo](https://hung-q-ngo.github.io/) and [Atri Rudra](https://cse.buffalo.edu/faculty/atri/),\\nwith whom I have had several conversations during those years on these algorithms.\\n\\nFinally, let me end with a fun story about the term \\"worst-case optimal\\": \\nSeveral years ago [Don Knuth](https://uwaterloo.ca/computer-science/events/dls-donald-knuth-all-questions-answered) was visiting UWaterloo\\nto give a Distinguished Lecture Seminar, which is our department\'s most prestigious \\nlecture series. A colleague of mine and I had a 1-1 meeting with him. 
\\nKnuth will be known to anyone with a CS degree, but importantly he is\\ncredited with founding the field of algorithm analysis (e.g., for popularizing\\nthe big-oh notation for analyzing algorithms\' performances). \\nIn our meeting, he asked me what I was working on\\nand I told him about these new algorithms called \\"worst-case optimal join algorithms\\".\\nThe term was so confusing to him, and his immediate interpretation \\nwas: \\"Are they so good that they are optimal even in their worst-case performances?\\" \\n\\nThe term actually means that the worst-case runtime of these algorithms\\nmeets a known lower bound for the worst-case runtime of any join algorithm,\\nwhich is $\\\\Omega(IN^{\\\\rho^*})$.\\nProbably a more standard term would be to call them \\n\\"asymptotically optimal\\", just like people call merge sort an asymptotically optimal \\nsorting algorithm under the comparison model.\\n\\n\\n## Final Words\\nWhat other fundamental algorithmic developments have\\nbeen made in the field of join processing? It is surprising, but there are still major gaps\\nin the field\'s understanding of how fast joins can be processed. \\nThere has been some very interesting \\nwork in an area called *beyond worst-case optimal join algorithms*. These papers\\nask very fundamental questions about joins, such as: how can we prove that a join algorithm\\nis correct, i.e., that it produces the correct output given its input? \\nThe high-level answer is that each join algorithm must produce a proof that its output is correct,\\nthrough the comparison operations it makes.\\nThe goal of this line of research is to design practical algorithms whose implicit proofs are optimal,\\ni.e., as small as possible. This is \\nprobably the most ambitious level of optimality one can go for in algorithm design.\\nThere are already some algorithms, e.g., an algorithm called [Tetris](https://dl.acm.org/doi/pdf/10.1145/2967101). The area\\nis fascinating and has deep connections to computational geometry. I\\nadvised a [Master\'s thesis](https://arxiv.org/abs/1909.12102) on the topic once and learned quite a bit about\\ncomputational geometry that I never thought could be relevant to my work. The current\\nbeyond worst-case optimal join algorithms, however, are not yet practical. \\nSome brave souls need to get into the space and think hard about whether \\npractical versions of these algorithms can be developed. That would be very exciting.\\n\\nThis completes my 3-part blog series on the contents of our CIDR paper and the 2 core techniques,\\n[factorization](../2023-01-20-factorization/index.md) and worst-case optimal join algorithms, that we have integrated into\\nK\xf9zu to optimize for many-to-many joins. My goal in these blog\\nposts was to explain these ideas to a general CS/software engineering audience, and\\nI hope these posts have made this material more approachable. My other goal\\nwas to show the role of theory in advancing systems. Both of these ideas emerged from\\npen-and-paper theory papers that theoreticians wrote, but they gave clear advice to DBMS developers.\\nAs I said many times, I\'m convinced that, among many other techniques, these two \\ntechniques need to be integral to any GDBMS that wants to be competitive in performance,\\nbecause queries with many-to-many joins are first-class citizens in the workloads of these systems.\\n\\nWe will keep writing more blog posts in the coming months about our new releases,\\nand other technical topics. 
If there are things you\'d like us to write about,\\nplease reach out to us! Also please give K\xf9zu a try, prototype applications with it,\\nbreak it, and let us know of performance or other bugs, so we can continue improving\\nit. Give us a [GitHub star](https://github.com/kuzudb/kuzu) too and take care until the next posts!\\n\\n\\n[^1]: The question is interesting under set semantics, where you cannot pick the same value for every column of every tuple, which would force the output to be the Cartesian product of all the relations.\\n[^2]: We found a minor bug in the latest release 0.0.2 that appears when a node has a very large number of edges. It is fixed in the master branch, which is why we suggest using the master branch."},{"id":"kuzu-0.0.2-release","metadata":{"permalink":"/docusaurus/blog/kuzu-0.0.2-release","source":"@site/blog/2023-02-13-kuzu-v-0.0.2.md","title":"K\xf9zu 0.0.2 Release","description":"This post is about the second release of K\xf9zu. However, we want to start with something much more important:","date":"2023-02-13T00:00:00.000Z","formattedDate":"February 13, 2023","tags":[{"label":"release","permalink":"/docusaurus/blog/tags/release"}],"readingTime":6.33,"hasTruncateMarker":true,"authors":[{"name":"K\xf9zu Team","url":"https://github.com/kuzudb/","imageURL":"https://kuzudb.com/img/blog/team.jpg","key":"team"}],"frontMatter":{"slug":"kuzu-0.0.2-release","authors":["team"],"tags":["release"]},"prevItem":{"title":"Why (Graph) DBMSs Need New Join Algorithms: The Story of Worst-case Optimal Join Algorithms","permalink":"/docusaurus/blog/wcoj"},"nextItem":{"title":"Factorization & Great Ideas from Database Theory","permalink":"/docusaurus/blog/factorization"}},"content":"This post is about the second release of K\xf9zu. However, we want to start with something much more important:\\n\\n### Donate to the Victims of the [T\xfcrkiye-Syria Earthquake](https://www.bbc.com/news/world-middle-east-64590946):\\nOur hearts, thoughts, and prayers go to all the victims, those who survived and those who passed,\\nin Syria and T\xfcrkiye. \\nIt will be a very difficult winter for all those who survived, so everyone needs to help. \\nHere are pointers to two trustworthy organizations we know of that are trying to help\\nvictims on the ground. For T\xfcrkiye (where Semih is from), you can donate to [Ahbap](https://ahbap.org/bagis-kategorisi/5)\\n(please be aware that **the donation currency is TL** and 14 TL = 1 CAD; 19 TL = 1 USD); and for Syria \\nyou can donate to the [White Helmets](https://www.whitehelmets.org/en/). Be generous! We leave pointers to several \\nother organizations in this footnote[^1].\\n\\n\x3c!--truncate--\x3e\\n\\n## Overview of K\xf9zu 0.0.2\\nBack to our release. The K\xf9zu codebase is changing fast, but this release still has a focus: we \\nhave worked quite hard since the last release to enable K\xf9zu to import data from\\nand export data to different formats. There are also several other important \\nfeatures: new Cypher clauses and queries we support, additional string-processing \\ncapabilities, and new DDL statement support. We will give a summary of each \\nof these below.\\n\\nTo install the new version, please visit the [installation guide](https://kuzudb.com/docs/getting-started.html); the full\\n[release notes are here](https://github.com/kuzudb/kuzu/releases). 
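If you want to jump right in from Python, here is a minimal sketch of querying this release through our [Python API](https://kuzudb.com/api-docs/python/); the database path and the `User` table in it are made up for illustration:\\n```\\n# pip install kuzu\\nimport kuzu\\n\\ndb = kuzu.Database(\'./testdb\')  # on-disk database directory (illustrative path)\\nconn = kuzu.Connection(db)\\n\\n# Illustrative schema and data; substitute your own tables.\\nconn.execute(\\"CREATE NODE TABLE User(name STRING, age INT64, PRIMARY KEY(name))\\")\\nconn.execute(\\"CREATE (:User {name: \'Karissa\', age: 40})\\")\\n\\nresults = conn.execute(\\"MATCH (u:User) RETURN u.name, u.age\\")\\nwhile results.has_next():\\n    print(results.get_next())  # each row is returned as a Python list\\n```\\n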
If you are eager to play with\\na few Colab notebooks, here are several links: \\n- [General K\xf9zu Demo](https://colab.research.google.com/drive/15OLPggnRSBmR_K9yzq6iAGE5MDzNwqoN)\\n- [Export Query Results to Pytorch Geometric: Node Property Prediction Example](https://colab.research.google.com/drive/1fzcwBwTY-M19p7OOTIaynfgHFcAQo9NK)\\n- [Export Query Results to Pytorch Geometric: Link Prediction Example](https://colab.research.google.com/drive/1QdX7CDdajIAb04lqaO5PfJlpKG-ljG28)\\n- [Export Query Results to NetworkX](https://colab.research.google.com/drive/1NDsnFDWcSGoaOl-mOgG0zrPG2VAr8Q6H)\\n\\n## Exporting Query Results to Pytorch Geometric and NetworkX\\nPerhaps most excitingly, we have added the first capabilities to integrate with 2 popular \\ngraph data science\\nlibraries: (i) [Pytorch Geometric](https://github.com/pyg-team/pytorch_geometric) (PyG) for performing \\ngraph machine learning; and (ii) [NetworkX](https://networkx.org/) for a variety of \\ngraph analytics, including visualization. \\n\\n### Pytorch Geometric: `QueryResult.get_as_torch_geometric()` function\\nOur [Python API](https://kuzudb.com/api-docs/python/) now has a \\nnew [`QueryResult.get_as_torch_geometric()`](https://kuzudb.com/api-docs/python/kuzu/query_result.html#QueryResult.get_as_torch_geometric) function that \\nconverts results of queries to PyG\'s in-memory graph representation \\n[`torch_geometric.data`](https://pytorch-geometric.readthedocs.io/en/latest/modules/data.html).\\nIf your query results contain nodes and relationship objects, then the function uses \\nthose nodes and relationships to construct either `torch_geometric.data.Data` or \\n`torch_geometric.data.HeteroData` objects. The function also auto-converts any numeric or boolean property \\non the nodes into tensors that can be used as features in the `Data/HeteroData` objects.\\nAny properties that cannot be auto-converted, as well as the edge properties, are also returned in case you\\nwant to manually put them into the `Data/HeteroData` objects.\\n\\n**Colab Demonstrations:**\\nHere are 2 Colab notebooks that you can play around with to see how you can develop graph learning\\npipelines using K\xf9zu as your GDBMS:\\n1. [Node property prediction](https://colab.research.google.com/drive/1fzcwBwTY-M19p7OOTIaynfgHFcAQo9NK)\\n2. [Link prediction](https://colab.research.google.com/drive/1QdX7CDdajIAb04lqaO5PfJlpKG-ljG28)\\n\\nThe examples demonstrate how to extract a subgraph,\\ntrain graph convolutional or neural networks (GCNs or GNNs), make node property\\nor link predictions, and save them back in K\xf9zu so you can query these predictions.\\n\\n### NetworkX: `QueryResult.get_as_networkx()` function\\nOur [Python API](https://kuzudb.com/docs/client-apis/python-api/overview.html) now has a \\nnew [`QueryResult.get_as_networkx()`](https://kuzudb.com/api-docs/python/kuzu/query_result.html#QueryResult.get_as_networkx) function that can convert query results\\nthat contain nodes and relationships into NetworkX directed or undirected graphs. 
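For example, here is a minimal sketch; the `User` and `Follows` tables are illustrative, and `directed=True` asks for a directed NetworkX graph:\\n```\\nimport kuzu\\n\\nconn = kuzu.Connection(kuzu.Database(\'./testdb\'))\\n\\n# Illustrative query; any query that returns nodes and rels works.\\nresults = conn.execute(\\"MATCH (a:User)-[e:Follows]->(b:User) RETURN a, e, b\\")\\nG = results.get_as_networkx(directed=True)  # a networkx.DiGraph\\nprint(G.number_of_nodes(), G.number_of_edges())\\n```\\n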
Using this function, you can build pipelines\\nthat benefit from K\xf9zu\'s DBMS functionalities (e.g., querying, data extraction and transformations,\\nusing a high-level query language with very fast performance) and NetworkX\'s rich library of \\ngraph analytics algorithms.\\n\\n**Colab Demonstration:**\\nHere is a [Colab notebook](https://colab.research.google.com/drive/1NDsnFDWcSGoaOl-mOgG0zrPG2VAr8Q6H?usp=sharing#scrollTo=AkpBul7ZpUM5) \\nthat you can play around with, which shows how to do basic graph visualization of query results\\nand how to build a pipeline that computes the PageRanks of a subgraph, stores those PageRank \\nvalues back as new node properties in K\xf9zu, and queries them.\\n\\n## Data Import from and Export to Parquet and Arrow\\nWe have removed our own CSV reader and instead now use [Arrow](https://arrow.apache.org/)\\nas our default library when bulk importing data through [`COPY FROM` statements](https://kuzudb.com/docs/data-import/csv-import.html). \\nUsing Arrow, we can bulk import\\nnot only from CSV files but also from Arrow IPC and Parquet files. We detect the file type\\nfrom the suffix of the file; so if the query says `COPY user FROM ./user.parquet`,\\nwe infer that this is a Parquet file and parse it accordingly. See the details [here](/docusaurus/data-import/parquet-import).\\n\\n## Multi-labeled or Unlabeled Queries\\nA very useful feature of the query languages of GDBMSs is their\\nability to elegantly express unions of join queries. \\nWe had written about this feature of GDBMSs in this blog post about \\n[What Every Competent GDBMS Should Do](./2023-01-12-what-every-gdbms-should-do/index.md)\\n(see the last paragraph of Section `Feature 4: Schema Querying`).\\nIn Cypher, a good example\\nof this is to not bind the node and relationship variables to specific node/relationship\\nlabels/tables. Consider this query:\\n```\\nMATCH (a:User)-[e]->(b)\\nWHERE a.name = \'Karissa\'\\nRETURN a, e, b\\n```\\nThis query asks for all types of relationships that Karissa can have to any possible other\\nnode (not necessarily of label `User`) in the query. So if the database contains \\n`Likes` relationships from `Users` to `Comments`, `Follows` relationships\\nfrom `Users` to `Users`, and `LivesIn` relationships from `Users` to `Cities`, \\nvariables `e` and `b` can bind to records from all of these\\nrelationship and node labels, respectively. \\n\\nYou can also restrict the labels of nodes/rels to a fixed set that contains\\nmore than one label using the `|` syntax.\\nFor example, you can do:\\n\\n```\\nMATCH (a:User)-[e:Likes|Follows]->(b)\\nWHERE a.name = \'Karissa\'\\nRETURN a, e, b\\n```\\nThis forces `e` to match only `Likes` or `Follows` relationship records (so it\\nexcludes the `LivesIn` records we mentioned above). The `|` syntax was originally adapted from\\nregexes and is also used in query languages that support `regular path queries`. \\n\\nK\xf9zu now supports such queries. Our query execution\\nis based on performing scans of each possible node/rel table and index,\\nand when a variable `x` can bind to multiple node/rel tables `L1, L2, ..., Lk`,\\nwe reserve one vector for each possible property of each node/rel table. \\nIf anyone has any optimizations to do something smarter, it would be very interesting\\nto hear!\\n\\n## Other Important Changes\\n\\n### Enhanced String Features\\nWe\'ve added two important features to enhance K\xf9zu\'s ability to store and process strings:\\n\\n1) Support of UTF-8 characters. 
With the help of [utf8proc](https://github.com/JuliaStrings/utf8proc), you can now store string node/relationship\\n properties in K\xf9zu that contain UTF-8 characters;\\n2) Support of [regex pattern matching](/docusaurus/cypher/expressions/pattern-matching) with strings. K\xf9zu now supports Cypher\'s `=~` operator for regex searches, which returns true if its pattern matches the entire input string. For example: `RETURN \'abc\' =~ \'.*(b|d).*\';`.\\n\\n### CASE Expression\\nWe\'ve added [CASE](/docusaurus/cypher/expressions/case-expression) for conditional expressions.\\nTwo forms ([Simple Form](/docusaurus/cypher/expressions/case-expression#simple-form) and [General Form](/docusaurus/cypher/expressions/case-expression#general-form)) of the CASE expression are supported.\\n\\n### ALTER/DROP/SET/DELETE\\nWe added [ALTER TABLE](/cypher/data-definition/alter) and [DROP TABLE](/cypher/data-definition/drop) DDL statements.\\nAfter creating a new node or relationship table, you can now drop it, rename it, and alter it by adding new columns/properties \\nor by renaming or dropping existing columns/properties.\\n\\nBesides schema-level changes, you can change properties of existing nodes/rels with [SET](/docusaurus/cypher/data-manipulation-clauses/set) statements and remove existing nodes/rels with [DELETE](/docusaurus/cypher/data-manipulation-clauses/delete) statements.\\n\\n### Disable Relationships with Multiple Source or Destination Labels\\nWe no longer support defining a relationship over multiple source or destination labels.\\nThis is to simplify our storage, but please let us know if you have strong use cases for this.\\n\\nEnjoy our new release and don\'t forget to donate to the earthquake victims.\\n\\n[^1]: For T\xfcrkiye, two other organizations are [AFAD](https://en.afad.gov.tr/earthquake-campaign), which is the public institute for coordinating natural disaster response, and [Akut](https://www.akut.org.tr/en/donation), a volunteer-based and highly organized search and rescue group. For Syria, another campaign I can recommend is [Molham Team](https://molhamteam.com/en/campaigns/439?fbclid=IwAR3_t443XME9Gh0r75KM4VpQ58WLNPd8w8tyMV2JprdObwecPwhWAdX2FOQ), an organization founded by Syrian refugee students."},{"id":"factorization","metadata":{"permalink":"/docusaurus/blog/factorization","source":"@site/blog/2023-01-20-factorization/index.md","title":"Factorization & Great Ideas from Database Theory","description":"Many of the core principles of how to develop DBMSs are well understood.","date":"2023-01-20T00:00:00.000Z","formattedDate":"January 20, 2023","tags":[{"label":"internals","permalink":"/docusaurus/blog/tags/internals"}],"readingTime":22.71,"hasTruncateMarker":true,"authors":[{"name":"Semih Saliho\u011flu","title":"CEO of K\xf9zu Inc & Associate Prof. 
at UWaterloo","url":"https://cs.uwaterloo.ca/~ssalihog/","imageURL":"https://kuzudb.com/img/blog/semih.jpg","key":"semih"}],"frontMatter":{"slug":"factorization","authors":["semih"],"tags":["internals"]},"prevItem":{"title":"K\xf9zu 0.0.2 Release","permalink":"/docusaurus/blog/kuzu-0.0.2-release"},"nextItem":{"title":"What Every Competent GDBMS Should Do (aka The Goals & Vision of K\xf9zu)","permalink":"/docusaurus/blog/what-every-gdbms-should-do-and-vision"}},"content":"import TwoHopDataImage from \'./2-hop-data.png\';\\nimport TwoHopQueryPlanHashJoinImage from \'./2-hop-query-plan-hash-join.png\';\\nimport TwoHopQueryPlanExtendImage from \'./2-hop-query-plan-extend.png\';\\nimport TwoHopFactorizationExperimentImage from \'./2-hop-factorization-experiment.png\';\\nimport FlatVsFactorizedImage from \'./flat-vs-factorized.png\';\\nimport FactorizedVectorsImage from \'./factorized-vectors.png\';\\nimport FactorizedExecutionSimulationImage from \'./factorized-execution-simulation.png\';\\n\\n\\nMany of the core principles of how to develop DBMSs are well understood.\\nFor example, a very good query compilation paradigm is to \\nmap high-level queries to a logical plan of relational operators, then optimize this plan,\\nand then further map it to executable code, often in the form of a physical query plan. \\nSimilarly, if you want updates to a DBMS to be atomic and durable,\\na good paradigm is to use a write-ahead log that serves as a source of truth\\nand can be used to undo or redo operations. Many systems adopt such common-wisdom paradigms. \\nAs a core DBMS researcher, however, once in a while you run into a very simple idea \\nthat deviates from the norm and gets you very excited. \\nToday, I want to write about one such idea called [factorization](https://www.cs.ox.ac.uk/dan.olteanu/papers/os-sigrec16.pdf). \\n\\n\x3c!--truncate--\x3e\\n\\n:::tip Tldr: The key takeaways are:\\n- **Overview of Factorization & Why Every GDBMS Must Adopt It**: Factorization\\n is a compression technique to compress the intermediate results\\n that query processors generate when evaluating many-to-many (m-n) joins. \\n Factorization can compress an intermediate result size exponentially \\n in the number of m-n joins in the query.\\n- **Example Benefits of Factorization**: Keeping intermediate\\n results smaller reduces the computation processors perform \\n on many queries. Examples include reducing copies by keeping the output\\n data size small, reducing filter and expression evaluation computations exponentially,\\n and performing very fast aggregations.\\n- **How K\xf9zu Implements Factorization:** K\xf9zu\'s query processor\\n is designed to achieve 3 design goals: (i) factorize intermediate results;\\n (ii) always perform sequential scans of database files; and (iii) avoid\\n scanning large chunks of database files when possible. In addition, the processor is \\n vectorized as in modern columnar DBMSs. These design goals are achieved by operators passing \\n multiple *factorized vectors* between each other and by modified HashJoin operators \\n that do *sideways information passing* to avoid scans of entire files.\\n:::\\n\\nThis is a quite technical and long blog post that will appeal more to people who are interested\\nin the internals of DBMSs. It\'s about a technique that\'s quite dear to my heart called factorization,\\nwhich is a very\\nsimple data compression technique. Probably all \\ncompression techniques you know are designed to compress database files that \\nare stored on disk. 
Think of run-length encoding, dictionary compression, or bitpacking.\\nIn contrast, you can\'t use factorization to compress your raw database files. \\nFactorization has a very unique property:\\nit is designed to compress the intermediate \\ndata that are generated when the query processors of DBMSs evaluate \\nmany-to-many (m-n) growing joins. If you have read [my previous blog](../2023-01-12-what-every-gdbms-should-do/index.md),\\nefficiently handling m-n joins was one of the items on my list of properties \\nthat competent GDBMSs should excel in. This is because \\nthe workloads of GDBMSs commonly contain m-n joins\\nacross node records. Each user in a social network or each account in a financial transaction network\\nwill have thousands of connections, and if you want\\na GDBMS to find patterns on your graphs, you are \\nasking queries with m-n joins. Factorization is directly designed\\nfor these workloads, and because of that every competent GDBMS must develop \\na factorized query processor. In fact, if I were to try to write a new analytical RDBMS,\\nI would probably also integrate factorization into it.\\n\\nThis post forms the 2nd part of my 3-part series on the contents of our [CIDR paper](https://www.cidrdb.org/cidr2023/papers/p48-jin.pdf)\\nwhere we introduced K\xf9zu. The 3rd piece will be on another technique called worst-case \\noptimal join algorithms, which is also designed for a specific class of m-n joins.\\nBoth in this post and the next, I have two goals. The first is to try to articulate these techniques \\nusing a language that is accessible to general software engineers. \\nThe second is to make people appreciate the role of \\npen-and-paper theory in advancing the field of DBMSs. Both of these techniques were first \\narticulated in a series of purely theoretical papers which gave excellent \\npractical advice on how to improve DBMS performance. \\nCredit goes to the great theoreticians who pioneered these techniques, whom I will cite\\nin these posts. Their work should be highly appreciated.\\n\\n## A Quick Background: Traditional Query Processing Using Flat Tuples\\nHere is a short background on the basics of\\nquery processors before I explain factorization. If you know about \\nquery plans and how to interpret them,\\nyou can skip to [here](#factorization-in-a-nutshell) after reading\\nmy running example.\\nConsider a database of Account node and Transfer edge records below.\\nThe two Accounts with `accID` fields L1 and L2 are owned by Liz and \\neach have 100 incoming and 100 outgoing Transfer edges.\\n\\n
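In K\xf9zu, this running example could be set up roughly as follows (a sketch through the Python API; the property names are illustrative):\\n```\\nimport kuzu\\n\\nconn = kuzu.Connection(kuzu.Database(\'./accounts.db\'))\\n\\n# Illustrative schema for the running example.\\nconn.execute(\\"CREATE NODE TABLE Account(accID STRING, name STRING, PRIMARY KEY(accID))\\")\\nconn.execute(\\"CREATE REL TABLE Transfer(FROM Account TO Account)\\")\\n```\\n\\n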
*[Figure 1: The example Account node and Transfer edge records.]*\\n
\\n\\nNow consider a 2-hop path query in Cypher returning the accIDs of the sources\\nand destinations of the money flows that Liz\'s accounts are facilitating:\\n\\n``` \\nMATCH (a:Account)-[t1:Transfer]->(b:Account)-[t2:Transfer]->(c:Account)\\nWHERE b.name = \'Liz\' \\nRETURN a.accID, c.accID\\n```\\n\\nHere\'s the SQL version of the query if you modeled your records as relations.\\nSame query, different syntax:\\n```\\nSELECT a.accID, c.accID\\nFROM Account a, Transfer t1, Account b, Transfer t2, Account c\\nWHERE b.name = \'Liz\' AND\\n t1.src = a.accID AND t1.dst = b.accID AND\\n t2.src = b.accID AND t2.dst = c.accID\\n```\\n\\nA standard query plan for this query is shown on the left in Fig. 2. \\nThe plan contains some Scan operators to scan the raw node or edge records (edges could be \\nscanned from a join index), some HashJoin operators to perform the joins, and \\na final Projection operator.\\nIn some GDBMSs, you might see \\"linear plans\\" that look as in Fig. 3.\\n\\n*[Figure 2: A standard query plan with binary HashJoin operators. Figure 3: A linear plan with Extend operators.]*
\\n\\nThe linear plan is from our previous GraphflowDB system. Here\\nyou are seeing an operator called Extend, which joins node records with their Transfer relationships to \\nread the system-level IDs of the neighbors of those node records. \\nFollowing the Extend is another Join operator to join the accID properties of those neighbors \\n(specifically c.accID and a.accID). \\nIn Neo4j, you\'ll instead see an Expand(All) operator, which does the Extend+Join\\nin GraphflowDB in a single operator[^1]. For very good reasons\\nwe removed these Extend/Expand type operators in K\xf9zu. I will come back to this.\\n\\nThe interpretation of plans is that tuples are flowing from the bottom to top and\\neach operator will take in sets of tuples and produce sets of tuples (in a pipelined fashion). \\nThe key motivation for factorization is that what flows \\nbetween operators are **flat tuples**. When the joins are m-n, this \\nleads to many data repetitions, which one way or another leads to repeated\\ncomputation in the operators. For example,\\nthe final projection operator in our example would take the table shown in Figure 4 (left).\\n
*[Figure 4: The same intermediate result as flat tuples (left) and in factorized form (right).]*\\n
\\n\\nThere are 20K tuples in the flat representation because both L1 and L2 are part of \\n100 incoming x 100 outgoing = 10K 2-paths. Notice the many repetitions in this relation:\\nthe L1, L2, and Liz values, and the values in a.accID and c.accID. \\nWhat gets replicated may change across systems. Some may replicate the actual values,\\nsome may replicate indices where these values are stored, but overall exactly 20K\\ntuples would be generated. This redundancy leads to redundant computation here and there\\nduring query processing.\\n\\n## Factorization In a Nutshell\\nFactorization addresses exactly this problem. The core reason for the redundancy\\nis this observation: *given a fixed b value, all a\'s and c\'s are conditionally independent*.\\nMore concretely, once b is bound to node L1, each incoming neighbor `a` of L1 will join \\nwith each outgoing neighbor `c` of L1. If you took a standard undergraduate course on DBMSs at a university\\nand covered the theory of normalization, this is what is \\ncalled a [multi-valued dependency](https://en.wikipedia.org/wiki/Multivalued_dependency)\\nin relations. Factorization exploits such dependencies to compress\\nrelations using Cartesian products.\\nAbove in Figure 4 (right),\\nI\'m showing the same 20K tuples in a factorized format using only 400 values\\n(so 2\\\\*(100+100) instead of 2\\\\*100\\\\*100 values). \\n\\nThat\'s it! That\'s the core of the idea! Now of course, this simple observation leads to a ton of \\nhard and non-obvious questions that the entire theory of factorization answers. For example, \\ngiven a query, what are the \\"factorization structures\\", i.e., the Cartesian product structures,\\nthat can be used to compress its results? Consider a simple query that counts the number of\\npaths that are slightly longer:\\n```\\nMATCH (a:Account)-[:Wire]->(b:Account)-[:Deposit]->(c:Account)-[:ETransfer]->(d:Account)\\nRETURN count(*)\\n```\\nShould you condition on b and factor out \\na\'s from (c, d)\'s, or condition on c and factor out (a, b)\'s from d\'s? \\nOr you could condition on (b, c) and factor out (a)\'s from (d)\'s.\\nTo make a choice, a system has to reason about the number of Wire, Deposit,\\nand ETransfer records in the database.\\nHow much and on which queries can you benefit from factorization?\\nThe theoretical questions are endless. \\nThe theory of factorization develops the formal foundation so that such questions can be answered, and it \\nprovides principled first answers to these questions. \\n[Dan Olteanu](https://www.ifi.uzh.ch/en/dast/people/Olteanu.html) and his \\ncolleagues, who lead this field, recently won the [ICDT test of time award](https://databasetheory.org/ICDT/test-of-time)\\nfor their work on factorization. ICDT is one of the two main \\nacademic venues for theoretical work on DBMSs.\\n\\nBut let\'s take a step back and appreciate this theory, because it gives excellent \\nadvice to system developers: *factorize your intermediate\\nresults if your queries contain many-to-many joins!* \\nRecall that GDBMSs most commonly evaluate many-to-many joins. Hence my point that \\nGDBMSs should develop factorized query processors.\\nThe great thing this theory shows us is that this can all be done by static analysis of the query \\nduring compilation time, by only inspecting the dependencies between variables in\\nthe query! 
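To make the compression concrete, here is a toy Python sketch (illustrative only, not K\xf9zu\'s internals) that stores the running example\'s 20K intermediate tuples as two factorized groups and counts the flat tuples without ever enumerating them:\\n```\\n# Each factorized group (a_values, b_value, c_values) denotes the\\n# Cartesian product {a\'s} x {b} x {c\'s}.\\ngroups = [\\n    (list(range(100)), \\"L1\\", list(range(100))),  # 100 a\'s x {L1} x 100 c\'s\\n    (list(range(100)), \\"L2\\", list(range(100))),\\n]\\n\\n# Values stored: 2 * (100 + 1 + 100) = 402, vs. 2 * 100 * 100 = 20K flat.\\nstored = sum(len(a) + 1 + len(c) for a, _, c in groups)\\n\\n# count(*) needs no enumeration: multiply the set sizes per group.\\nflat = sum(len(a) * len(c) for a, _, c in groups)\\nprint(stored, flat)  # 402 20000\\n```\\n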
I won\'t cover the exact rules, but at least in my running example,\\nI hope it\'s clear that because there is no predicate between a\'s and c\'s, once\\nb is fixed, we can factor out a\'s from c\'s.\\n\\n## Examples When Factorization Significantly Benefits:\\nFactorized intermediate relations can be exponentially smaller\\n(in terms of the number of joins in the query)\\nthan their flat versions, which \\ncan yield orders-of-magnitude speedups in query performance \\nfor many different reasons. I will discuss the three most obvious ones.\\n\\n### Less Data Copies/Movement \\nThe most obvious benefit is that factorization reduces\\nthe amount of data copied between the buffers used by operators\\nduring processing and to the final `QueryResult` structure\\nthat the application gets access to. For example, a very cool feature of K\xf9zu \\nis that it keeps final outputs in factorized format in its `QueryResult` class and \\nenumerates them one by one only when the user starts calling `QueryResult::getNext()`\\nto read the tuples.\\nIn our running example, throughout processing K\xf9zu would copy\\nroughly 400 data values instead of 20K to produce its `QueryResult`. \\nNeedless to say, I could have picked a more exaggerated query, say a \\"star\\" query\\nwith 6 relationships, and arbitrarily increased the difference in the copies done \\nbetween a flat vs factorized processor.\\n\\n### Fewer Predicate and Expression Evaluations\\nFactorization can also reduce the number of predicate or expression evaluations the system performs.\\nSuppose we modify our 2-hop query a bit and put two additional filters on the query:\\n```\\nMATCH (a:Account)-[e1:Transfer]->(b:Account)-[e2:Transfer]->(c:Account)\\nWHERE b.name = \'Liz\' AND a.balance > b.balance AND c.balance > b.balance\\nRETURN *\\n```\\nI\'m omitting a plan for this query, but a common plan would extend the plan in Figure 2 (or 3) above\\nto also scan the balance properties and to run two filter operations: \\n(i) above the join that joins a\'s and b\'s,\\nto run the predicate `a.balance > b.balance`; and (ii) after the final join in Figure 2,\\nto run the predicate `c.balance > b.balance`. Suppose the first filter did not eliminate any tuples.\\nThen, a flat processor would perform 20K filter evaluations in the second filter.\\nIn contrast, the input to the second filter operator in a factorized processor \\nwould be the 2 factorized tuples \\nshown in Figure 4 (right), but extended with the `balance` properties\\nof the a\'s, b\'s, and c\'s. Therefore there would be only 200 filter evaluations: (i) \\nfor the first factorized tuple, there are only\\n100 comparisons to execute `c.balance > b.balance`, since b is matched to a single\\nvalue and there are 100 c values; and (ii) similarly for the 2nd factorized tuple.\\nWe can obtain similar benefits when running other expressions.\\n\\n### Aggregations\\nThis is perhaps where factorization yields the largest benefits.\\nOne can perform several aggregations directly on factorized tuples using\\nthe algebraic properties of many aggregation functions. Let\'s\\nfor instance modify our above query to a count(\\\\*) query: find the number of 2-paths that Liz is \\nfacilitating. 
A factorized processor can simply count that there are 100\\\\*100 flat tuples in the first\\nfactorized tuple and similarly in the second one to compute that the answer is 20K.\\nOr consider doing min/max aggregation on factorized variables:\\n```\\nMATCH (a:Account)-[e1:Transfer]->(b:Account)-[e2:Transfer]->(c:Account)\\nWHERE b.accID = \'L1\'\\nRETURN max(a.balance), min(c.balance)\\n```\\nThis is asking: find the 2-path money flow that Liz\'s L1 account facilitates from the highest-\\nto the lowest-balance accounts (and only print the balances). If a processor \\nprocesses the 10K 2-paths that L1 is part of in factorized form, then \\nthe processor can compute the max and min aggregations\\nwith only 100 comparisons each (instead of 10K comparisons each). \\n\\nIn short, factorizing intermediate results \\nreduces computation and data copies here and there in many cases.\\nYou can try some of these queries on K\xf9zu and compare its performance on large \\ndatasets with non-factorized systems. \\n\\n## How Does K\xf9zu Perform Factorized Query Processing?\\nThe rest will be even more technical and forms part of the technical meat of our CIDR paper; \\nso continue reading if you are interested in database implementations.\\nWhen designing the query processor of K\xf9zu, we had 3 design goals: \\n1. Factorize intermediate growing join results. \\n2. Always perform sequential scans of database files from disk.\\n3. When possible, avoid scanning entire database files from disk.\\n\\nThe 3rd design goal requires some motivation, which I will provide below. Let\'s go one by one.\\n\\n### 1. Factorization \\nK\xf9zu has a vectorized query processor, which is the common wisdom\\nin analytical read-optimized systems. \\n\\nVectorization, in the context of DBMS query processors, \\nrefers to the design where operators pass a set of tuples, say 1024 or 2048 of them, \\nbetween each other during processing[^2]. Existing vectorized query processors (in fact, \\nthe processors of all systems I\'m aware of) pass *a single vector of flat tuples*.\\nInstead, K\xf9zu\'s operators pass (possibly) multiple *factorized vectors of tuples* \\nbetween each other. Each vector can either be *flat* and represent a single value, or \\n*unflat* and represent a set of values, which is marked in a field called `curIdx`\\nassociated with each vector.\\nFor example, the first 10K tuples from my running example would be represented\\nwith 3 factorized vectors as on the left and would be passed to the final projection\\nin the query plan in Figure 2.\\nThe interpretation is this: what is passed is the Cartesian product of all sets of\\ntuples in those vectors. Operators know at compilation time how many vector\\ngroups they will take in and how many they will output. Importantly, we still\\ndo vectorized processing, i.e., each primitive operator operates on a vector of values\\ninside tight for loops. \\nCredit where credit\'s due: this simple-to-implement design was proposed \\nby my PhD student [Amine Mhedhbi](http://amine.io/), with some feedback from \\nme, my ex-Master\'s student \\n[Pranjal Gupta](https://www.linkedin.com/in/g31pranjal/?originalSubdomain=in),\\nand [Xiyang Feng](https://www.linkedin.com/in/xingyang-feng-14198491/?originalSubdomain=ca), \\nwho is now a core developer of K\xf9zu. \\nAnd we directly adopted it in K\xf9zu. Amine has continued doing other excellent\\nwork on factorization, which we have not yet integrated, and you\\nwill need to wait until his PhD thesis is out.\\n\\n### 2. 
Ensuring Sequential Scans\\nI already told you above that \\nExtend/Expand-type join operators lead to non-sequential scans of database files.\\nThese operators are not robust, and if you are developing a disk-based system,\\nnon-sequential scans will kill you on many queries. Instead, \\nK\xf9zu uses (modified) HashJoins, which are much more robust. HashJoins do not perform any scans\\nas part of the actual join computation, so if the downstream scans\\nare sequential, you get sequential scans. I\'ll give a simulation momentarily.\\n\\n### 3. Avoiding Full Scans of Database Files\\nAlthough I don\'t like Extend/Expand-type join operators,\\nthey have a performance advantage. Suppose you had a simple 1-hop query that only asked for\\nthe names of accounts that Liz\'s L1 account has transferred money to:\\n```\\nMATCH (a:Account)-[:Transfer]->(b:Account)\\nWHERE a.accID = \'L1\'\\nRETURN b.name\\n```\\nSuppose your database has billions of transfers but L1 has made only 3 transfers, to\\naccounts with system-level record/node IDs 107, 5, and 15. Then if you had\\na linear plan like I showed in Figure 3, an Extend/Expand-type\\noperator could read these system-level IDs and then only scan\\nthe name properties of these 3 nodes, avoiding the full scan of the names\\nof all Accounts. If your query needs to read the neighborhoods of millions of nodes, \\nthis type of computation that \\"reads the properties of each node\'s neighbors\\"\\nwill degrade very quickly because: (i) each neighborhood \\nof each node will require reading\\ndifferent parts of the disk files that store those properties; and (ii)\\nthe system might repeatedly read the same properties over and over from disk,\\nas nodes share neighbors.\\nInstead, you want to\\nread all of the properties, create a hash table, and read those properties\\nfrom memory. \\nHowever, if your query is accessing the neighborhoods of only a few nodes,\\nthen avoiding the scan of an entire database file is an advantage.\\nIn K\xf9zu, we wanted to use HashJoins, but we also wanted a mechanism to scan \\nonly the necessary parts of database files. We\\ndo this through a technique called *sideways information passing*[^3]. \\nI\'ll simulate this below.\\n\\n### A Simple Simulation\\nFor simplicity, we\'ll work on a simpler 1-hop query, so the benefits of factorization will not \\nbe impressive, but it will allow me to explain an entire query processing pipeline.\\nConsider this count(\\\\*) query that counts the number of transfers the L1 account has made:\\n```\\nMATCH (a:Account)-[t1:Transfer]->(b:Account)\\nWHERE a.accID = \'L1\'\\nRETURN count(*)\\n```\\nAn annotated query plan we generate is shown below. The figure shows step by step\\nthe computation that will be performed and the data that will be passed between operators.\\nFor this simulation, I am assuming that the record/nodeIDs of Accounts are as in \\nFigure 1a above.\\n\\n*[Figure: Annotated query plan for the 1-hop count query.]*\\n\\n1. A Scan operator will scan the accID column and find the records of\\nnodes with accID=L1. There is only 1 tuple (199, Liz) that will be output.\\n2. This tuple will be passed to the HashJoin\'s build side, which will create a hash table from it.\\n3. At this point the processor knows exactly the IDs of the nodes whose Transfer edges need\\nto be scanned on the probe side: only the edges of the node with ID 199! This is where we \\ndo sideways information passing.\\nSpecifically, the HashJoin constructs and passes a \\"nodeID filter\\" (effectively a bitmap) \\nto the probe-side Scan operator. 
Here, I\'m assuming the database has 1M Accounts, but as you \\ncan see, only position 199 is 1 and the others are 0.\\n4. The probe-side Scan uses the filter to only scan\\nthe edges of 199 and avoids\\nscanning the entire Transfers file.\\nSince K\xf9zu is a GDBMS, we store the edges of nodes (and their properties) \\nin a graph-optimized format called [CSR](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)). \\nImportantly, all of the edges of 199 are stored consecutively and we output them in factorized format as:\\n[(199) X {201, 202, ..., 300}].\\n5. The next step can be skipped in an optimized system, but currently we probe the [(199) X {201, 202, ..., 300}]\\n tuple in the hash table and produce [(199, L1) X {201, 202, ..., 300}], which is passed to the \\n final aggregation operator.\\n6. The aggregation operator counts that there are 100 \\"flat\\" tuples in [(199, L1) X {201, 202, ..., 300}], simply\\n by inspecting the size of the 2nd vector {201, 202, ..., 300} in the tuple.\\n\\nAs you can see, the processing was factorized, we only did sequential scans,\\nand we also avoided scanning the entire Transfer database file, achieving our 3 design goals.\\nThis is a simplified example; there are many queries that are more complex and where we \\nhave more advanced modified HashJoin operators. But the simulation presents all the core techniques\\nin the system. You can read our [CIDR paper](https://www.cidrdb.org/cidr2023/papers/p48-jin.pdf) \\nif you are curious about the details!\\n\\n### Example Experiment\\nHow does it all perform? Quite well! Specifically, this type of processing is quite robust. \\nHere\'s an experiment from our CIDR paper to give a sense of the behavior of\\nusing modified hash joins and factorization on a micro-benchmark query. This is \\na 2-hop query with aggregations on every node variable. It runs on \\nan [LDBC](https://ldbcouncil.org/benchmarks/snb/)\\nsocial network benchmark (SNB) dataset at scale factor 100 (so ~100GB of database). LDBC SNB \\nmodels a social network where users post comments and react to these comments. \\n```\\nMATCH (a:Comment)<-[:Likes]-(b:Person)-[:Likes]->(c:Comment)\\nWHERE b.ID < X\\nRETURN min(a.ID), min(b.ID), min(c.ID)\\n```\\nNeedless to say, we are picking this as it is a simple query that can demonstrate\\nthe benefits of all 3 of the techniques above. Also needless to say, we could have exaggerated\\nthe benefits by picking\\nlarger stars or branched tree patterns, but this will do.\\nIn the experiment we change the selectivity of the predicate on the middle node, which\\nchanges the output size. What we compare is the behavior of K\xf9zu, which integrates\\nthe 3 techniques above, with (i) K\xf9zu-Extend, a configuration of K\xf9zu that uses factorization but instead of\\nour modified HashJoins uses an Extend-like operator;\\nand (ii) [Umbra](https://umbra-db.com/)[^4], which represents the\\nstate of the art in RDBMSs. Umbra is as fast as existing RDBMSs get. It probably integrates\\nevery known low-level performance technique in the field.\\nUmbra, however, does not \\ndo factorization or have a mechanism to avoid scanning entire database files, so we\\nexpect it to perform poorly on the above query. 
\\n\\nHere\'s the performance table:\\n\\n*[Table: Runtimes of K\xf9zu, K\xf9zu-Extend, and Umbra as the selectivity of the predicate on b increases.]*\\n\\nWhen the selectivity is very low, Extend-like operators + factorization do quite well\\nbecause they don\'t yet suffer much from non-sequential scans and they avoid several overheads\\nof our modified hash joins: no hash table creation and no semijoin filter mask creation. \\nBut they are not robust and degrade quickly. We can also see that even if you\'re Umbra, \\nwithout factorization or a mechanism to avoid scanning entire files, \\nyou will not perform very well on these queries with m-n joins (even if there are only 2 of them here). \\nWe conducted several other experiments, all demonstrating the robustness and scalability\\nof factorized processing using modified hash join operators. I won\'t cover them, but\\nthey are all in [our CIDR paper](https://www.cidrdb.org/cidr2023/papers/p48-jin.pdf).\\n\\n## Final Marks \\nI am convinced that modern GDBMSs have to be factorized systems to remain \\ncompetitive in performance. If your system assumes that most joins will be growing,\\nfactorization is one of a handful of modern techniques for such workloads \\nwhose principles are relatively well understood\\nand that one can actually implement in a system. I am sure different factorized query processors will\\nbe proposed as more people attempt it. I was happy to see at CIDR that at least 2 systems\\ngurus told me they want to integrate factorization into their systems. \\nIf someone proposes a technique that can, on some queries,\\nlead to exponential computation reductions even in pen-and-paper theory, it is a good sign\\nthat for many queries it can make the difference between a system timing out vs providing \\nan actual answer.\\n \\nFinally, there is much more to the theory of factorization, which I did not cover. From my side, \\nmost interestingly, there \\nare even more compressed ways to represent the intermediate results than the \\nvanilla Cartesian product scheme I covered in this post. Just to raise some curiosity: what I have \\nin mind is called \\n[d-representations](https://fdbresearch.github.io/principles.html), but that will have to wait \\nfor another time. For now, I invite you to check out our performance on large queries \\nand let us know if we are slow on some queries! The K\xf9zu team says hi (\ud83d\udc4b \ud83d\ude4b\u200d\u2640\ufe0f \ud83d\ude4b\ud83c\udffd) and \\nis at your service to fix all performance bugs as we continue implementing the system! \\nMy next post will be about the novel [worst-case optimal join algorithms](../2023-02-22-wcoj/index.md), which emerged\\nfrom another theoretical insight on m-n joins! Take care until then!\\n\\n[^1]: If you come from a very graph-focused background and/or have been exposed to a ton of GDBMS marketing, you might react to my statement that what I am showing are standard plans that do joins. Maybe you expected to see graph-specific operators, such as a BFS or a DFS operator, because the data is a graph. Or maybe someone even dared to tell you that GDBMSs don\'t do joins but instead do traversals. Stuff like that. These word tricks and confusing jargon really have to stop; they help no one. If joins are the nature of the computation you are asking a DBMS to do, calling them something else won\'t change the nature of the computation. Joins are joins. Every DBMS needs to join its records with each other.\\n\\n[^2]: Vectorization emerged as a design in the context of columnar RDBMSs, which are analytical systems, about 15-20 years ago. It is still a very good idea. 
The prior design, called Volcano-style tuple-at-a-time processing, was to pass a single tuple between operators; it is quite easy to implement but quite inefficient on modern CPUs. If you have access to the following link, you can read all about it from the pioneers of [columnar RDBMSs](https://www.nowpublishers.com/article/Details/DBS-024).\\n\\n[^3]: Note that GDBMSs are able to avoid scans of entire files because they do the join on internal record/node IDs, which mean something very specific. If a system needs to scan the name property of the node with record/node ID 75, it can often arithmetically compute the disk page and offset where this is stored, because record IDs are dense, i.e., they start from 0, 1, 2, ..., and so can serve as pointers if the system\'s storage design exploits this. This is what I was referring to as \\"Predefined/pointer-based joins\\" in my [previous blog post](../2023-01-12-what-every-gdbms-should-do/index.md). This is a good feature of GDBMSs that allows them to efficiently evaluate the joins of node records that happen along the \\"predefined\\" edges in the database. I don\'t know of a mechanism where RDBMSs can do something similar, unless they develop a mechanism to convert value-based joins to pointer-based joins. See my student [Guodong\'s work last year in VLDB](https://www.vldb.org/pvldb/vol15/p1011-jin.pdf) for how this can be done. In K\xf9zu, our sideways information passing technique follows Guodong\'s design in this work.\\n\\n[^4]: Umbra is being developed by [Thomas Neumann](https://www.professoren.tum.de/en/neumann-thomas) and his group. If Thomas\'s name does not ring a bell, let me explain his weight in the field like this. As the joke goes, in the field of DBMSs there are gods at the top, then there is Thomas Neumann, then other holy people, and then we mere mortals."},{"id":"what-every-gdbms-should-do-and-vision","metadata":{"permalink":"/docusaurus/blog/what-every-gdbms-should-do-and-vision","source":"@site/blog/2023-01-12-what-every-gdbms-should-do/index.md","title":"What Every Competent GDBMS Should Do (aka The Goals & Vision of K\xf9zu)","description":"As a co-implementor of the K\xf9zu GDBMS and","date":"2023-01-12T00:00:00.000Z","formattedDate":"January 12, 2023","tags":[{"label":"vision","permalink":"/docusaurus/blog/tags/vision"}],"readingTime":18.77,"hasTruncateMarker":true,"authors":[{"name":"Semih Saliho\u011flu","title":"CEO of K\xf9zu Inc & Associate Prof. 
at UWaterloo","url":"https://cs.uwaterloo.ca/~ssalihog/","imageURL":"https://kuzudb.com/img/blog/semih.jpg","key":"semih"}],"frontMatter":{"slug":"what-every-gdbms-should-do-and-vision","authors":["semih"],"tags":["vision"]},"prevItem":{"title":"Factorization & Great Ideas from Database Theory","permalink":"/docusaurus/blog/factorization"},"nextItem":{"title":"Meet K\xf9zu \ud83e\udd17","permalink":"/docusaurus/blog/meet-kuzu"}},"content":"import BachmannImage from \'./bachmann.png\';\\nimport DiamondPatternImage from \'./diamond-pattern.png\';\\nimport ExFwdJoinIndexImage from \'./ex-fwd-join-index.png\';\\nimport KuzuAsGDBMSOfGDSImage from \'./kuzu-as-gdbms-of-gds.png\';\\n\\n\\nAs a co-implementor of the K\xf9zu GDBMS and\\na professor at the University of Waterloo,\\nI have been thinking about GDBMSs day in and day out for many years now.\\nAfter years of understanding and publishing on the architectural principles \\nof graph data management ([1](http://www.vldb.org/pvldb/vol12/p1692-mhedhbi.pdf), \\n[2](https://www.vldb.org/pvldb/vol14/p2491-gupta.pdf), \\n[3](https://www.vldb.org/pvldb/vol15/p1011-jin.pdf),\\n[4](https://www.vldb.org/pvldb/vol15/p1533-chen.pdf)),\\nwe decided to develop \\n[K\xf9zu](https://github.com/kuzudb/kuzu) as a state-of-the-art modern embeddable GDBMS. \\nThis post covers my broad opinions on GDBMSs and the feature set they should\\noptimize for, and why. In doing so, it also gives an overall vision of K\xf9zu!\\n\\n\x3c!--truncate--\x3e\\n\\n:::tip Tldr: The key takeaways are:\\n- **Overview of GDBMSs**: GDBMSs are relational at their cores but offer an elegant graph model\\n for application data and SQL-like query languages with\\n graph-specific syntax. Many applications, e.g., in [fraud detection](https://tinyurl.com/3x89ceum), \\n [recommendations](https://www.tigergraph.com/solutions/recommendation-engine/),\\n [personalization](https://tinyurl.com/3z9bckmm), etc., benefit from such modeling and query language features.\\n- **Key Feature Set of GDBMSs**: Despite being relational, GDBMSs optimize (or at\\n least they should!) for a distinct set of\\n features/use cases that RDBMSs do not traditionally optimize for: (i) pre-defined/pointer-based joins;\\n (ii) growing many-to-many joins;\\n (iii) recursive joins;\\n (iv) schema querying; \\n (v) efficient storage of semi-structured data and URIs.\\n GDBMSs that want to be competitive in terms of performance\\n need to perfect this feature set, and that\'s exactly what K\xf9zu aims to do!\\n- **K\xf9zu as the GDBMS for Graph Data Science**: \\n One example application domain the K\xf9zu team is excited about is \\n being a usable, efficient, and scalable GDBMS for graph data science in the Python graph analytics ecosystem. \\n Here we are looking at how DuckDB revolutionized tabular data science and\\n want to repeat it in graph data science! \\n:::\\n\\n\\nThis week, I presented K\xf9zu to the database community at the [CIDR 2023](https://www.cidrdb.org/cidr2023/papers/p48-jin.pdf) \\nconference in Amsterdam. For those who are not familiar with academic database conferences, \\nCIDR brings together work from academia and industry to discuss recent research on \\nthe systems aspects of database technology. 
Our paper was about K\xf9zu\'s \\ngoals and vision and its core query processor design for evaluating complex growing joins.\\nWe intentionally targeted CIDR for our paper because of its systems \\nfocus and we thought many system gurus would be there: the attendees included \\ncreators of [MonetDB](https://www.monetdb.org/), [Vectorwise](https://en.wikipedia.org/wiki/Vectorwise), \\n[DuckDB](https://duckdb.org/), \\n[Snowflake](https://www.snowflake.com/en/), and [Databricks](https://www.databricks.com/), amongst others. It also meant a lot to share \\nour ambitious goal of developing a usable GDBMS from an academic setting at this CIDR, because\\nit was organized locally by CWI. The late [Martin Kersten](https://en.wikipedia.org/wiki/Martin_L._Kersten) \\nfounded the CWI database group and was a pioneer of this style of research project, and \\nhis successors are continuing this tradition very successfully today. \\nCWI has created many successful DBMSs, including MonetDB (Martin\'s legacy), Vectorwise, and \\nmost recently DuckDB. People paid their respects to Martin during an emotional memorial \\non the first night of the conference.\\nAs a surprise, [MemGraph](https://memgraph.com/) co-founder and CTO [Marko Budiseli\u0107](https://www.linkedin.com/in/markobudiselic/) \\nwas also there (it was his first CIDR)! Marko is an extremely friendly \\nand humble person you should meet, and it was great to share our insights about where GDBMSs make a difference in \\nenterprise applications.\\n\\nI want to start a 3-part series of blog posts to cover the contents of our CIDR paper in a less academic language: \\n- Post 1: K\xf9zu\'s goals and vision as a system \\n- Post 2: [Factorization technique for compression](../2023-01-20-factorization/index.md)\\n- Post 3: [Worst-case optimal join algorithms](../2023-02-22-wcoj/index.md)\\n\\nIn this Post 1, I discuss the following: \\n (i) [an overview of GDBMSs;](#overview-of-gdbms-and-a-bit-of-history)\\n (ii) [the features GDBMSs should optimize for and why;](#features-every-competent-gdbms-should-optimize-for-) and \\n (iii) [an example application domain (graph data science!) we are immediately targeting with K\xf9zu. ](#k\xf9zu-as-a-gdbms-for-graph-data-science-pipelines)\\n(ii) and (iii) should give you a good idea about the current goals and \\nvision of K\xf9zu. If you know GDBMSs well, you should skip over (i).\\n\\n## Overview of GDBMSs and a Bit of History \\nIn one sentence, GDBMSs are read-optimized analytical DBMSs for modeling and querying application \\ndata as a graph. As a consequence, they are optimized for fast querying of node and \\nrelationship records. \\nModern GDBMSs, such as Neo4j, Tigergraph, MemGraph, or K\xf9zu, \\nadopt the [property graph data model](https://neo4j.com/developer/graph-database/#property-graph)\\n(or its variants), where you can model your records as a set of labeled nodes and \\nedges/relationships, with key-value properties on these nodes and relationships. When\\nI say GDBMSs in this post, I specifically refer to the systems that adopt this\\nmodel, but I will also discuss [RDF systems](https://en.wikipedia.org/wiki/Triplestore) (aka triplestores) \\nhere and there, which are also DBMSs that adopt a graph-based model.\\n\\nHere\'s a side comment that I have to make because I\'m a professor and\\nprofessors are always ready to profess.\\nDBMSs based on graph models are anything but new. 
They have existed even before the relational\\nmodel: DBMS die-hards love remembering \\nthat the [IDS system](https://en.wikipedia.org/wiki/Integrated_Data_Store) from the 1960s was based on the \\"network model\\",\\nwhich is just another term for graph. IDS was led by the amazing \\nCharlie Bachman ([1](https://amturing.acm.org/award_winners/bachman_9385610.cfm),\\n[2](https://youtu.be/iDVsNqFEkB0), [3](https://youtu.be/jByIpJNrm50)), whose photo is shown on the left and who is credited with inventing DBMSs[^1].\\nIf you click on [this 1962 ad of the IDS system](http://wp.sigmod.org/wp-content/uploads/2012/12/image4.jpg), you will see a graph of node and \\nedge records. Note that the 1960s are pre-relational times. Ever since, every decade has seen a surge of DBMSs \\nthat adopted a graph-based model, with mixed levels of adoption success:\\nthe hierarchical model, XML, and RDF are examples.\\nIn my view, current property GDBMSs have the most generic model among these, suitable\\nfor a very broad range of application data,\\nand so they have probably established themselves most successfully. \\nThere is a very fundamental reason why graph-based DBMSs have always existed and will\\nalways exist: graphs and tables are the two most natural and generic abstract data structures \\nfor modeling application data. It\'s no surprise they were the first two proposed data models\\nwhen the field of DBMSs was born; both have existed ever since and will continue to exist.\\n\\nBack to property GDBMSs. How about their query languages? They support SQL-like high-level \\nquery languages with graph-specific syntax. \\nI call them \\"graph-specific\\" SQL. Let\'s look at a query snippet. Assume this is\\non a database that models a set of financial \\"accounts\\" and money \\"transfers\\"\\nbetween accounts:\\n\\n```\\nMATCH (a:Account)-[e:Transfer]->(b:Account)\\nWHERE a.name = \'Alice\'\\nRETURN b.ID\\n```\\nThis is a query expressed in Cypher. Instead of SELECT/FROM/WHERE, \\nyou are looking at MATCH/WHERE/RETURN. \\nIf intelligent Martians saw Cypher and SQL, their immediate reaction \\nwould not be to notice the minor syntactic differences but instead\\nthe fundamental similarities: their clauses describe joins,\\nfilters, projections, group bys and aggregates, and other relational \\noperations that process sets of tuples.\\nThere are of course syntactic differences that are important. The query languages of \\nGDBMSs adopt graph-specific syntax that is often very elegant for expressing several computations.\\nFor example, the arrow syntax ((a)-[e]->(b)) in Cypher describes joins between node records. This\\nis much more elegant than listing the names of tables that model \\nnode records in a FROM clause, with a complex WHERE clause. \\nMuch more importantly, they adopt a very elegant and direct syntax,\\nsuch as the Kleene star \\"*\\", to \\nexpress recursive queries. Expressing recursive computations with vanilla SQL is \\nobjectively harder. I\'ll come to recursive queries later.\\n\\n\\nNow get ready for a blasphemous observation: *GDBMSs are relational at their cores!*[^2] \\nWell, OK, anyone who has studied the principles of DBMSs knows there is nothing \\nblasphemous here, because GDBMSs actually have to be relational\\nbecause of this simple fact: \\n*the only known practical way to implement declarative high-level\\nquery languages is to compile them to relational operators that\\ntake in and output sets of tuples*. 
Type \\"Explain\\" to any of your\\nqueries in your favorite GDBMs (or RDF system) and look at their query plans and\\nyou will see joins, scans, filters, projections, group bys, unions,\\nintersections, etc. You might see some graph-specific operators\\nbut they will also be processing sets of tuples. That was the primary\\nobservation of [Ted Codd](https://en.wikipedia.org/wiki/Edgar_F._Codd) when he proposed\\nthat data management should be done by systems implementing\\nrelational operators that process sets of tuples. \\n\\nBut don\'t worry, I do love GDBMSs and you should too! The fact that at their cores \\nGDBMSs are relational doesn\'t mean they don\'t offer value beyond RDBMSs.\\nDBMSs are very complex software systems and they make a ton of design tradeoffs in terms of\\nwhat they optimize for. There is a very distinctive set of technical features that \\nGDBMSs should optimize for and excel in, where RDBMSs and SQL traditionally don\'t.\\nThis feature set is exactly what \\nK\xf9zu aims to perfect over time, which is what I hope to articulate in this post.\\nIn short: GDBMSs do offer a ton of value if \\nthey are architected correctly and every software engineer should know \\nabout GDBMSs[^3].\\n\\n## Features Every Competent GDBMS Should Optimize For [^4]\\nHere is a list of features that differentiate GDBMSs from RDBMSs and GDBMS should\\nhighly optimize for and support.\\n\\n### Feature 1: Pre-defined/pointer-based Joins\\nThis is perhaps the most ubiquitously adopted technique in GDBMSs that is ubiquitously missing in RDBMSs. \\nAlthough GDBMSs\\ncan join arbitrary node records with each other, most common user queries in GDBMSs\\njoin node records with their \\"neighbors\\". A GDBMS knows about these\\nneighbor node records because they are predefined to the system as relationships.\\nSo GDBMSs universally exploit this and optimize for these types of joins. For example,\\nalmost universally they all create a **join index** (aka an adjacency list index)[^5].\\nHere\'s a demonstrative example showing a \\"forward\\", i.e., from src to dst, join index:\\n\\n\\n\\n\\nNote that the join index does not store the actual data values, which\\nare strings (e.g., \\"Ali\\", \\"Noura\\", etc.) in the example. Instead, \\nit stores dense system level node record IDs.\\nAs a result, GDBMSs can be fast on these joins because they can use: (1) the join index;\\nand (2) dense integer IDs to joins (instead of, say running string equality conditions). \\n\\n### Feature 2: Many-to-many Growing Joins\\nIn many application data stored on GDBMSs, node records\\nhave many-to-many relationships with each other. Think of any data as a graph, \\nsay a network of financial transactions or who bought which items or\\nwho is friends with whom. In many of these datasets, an entity/node connects with \\nmany other nodes. In addition, many of the killer apps of GDBMSs search for complex patterns\\non these relationships. \\nA classic example we like using is a Twitter friend recommendation engine that is looking for diamond patterns to implement\\nthe following rule: If a user A follows two users B and C, who both follow D, recommend\\nD to A. This is the pattern:\\n\\n
\\n\\n
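\\n\\nTo see why such pattern searches stress a query processor, here is a hedged relational rendering of the diamond rule, assuming an illustrative `Follows(src_id, dst_id)` table with one row per follows edge:\\n\\n```sql\\n-- Diamond pattern as self-joins over Follows (illustrative schema).\\n-- Each extra edge in the pattern adds one more join, which is why\\n-- intermediate results can grow roughly as k^t.\\nSELECT f1.src_id AS a, f3.dst_id AS d\\nFROM Follows f1                           -- A follows B\\nJOIN Follows f2 ON f2.src_id = f1.src_id  -- A follows C\\nJOIN Follows f3 ON f3.src_id = f1.dst_id  -- B follows D\\nJOIN Follows f4 ON f4.src_id = f2.dst_id  -- C follows D\\n               AND f4.dst_id = f3.dst_id\\nWHERE f1.dst_id <> f2.dst_id;             -- B and C are distinct\\n```\\n\\n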
\\n\\n\\nThe whitepapers of existing GDBMSs are full of these patterns, e.g., branching trees, money laundering circles,\\ncliques of customers who buy similar items, etc. These correspond to complex\\nmany-to-many joins, which by their nature are growing. If on average each of your nodes \\nconnects with k other nodes and you have t many relationships in the pattern you are searching,\\nyou are asking a system to search through k^t many possible combinations, and guess what: exponential \\nfunctions are scary. We have been advocating the integration of 2 specific techniques\\ninto the query processors of GDBMSs for several years now: (i) factorization; and (ii) worst-case optimal joins.\\nBoth of these techniques are specifically designed for \\nmany-to-many growing joins and we have integrated them in K\xf9zu. Stay tuned for my next two posts on this. \\n\\n### Feature 3: Recursive Join Queries\\nThis is probably the most obvious feature in which GDBMSs should excel. First, objectively, \\nthe query languages of GDBMSs have much better support\\nfor recursive join queries than SQL. Consider this query on our previous financial transaction network\\nexample: \\"Give me all direct or indirect money flows into Alice\'s account from Canada.\\" Now\\nlook at this elegant way to ask this in Cypher using the Kleene star \'\\\\*\':\\n```\\nMATCH (a:Account)-[:Transfer*]->(b:Account)\\nWHERE a.country = \'Canada\' and b.name = \'Alice\'\\nRETURN a.ID\\n```\\n\\nSimilar to regexes, \'\\\\*\' represents one or more repetitions of the Transfer\\nedge in the join. So the join could be a direct join between (a) and (b), or a 2-hop one,\\nor a 3-hop one, etc. You can do this with SQL of course, but it\'s objectively harder (a hedged SQL sketch\\nappears at the end of this section). Recursion\\nwas an afterthought when standardizing SQL. It came 20 years after SQL standardization started and is really a hack. \\nIn contrast, recursion has been a first-class\\nfeature in every graph-based DBMS\'s query language.\\nThis distinction is even more visible\\nif you want to do other graph-specific recursive computations, such as finding shortest paths.\\nIn K\xf9zu, we are starting to work on implementing \\nand optimizing recursive query support, and we hope to have first a basic version and \\nthen optimized versions that work very well and contribute to the principles of how these\\nqueries should be evaluated.\\n\\n### Feature 4: Schema Querying \\nAnother important feature of GDBMSs, one that RDBMSs lack, is that their\\nquery languages allow querying the schema of a database in addition\\nto the data in the database. Suppose that in a modified financial transaction network \\nthere are three relationship types: Wire, Deposit, and ETransfer, and \\nyou want to search for a path where the types of the first edge and the second edge\\nare different. Note that the predicate is *on the schema*, specifically on the types \\nof the relationships. You can write the following query:\\n```\\nMATCH (a:Account)-[e1]->(b:Account)-[e2]->(c:Account)\\nWHERE type(e1) != type(e2)\\nRETURN *\\n```\\n\\nSomething akin to this cannot directly be done in SQL. One would have to write a query\\nthat unions many sub-queries: one that joins node records over Wire and then Deposit,\\nanother on Wire and ETransfer, another on Deposit and then Wire, etc. This will be \\nmessy, as the following sketch suggests. 
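\\n\\nHere is a hedged sketch of just two of the needed branches, assuming illustrative `Wire(src_id, dst_id)`, `Deposit(src_id, dst_id)`, and `ETransfer(src_id, dst_id)` tables; the full query needs one branch per ordered pair of distinct relationship types:\\n\\n```sql\\n-- Two of the six branches needed to emulate type(e1) != type(e2).\\nSELECT w.src_id AS a, d.dst_id AS c\\nFROM Wire w JOIN Deposit d ON d.src_id = w.dst_id    -- Wire then Deposit\\nUNION ALL\\nSELECT w.src_id AS a, e.dst_id AS c\\nFROM Wire w JOIN ETransfer e ON e.src_id = w.dst_id  -- Wire then ETransfer\\n-- ... plus 4 more branches for the remaining type combinations.\\n```\\n\\n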
The ability to *not* specify a label on relationships, \\nspecifically on e1 and e2, is an\\nelegant way to effectively express such unions of join queries.\\nIt says: \\"join a and b nodes over every possible relationship\\".\\nThe `type()` function on these variables allows querying over the schema.\\n\\n### Feature 5: Semi-structured Data and URI-heavy Datasets (e.g., \\"Knowledge Graphs\\")\\nAn important application domain of GDBMSs \\nis \\"knowledge graphs\\". This term means different things \\nin different contexts and I\'ll take it\\nto refer to highly heterogeneous datasets that are\\noften naturally modeled as RDF triples. Again, I don\'t want to go into the \\ndetails of this model but I assume many readers will already be familiar with\\nRDF. RDF is a simple data model where data is represented as (subject, predicate, object)\\ntriples that represent facts about a domain. A great application is modeling and\\nquerying encyclopedic facts, such as those extracted from Wikipedia data.\\nFor example, the following triple stores the fact\\nthat Justin Trudeau is married to Sophie Trudeau:\\n(http://dbpedia.org/resource/Justin_Trudeau, http://dbpedia.org/ontology/spouse, \\nhttp://dbpedia.org/resource/Sophie_Gr\xe9goire_Trudeau). \\nThere are 2 immediate challenges for a DBMS to manage \\nsuch data: \\n1. Structuring such datasets is very difficult. Structuring here\\nrefers to designing a relational schema for the data.\\nEntities can have many types, e.g., Justin Trudeau has \\"rdf:type\\" \\nhttp://dbpedia.org/ontology/Person as well as\\nhttp://dbpedia.org/ontology/Politician. Further, within a single type, entities can have many different\\nand distinct properties, so good luck coming up with and maintaining a relational \\nschema for all that. \\nThis is a direct result of\\nthe overly ambitious domain the dataset is modeling: all encyclopedic human knowledge!\\nYou need a data model that allows flexibility in what can be associated with entities\\nand their types[^6].\\n\\n2. Those long strings used to identify entities, e.g., Justin\\nTrudeau, are called URIs (uniform resource identifiers),\\nand queries will frequently access and specify them. So systems should\\nbe competent in handling those.\\n\\nGDBMSs tend to support semi-structured schemas, and RDF systems certainly\\nhave good techniques to handle URIs. \\nThese applications are directly in the realm of graph-based DBMSs.\\nCurrently, they are directly targeted by RDF systems, but I\'m convinced \\nGDBMSs should also implement techniques to efficiently support them[^7]. \\n\\n**Final note on the above feature set:** I referred to several classic applications, but \\nmany other applications require and benefit\\nfrom the above feature set. One can\\nthink of the datasets and workloads of these applications as the \\"beyond relational/SQL\\" datasets/workloads, which\\noften require modeling and querying in a graph-based DBMS, and\\nwe want K\xf9zu to excel in and represent the state-of-the-art in this feature set! 
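\\n\\nAs promised in the discussion of recursive join queries above, here is a hedged sketch of what the earlier Kleene-star Cypher query could look like in vanilla SQL, assuming illustrative `Account(id, name, country)` and `Transfer(src_id, dst_id)` tables:\\n\\n```sql\\n-- Direct or indirect money flows into Alice\'s account, via a recursive CTE.\\nWITH RECURSIVE reachable(src_id, dst_id) AS (\\n    SELECT src_id, dst_id FROM Transfer  -- 1-hop transfers\\n    UNION                                -- dedup; also guards against cycles\\n    SELECT r.src_id, t.dst_id            -- extend each path by one hop\\n    FROM reachable r JOIN Transfer t ON t.src_id = r.dst_id\\n)\\nSELECT a.id\\nFROM Account a\\nJOIN reachable r ON r.src_id = a.id\\nJOIN Account b ON b.id = r.dst_id\\nWHERE a.country = \'Canada\' AND b.name = \'Alice\';\\n```\\n\\nContrast all of this bookkeeping with the single `[:Transfer*]` hop in the Cypher version.\\n\\n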
\\n\\n## K\xf9zu as a GDBMS for Graph Data Science Pipelines\\n\\nFinally, let me tell you a little bit about \\na particular application domain we are currently excited\\nabout and in which we want to see K\xf9zu used: graph data science in the Python ecosystem!\\nThis figure from my CIDR slides describes this vision pictorially:\\n\\n![K\xf9zu as a GDBMS for Graph Data Science Pipelines](./kuzu-as-gdbms-of-gds.png)\\n\\nSuppose you are building a graph analytics, machine learning, or visualization\\npipeline from raw record files on disk. You will want to model your raw records \\nas nodes and edges, clean them, extract features, query them, transform them, \\nand then extract data to an upstream Python library, such as PyTorch Geometric, DGL, \\nNetworkX, or a graph visualization library. \\nYou might even want a pipeline\\nthat extracts regular tables from your graphs to a tabular data science library, \\nsuch as NumPy,\\nsince the outputs of queries in Cypher are tables of records.\\nWe want people to use K\xf9zu as an embeddable library in their Python scripts, \\nto do their modeling, querying, feature extraction, \\ncleaning, and other transformations, all by benefiting from a high-level query language \\nand the state-of-the-art graph data management techniques\\nthat we are implementing. This is exactly what DuckDB did for tabular data science/analytics.\\nWe look up to DuckDB here and want to fill the same gap for graph data science/analytics!\\nWe are currently working to understand the ecosystem better and appreciate feedback\\nand suggestions for features we should implement to enable your workloads.\\n\\nOK, this is it for now. In the next two blog posts, I will discuss \\nfactorization and worst-case optimal join algorithms and describe \\nsome of the principles that we adopted in K\xf9zu\'s query processor.\\nUntil then, happy new year from the cold but cozy winter of \ud83c\udde8\ud83c\udde6 \\nand [pip install kuzu](https://github.com/kuzudb/kuzu)!\\n\\n[^1]: Interestingly, Bachmann is one of a handful of Turing laureates without any academic career. If you love DBMSs, [listen to this talk](https://youtu.be/iDVsNqFEkB0) where he remembers his IDS days! Amusingly, he also talks about how he didn\'t know who Turing was when he got the Turing Award and how he met Turing\'s mother in England for tea \ud83d\ude00.\\n\\n[^2]: When I say GDBMSs here, I\'m referring to the core engines that implement the high-level languages of these systems and not the analytics libraries (e.g., [1](https://neo4j.com/product/graph-data-science/), [2](https://memgraph.com/mage)) built on top of these core engines that run iterative graph analytics computations, such as finding connected components, PageRank, or betweenness centrality. These computations are better understood through either direct graph formalisms or linear algebra (and not relational) operations.\\n\\n[^3]: I am a strong supporter of devoting a few lectures to GDBMSs after covering the fundamental topics on the relational model and RDBMSs in core introductory DBMS courses in undergraduate curricula. Students should broaden their perspectives on the data models and query/programming languages available to them when they develop applications. GDBMSs are an obvious choice here. So are Datalog and RDF/SPARQL.\\n\\n[^4]: We articulated this list of features in our CIDR 2023 paper. 
Incidentally, [a paper](https://www.cidrdb.org/cidr2023/papers/p66-wolde.pdf) written by CWI on a graph query extension to DuckDB had a 12-item list of \\"techniques\\" that GDBMSs should implement at their cores. Let me call this the CWI list. These items are not features in the sense I\'m using the word, so I call them techniques. As you\'ll see, my features are higher-level system properties from the user\'s perspective. Peter Boncz, who is renowned in the field for having written or advised many successful DBMSs that were spun off, presented the CWI paper. I highly recommend it as another reading if you want to know more about Peter and his co-authors\' technical insights about how GDBMSs should be architected. Importantly, K\xf9zu has integrated or is in the process of integrating 11 of the 12 techniques in the CWI list (bulk path finding is the one we have to do more thinking on), and our prior publications had also articulated many of these insights, such as the fact that [GDBMSs should be columnar systems](https://www.vldb.org/pvldb/vol14/p2491-gupta.pdf) doing vectorized querying, and of course we did a ton of work on [worst-case optimal joins](https://www.vldb.org/pvldb/vol12/p1692-mhedhbi.pdf) and [factorization](https://www.cidrdb.org/cidr2023/papers/p48-jin.pdf), which are also in the CWI list. I should acknowledge that Peter had been advocating for some of the techniques on the CWI list at least since 2018. I remember a presentation he gave in 2018 to GDBMS researchers and developers titled \\"Why are Existing GDBMSs Incompetent?\\", which listed some of the techniques in the CWI list and has visibly inspired the title of this blog.\\n\\n[^5]: Although some refer to these as an \\"adjacency list index\\" because that\'s a common term in graph terminology, I need to pay my respects to the giants in the field: these are plain old [1980s Valduriez join indices](https://dl.acm.org/doi/abs/10.1145/22952.22955). And no, they are not a graph-native invention: they were invented in the context of RDBMSs. That said, they never found much adoption in RDBMSs. But they are almost universally adopted in GDBMSs.\\n\\n[^6]: Designing the schema, i.e., defining the types of entities and relationships and the class structures and constraints of such complex domains, can be decades of work. What I\'m referring to as schema is called an \\"ontology\\" in the knowledge graph/semantic web space. If you ever thought you modeled a hard application domain, take a look at [SNOMED](https://en.wikipedia.org/wiki/SNOMED_CT), which is a decades-long effort to model and standardize human medical knowledge. Last term, I had a seminar on SNOMED in my graduate course on knowledge graphs and students were baffled by the complexity of this \\"ontology\\", which describes the types of entities and their relationships and constraints; modeling domains at this level is something the RDF technology stack is quite good at.\\n\\n[^7]: Before we released K\xf9zu, we had support for adding arbitrary node/edge properties, but we removed a large chunk of code from the system to release a thinner code base. So currently you need to specify a schema for your nodes and relationships in K\xf9zu. We will wait and see if/when that demand comes and how strongly it comes. We know from our conversations with many users and developers of GDBMSs over the years that most datasets in enterprises are not this complex and can be structured. 
At least after a proof-of-concept phase of their applications, developers structure their data."},{"id":"meet-kuzu","metadata":{"permalink":"/docusaurus/blog/meet-kuzu","source":"@site/blog/2022-11-15-meet-kuzu.md","title":"Meet K\xf9zu \ud83e\udd17","description":"Today we are very excited to make an initial version of K\xf9zu public on github!","date":"2022-11-15T00:00:00.000Z","formattedDate":"November 15, 2022","tags":[{"label":"release","permalink":"/docusaurus/blog/tags/release"}],"readingTime":2.11,"hasTruncateMarker":true,"authors":[{"name":"K\xf9zu Team","url":"https://github.com/kuzudb/","imageURL":"https://kuzudb.com/img/blog/team.jpg","key":"team"}],"frontMatter":{"slug":"meet-kuzu","authors":["team"],"tags":["release"]},"prevItem":{"title":"What Every Competent GDBMS Should Do (aka The Goals & Vision of K\xf9zu)","permalink":"/docusaurus/blog/what-every-gdbms-should-do-and-vision"}},"content":"Today we are very excited to make an initial version of [K\xf9zu public on github](https://github.com/kuzudb/kuzu)! \\nK\xf9zu is a new embeddable property graph database management system (GDBMS) that is \\ndesigned for high scalability and very fast querying. We are releasing \\nK\xf9zu today under a permissive MIT license. Through years of research on GDBMSs, we observed a lack of\\nhighly efficient GDBMSs in the market that adopt state-of-the-art \\nquerying and storage techniques and that can very easily integrate into applications, \\nsimilar to DuckDB or SQLite. K\xf9zu aims to fill this space and evolve into the \\ngo-to open-source system for developing\\ngraph database applications, e.g., to manage and query your knowledge graphs, \\nand for developing graph machine learning and analytics pipelines, \\ne.g., in the Python data science ecosystem. \\n\\n\x3c!--truncate--\x3e\\n\\nK\xf9zu\'s core architecture is informed by 6 years of research we conducted \\nat the University of Waterloo on an earlier prototype GDBMS called [GraphflowDB](http://graphflow.io/). \\nUnlike GraphflowDB, which was intended to be a prototype for our research, K\xf9zu aims to be\\na usable, feature-rich system. Some of the primary features of K\xf9zu\'s architecture are:\\n - Flexible Property Graph Data Model and Cypher query language\\n - Embeddable, serverless integration into applications\\n - Columnar disk-based storage\\n - Compressed sparse row (CSR) based adjacency list/join indices\\n - Vectorized and factorized query processor\\n - Novel and very fast join algorithms\\n - Multi-core query parallelism\\n - Serializable ACID transactions\\n\\nWhat we are releasing today includes many of the features of the core engine. This is what we\\ncalled the \\"Phase 1\\" of the project. In the next \\"Phase 2\\" of the project, as we continue adding \\nmore features to the core engine, e.g., better support for ad-hoc properties, string compression,\\nand support for new recursive queries, we will also focus on developing around the core engine\\nto more easily ingest data into the system and output data to downstream data science/graph data science\\nlibraries. You can keep an eye on our tentative [roadmap here](https://github.com/kuzudb/kuzu/issues/981). \\nYou can also read more about some of our longer-term goals and vision as a system\\nin [our new CIDR 2023 paper](https://cs.uwaterloo.ca/~ssalihog/papers/kuzu-tr.pdf), \\nwhich we will present in Amsterdam next January. \\n\\n*And most importantly, please start using K\xf9zu, tell us your feature requests and use cases, and report bugs. 
We can evolve into a\\nmore stable, usable, and feature-rich system only through your feedback!* \\n\\nWe are looking forward to your feedback and a long and exciting journey as we continue developing K\xf9zu \ud83e\udd17. \\n\\n*ps: For interested readers: the word k\xf9-zu is the Sumerian (the oldest known human language) word for \\"wisdom\\".*"}]}')}}]); \ No newline at end of file diff --git a/docusaurus/assets/js/e506623f.9ae97683.js b/docusaurus/assets/js/e506623f.9ae97683.js new file mode 100644 index 000000000..279c0ce8a --- /dev/null +++ b/docusaurus/assets/js/e506623f.9ae97683.js @@ -0,0 +1 @@ +"use strict";(self.webpackChunkkuzu_docs=self.webpackChunkkuzu_docs||[]).push([[4922],{9041:e=>{e.exports=JSON.parse('{"blogPosts":[{"id":"llms-graphs-part-1","metadata":{"permalink":"/docusaurus/blog/llms-graphs-part-1","source":"@site/blog/2024-01-04-llms-graphs-part-1/index.md","title":"RAG Using Structured Data: Overview & Important Questions","description":"During the holiday season, I did some reading on","date":"2024-01-04T00:00:00.000Z","formattedDate":"January 4, 2024","tags":[{"label":"use-case","permalink":"/docusaurus/blog/tags/use-case"}],"readingTime":24.84,"hasTruncateMarker":false,"authors":[{"name":"Semih Saliho\u011flu","title":"CEO of K\xf9zu Inc & Associate Prof. at UWaterloo","url":"https://cs.uwaterloo.ca/~ssalihog/","imageURL":"https://kuzudb.com/img/blog/semih.jpg","key":"semih"}],"frontMatter":{"slug":"llms-graphs-part-1","authors":["semih"],"tags":["use-case"]},"nextItem":{"title":"K\xf9zu 0.1.0 Release","permalink":"/docusaurus/blog/kuzu-0.1.0-release"}},"content":"import QAOverEnterpriseData from \'./qa-over-enterprise-data.png\';\\nimport RAGUsingStructuredData from \'./rag-using-structured-data.png\';\\nimport TwoSQLGenerationApproaches from \'./two-sql-generation-approaches.png\';\\n\\n\\n\\nDuring the holiday season, I did some reading on\\nLLMs and specifically on the techniques that use LLMs together with graph databases and knowledge graphs.\\nIf you are new to the area like me, the amount of activity on this topic on social\\nmedia as well as in research publications may have intimidated you. \\nIf so, you\'re exactly my target audience for this new blog post series I am starting.\\nMy goals are two-fold: \\n1. *Overview the area*: I want to present what I learned with a simple and consistent terminology and at\\na more technical depth than you might find in other blog posts. I am aiming for a depth similar to what I aim for when preparing\\na lecture. I will link to many quality and technically satisfying pieces of content (mainly papers since the area is very researchy).\\n2. *Overview important future work*: I want to cover several important future works in the space. I don\'t\\njust mean research contributions but also simple approaches to experiment with if you are\\nbuilding question answering (Q&A) applications using LLMs and graph technology.\\n\\n\\nThis post covers the topic of retrieval augmented generation (RAG) using structured data. Then, in a follow-up post, \\nI will cover RAG using unstructured data, where\\nI will also mention a few ways people are building RAG-based Q&A\xa0systems that use both structured and unstructured data.\\n\\n:::tip TL;DR: The key takeaways from this post are:\\n- **RAG overview**: RAG is a technique to fill the knowledge gap of LLMs using private data. RAG systems\\n use private structured records stored in a database and/or unstructured data in text files. 
\\n- **Impressive simplicity and effectiveness of developing a natural language interface over your database using LLMs**: In the pre-LLM era, the amount of engineering effort\\n to develop a pipeline that delivered a natural language interface over your database was *immense*. The\\n hard problem was to teach a model to *speak* SQL, Cypher, or SPARQL.\\n This contrasts sharply with the simplicity of developing similar pipelines now because LLMs already \\"speak\\" these languages. \\n The hard task now is for *developers to learn how to prompt LLMs* to get correct database queries. Furthermore, there is\\n evidence that LLMs, if prompted correctly, will generate a decent proportion of queries with impressive accuracy. \\n- **Lack of work that studies LLMs\' ability to generate Cypher or SPARQL:** Most technically-deep work on understanding\\n LLMs\' ability to generate accurate queries in high-level query languages is on SQL. We need more\\n work understanding the behavior of LLMs on the query languages of GDBMSs (like Cypher or SPARQL), specifically on recursive and union-of-join queries.\\n- **Studying the effects of data modeling (normalization, views, graph modeling) on the accuracy of LLM-generated queries is important:**\\n Many people are studying heuristics for prompting LLMs to increase their accuracy, focusing on the syntax and structure of\\n the schema provided in the prompt and on the selection of examples in the prompt. An important and under-studied\\n problem is the effect of data modeling choices on the accuracy of the queries generated by LLMs. I point to [one interesting paper](https://arxiv.org/pdf/2311.07509.pdf) in this space and raise several questions related to\\n normalization, the use of views in relational modeling, and comparisons with graph modeling approaches. \\n:::\\n\\n## Killer App: Retrieval Augmented Generation\\n\\nLet\'s review the killer application of LLMs in enterprises.\\nThe application is ultimately Q&A over private enterprise data. Think of a chatbot to which you \\ncan ask natural language questions ($Q_{NL}$), such as: \\"Who is our top paying customer from Waterloo?\\",\\nor \\"What are data privacy regulations in Canada we need to comply with?\\"\\nand get back natural language answers ($A_{NL}$).\\nLLMs, out of the box, cannot answer these questions because they have a *knowledge gap*.\\nFor example, LLMs never had any access to your sales records when they were trained. \\nTherefore, they need to retrieve or be provided with \\nextra information from private data sources of the enterprise.\\n\\n### A note on the term RAG\\nThere seems to be tremendous interest in building systems that combine a traditional \\ninformation retrieval component, e.g., one that looks up some documents from\\nan index, with a natural language generator component, such as an LLM. The term for such systems is \\n*Retrieval Augmented Generation* (RAG).\\nThe term was coined in [this paper](https://arxiv.org/pdf/2005.11401.pdf) to refer\\nto the method of fine-tuning an LLM with additional information, i.e.,\\nusing this additional data to train a new variant of the LLM. \\nThe original usage form in the paper is \\"RAG models\\". Nowadays it is used in a variety of ways, \\nsuch as \\"RAG system\\", \\"RAG-based system\\", \\"RAG does X\\", or \\n\\"Building RAG with Y\\". RAG often does not refer to fine-tuning LLMs anymore. 
Instead, it \\nrefers to providing LLMs with private data along with the question to fix the knowledge gap.\\nEven systems that simply use an LLM to convert a \\n$Q_{NL}$ to a SQL or Cypher query and return the results of the query\\nare called \\"RAG systems\\" in some documentation. I will use the term in this broader sense.\\n\\nYou can build RAG-based Q&A systems by using structured and/or unstructured\\ndata. The high-level views of these systems look like this:\\n\\n
\\n\\n
\\n\\n## RAG Using Structured Data: Text-to-High-level-Query\\n*Note: If you are familiar with how to develop RAG systems with LangChain and LlamaIndex, you can directly skip\\nto the \\"[How Good are LLMs in Generating High-level Queries](#how-good-are-llms-in-generating-high-level-queries)\\" part that \\nreflects on the reading I did on RAG using structured data.*\\n\\n### Overview\\nMany blog posts and several papers concern Q&A systems that simply convert\\n$Q_{NL}$ to a high-level query language, such as SQL, Cypher, or SPARQL, using an LLM.\\nThe figure below describes the overall approach:\\n\\n
\\n\\n
\\n\\n$Q_{NL}$, the schema of a database, and optionally\\nsome example (natural language question, high-level query) pairs, are given\\nto the LLM as a prompt. \\nThe terms \\"no shot\\", \\"one shot\\", or \\"few shot\\" refer to the number of examples provided\\nin the prompt. Depending on the underlying database, the schema may contain\\ncolumns of relational tables and their descriptions, or labels of nodes and edges\\nof a graph database. Using $Q_{NL}$, the database schema, and optionally\\nsome examples, the LLM generates \\na database query, such as SQL or Cypher. The system runs this query against the\\nDBMS and returns the query result or, using the LLM again, converts \\nthe query result into a natural language answer $A_{NL}$. \\n\\n**Let us pause here to appreciate one thing:** For many decades, the database community has studied the problem\\nof converting $Q_{NL}$ to SQL (aka \\"text-to-SQL\\"). Here is a good recent [survey paper](https://link.springer.com/article/10.1007/s00778-022-00776-8)\\nthat covers only the deep network-based approaches and [a more extensive survey/book](https://www.nowpublishers.com/article/Details/DBS-078)\\non the broader topic of natural language interfaces to databases.\\nNeither of these surveys covers any work that directly uses LLMs such as GPT models, \\nwhich are quite recent developments. Take any of the work covered in these surveys and \\nyou\'ll find an approach that requires significant engineering to build the pipeline shown in the above figure. \\nThere exist several pre-LLM text-to-SQL systems (e.g., [ATHENA](https://www.vldb.org/pvldb/vol9/p1209-saha.pdf)\\nor [BELA](https://download.hrz.tu-darmstadt.de/pub/FB20/Dekanat/Publikationen/UKP/76500354.pdf)). \\nFor example, most of the pre-LLM approaches that use deep learning require\\nhard work *to teach a model how to \\"speak\\" SQL* using large \\ncorpora of tables and (question, query) examples, such as [WikiSQL](https://arxiv.org/abs/1709.00103) or [Spider](https://github.com/taoyds/spider).\\nPeople had to solve and glue together solutions to many technical problems, such as parsing the question,\\nentity detection, synonym finding, string similarity, among others. \\nPost-LLM approaches require *none* of these efforts because LLMs, such as GPT-4, already speak SQL, Cypher, and SPARQL out of the box, having been exposed to them in their pretraining. \\nNowadays, the hard problem is for developers *to learn how to prompt LLMs* so that \\nLLMs generate correct queries. I\'ll say more about this problem below. In contrast, building the above pipeline requires much less effort, as\\nI\'ll show next.\\n\\n### Simplicity of Developing RAG Systems: LangChain and LlamaIndex\\nIf you have been following the developments in the LLM space, you will not be surprised to hear that nowadays people build \\nQ&A systems that convert $Q_{NL}$ to a high-level query language using two common tools:\\n(i) [LangChain](https://www.langchain.com/); and (ii) [LlamaIndex](https://www.llamaindex.ai/).\\nThe same tools also integrate with the underlying storage system to load and retrieve your data. To make this more concrete, let me review the [K\xf9zu-LangChain integration](https://python.langchain.com/docs/use_cases/graph/graph_kuzu_qa), similar to the integrations found in other GDBMSs. 
You as a programmer have very little to do: you prepare your K\xf9zu\\ndatabase `db` and load your data into it, wrap it in `KuzuGraph` and `KuzuQAChain` objects in Python, and you have\\na text-to-Cypher pipeline:\\n\\n```python\\nimport kuzu\\nfrom langchain.chains import KuzuQAChain\\nfrom langchain_community.chat_models import ChatOpenAI\\nfrom langchain_community.graphs import KuzuGraph\\n\\ndb = kuzu.Database(\\"test_db\\")\\n# ... create your graph and load your data if needed\\ngraph = KuzuGraph(db)\\nchain = KuzuQAChain.from_llm(ChatOpenAI(temperature=0), graph=graph, verbose=True)\\nchain.run(\\"Who played in The Godfather: Part II?\\")\\n```\\nI am following the example application in this [documentation](https://python.langchain.com/docs/use_cases/graph/graph_kuzu_qa), \\nwhich uses a database of movies, actors, and directors. \\n\\n```bash\\nOutput:\\n> Entering new chain...\\nGenerated Cypher:\\nMATCH (p:Person)-[:ActedIn]->(m:Movie {name: \'The Godfather: Part II\'}) RETURN p.name\\nFull Context:\\n[{\'p.name\': \'Al Pacino\'}, {\'p.name\': \'Robert De Niro\'}]\\n\\n> Finished chain.\\n\\n\'Al Pacino and Robert De Niro both played in The Godfather: Part II.\'\\n```\\nThe \\"chain\\" first generated a Cypher query using $Q_{NL}$. \\nBehind the curtain, i.e., inside the KuzuQAChain code, \\na GPT model was given the following prompt:\\n\\n```bash\\nGenerate Cypher statement to query a graph database.\\nInstructions:\\nUse only the provided relationship types and properties in the schema.\\nDo not use any other relationship types or properties that are not provided.\\n\\nSchema:\\nNode properties: [{\'properties\': [(\'name\', \'STRING\')], \'label\': \'Movie\'}, {\'properties\': [(\'name\', \'STRING\'), (\'birthDate\', \'STRING\')], \'label\': \'Person\'}]\\nRelationships properties: [{\'properties\': [], \'label\': \'ActedIn\'}]\\nRelationships: [\'(:Person)-[:ActedIn]->(:Movie)\']\\n\\nNote: Do not include any explanations or apologies in your responses.\\nDo not respond to any questions that might ask anything else than for you to construct a Cypher statement.\\nDo not include any text except the generated Cypher statement.\\n\\nThe question is:\\nWho played in The Godfather: Part II?\\n```\\n\\nIndeed, if you copy this prompt and paste it in [chatGPT\'s browser interface](https://chat.openai.com/), \\nyou will get the same or a very similar Cypher query. The important point is: that\'s all\\nthe coding you have to do to build a natural language interface that can query your database. \\nYou ultimately construct a string prompt that contains $Q_{NL}$, some\\ninstructions, and the schema of the database, and the LLM will generate a query for you. \\nThe `KuzuGraph` and `KuzuQAChain` classes are simple wrappers to do just that.\\nIf you want to play around with how well this works on other datasets,\\nwe have this pipeline implemented in K\xf9zu\'s browser frontend [K\xf9zuExplorer](https://kuzudb.com/docusaurus/kuzuexplorer/). \\n\\nThat is, for any database you have in K\xf9zu, you get a natural language interface over it in\\nK\xf9zuExplorer (just click the \\"robot icon\\" on the left panel). 
\\nYou can develop similar pipelines with other GDBMSs using similar interfaces (*though I recommend using K\xf9zu as it will be the\\nsimplest to get started* \ud83d\ude09: *Unlike other GDBMSs, K\xf9zu is embeddable and requires no server setup*).\\nIf you instead want to build Q&A systems over your RDBMSs, you can use\\nLangChain\'s [SQLDatabaseChain](https://python.langchain.com/docs/use_cases/qa_structured/sql#case-2-text-to-sql-query-and-execution) and \\n[SQLAgent](https://python.langchain.com/docs/use_cases/qa_structured/sql#case-3-sql-agents) or\\nLlamaIndex\'s [NLSQLTableQueryEngine](https://docs.llamaindex.ai/en/stable/examples/index_structs/struct_indices/SQLIndexDemo.html#part-1-text-to-sql-query-engine). The level of simplicity is similar to the example I presented. In practice, it is unlikely that your chatbot or search engine will be as simple\\nas the above example, where the application interacts with the LLM only once. If you want\\nto interact with the LLM multiple times and conditionally take one action over another, etc.,\\nLangChain and LlamaIndex also provide ways to do that through their \\"Agents\\" (see [LangChain Agents](https://python.langchain.com/docs/modules/agents/) and [Llama Index Agents](https://docs.llamaindex.ai/en/stable/use_cases/agents.html)).\\n\\n\\n### How Good Are LLMs in Generating High-Level Queries?\\nAlthough building a text-to-high-level-query-language pipeline is now very simple with LLMs,\\nsimplicity **does not** mean quality. Indeed, people building these systems are now faced with the following two important questions: \\n\\n1. *How accurate are the high-level queries that LLMs generate?*\\n2. *How, e.g., through what types of prompts or data modeling, can we increase the accuracy of the\\nqueries generated by LLMs?*\\n\\nHere are several papers on this that I suggest reading:\\n1. *[A comprehensive evaluation of ChatGPT\u2019s zero-shot Text-to-SQL capability](https://arxiv.org/pdf/2303.13547.pdf)* from Tsinghua University and University of Illinois at Chicago. \\n2. *[Evaluating the Text-to-SQL Capabilities of Large Language Models](https://arxiv.org/pdf/2204.00498.pdf)* from researchers at Cambridge and at universities and institutes in Montr\xe9al.\\n3. *[Text-to-SQL Empowered by Large Language Models: A Benchmark Evaluation](https://arxiv.org/pdf/2308.15363.pdf)* from Alibaba Group.\\n4. *[Enhancing Few-shot Text-to-SQL Capabilities of Large Language Models: A Study on Prompt Design Strategies](https://arxiv.org/pdf/2305.12586.pdf)* from Yale, Columbia, and Allen Institute for AI.\\n5. *[How to Prompt LLMs for Text-to-SQL: A Study in Zero-shot, Single-domain, and Cross-domain Settings](https://arxiv.org/pdf/2305.11853.pdf)* from Ohio State.\\n6. *[A Benchmark to Understand the Role of Knowledge Graphs on LLM\'s Accuracy for Q&A on Enterprise SQL Databases](https://arxiv.org/pdf/2311.07509.pdf)* from data.world.\\n\\nThese papers are either entirely or *almost* entirely evaluation-only papers that experiment with very detailed approaches to prompting LLMs\\nto generate SQL queries. First, let me say that the general message these\\npapers give (maybe except the last one) is that LLMs are pretty good. With the right prompting (or even with basic prompting)\\nthey do very well on these benchmarks. I see accuracy rates over 85% on the Spider benchmark in several papers. These are clearly\\nbetter numbers than what pre-LLM state-of-the-art systems achieved. 
This should be impressive to many.\\n\\nSecond, the set of techniques is too detailed to cover here, but some example heuristics \\nthese papers experiment with include the following: (i) the syntax used for providing the schema \\n(apparently putting \\"the pound sign `#` to differentiate prompt from response in examples yields impressive performance gains\\" \ud83d\ude00 go figure); (ii)\\nthe number and selection of example (question, SQL) pairs, e.g., apparently there is a sweet spot in the number\\nof examples to provide; or (iii) the effects of standardizing the text in the prompt, e.g., indenting and using all lower case letters consistently\\n(apparently this has a minor but noticeable effect). Yes, as interesting and important as it is to learn how to use LLMs better, I still \\ncan\'t escape the following thought before going to bed: somewhere out there, some advisor might be torturing some graduate student\\nto check if the magical box produces better SQL with a pound sign vs double slashes!\\n\\nMost work I found is on generating SQL.\\nIn contrast, I found no papers that do a similar prompting study for the query languages\\nof GDBMSs, though I ran into two papers that provide benchmarks for query languages of GDBMSs: \\n(i) [SPARQL](https://arxiv.org/abs/2309.16248); and (ii) [Cypher](https://dl.acm.org/doi/pdf/10.1145/3511808.3557703).\\nSo a low-hanging-fruit future work is the following:\\n\\n*Important Future Work 1: Similar prompting studies for query languages of graph DBMSs with a focus on recursive and union-of-join queries.* \\nIn contrast to SQL queries, here one should study the various recursive queries that the query languages of GDBMSs are particularly good\\nat, and union-of-join queries, which are asked by omitting labels in the query languages of GDBMSs. \\nFor example, if you want to ask for all connections between\\nyour `User` nodes, and `User` nodes can have many relationship types, such as `Follows`, `SentMoneyTo`, or `SameFamily`,\\nyou would have to write 3 separate join queries in SQL and union them. Instead, you can write this query\\nwith a very simple syntax in Cypher as \\n`MATCH (a:User)-[e]->(b:User)`, where the omission of the label on the relationship `e` indicates searching over\\nall possible joins.[^1] \\n\\n[^1]: SPARQL syntax is different but a similar advantage exists by omitting type constraints.\\n\\nAs a side note: In the context of any query language, including SQL, questions that require sub-queries are of particular \\ninterest as they are generally harder to write. Some of the papers I read had sections analyzing the performance of\\nLLMs on nested queries but the focus was not on these. In the prior literature there are papers written solely on text-to-SQL generation for\\nnested queries (e.g., see [the ATHENA++ paper](https://www.vldb.org/pvldb/vol13/p2747-sen.pdf)). I am certain someone\\nsomewhere is already focusing solely on nested queries and that\'s a good idea.\\n\\n## data.world Paper and Some Interesting Questions\\nIn the remainder of the post I want to review [the benchmark paper](https://arxiv.org/pdf/2311.07509.pdf) from `data.world` that focuses on text-to-SQL using LLMs. Unlike other papers out there that \\nstudy the effects of different prompting heuristics, this paper studies the *effects of data modeling \\non the accuracy of SQL queries generated by LLMs*, which is closely related to GDBMSs. 
\\n\\nSpecifically, this paper is an evaluation of the performance of GPT-4 in generating SQL using no examples, i.e., zero-shot,\\nwith basic prompting over a standardized insurance database schema \\ncalled the [OMG Property and Casualty Data Model](https://www.omg.org/spec/PC/1.0/About-PC). \\nSee Figure 1 in the paper (omitted here) for the conceptual schema, which consists of classes such as \\nPolicy, Account, Claims, Insurable Object, among others, and their relationships.\\nThe paper has a benchmark of 43 natural language questions and compares two approaches to generating the SQL query.\\nThe figure below shows an overview of these approaches for reference:\\n\\n
\\n\\n
\\n\\n1. Direct SQL Generation: In this approach, $Q_{NL}$ and the relational schema of the OMG database are given\\n to GPT-4. The schema is given in terms of `CREATE TABLE` statements, such as:\\n ```sql\\n CREATE TABLE Claim(\\n Claim_Identifier int NOT NULL,\\n Catastrophe_Identifier int NULL,\\n ...\\n Claim_Open_Date datetime NULL,\\n\\t ...\\n\\t PRIMARY KEY (Claim_Identifier ASC),\\n\\t FOREIGN KEY (Catastrophe_Identifier) REFERENCES Catastrophe(Catastrophe_Identifier),\\n ...)\\n ```\\n The full schema statements can be found [here](https://github.com/datadotworld/cwd-benchmark-data/blob/main/ACME_Insurance/DDL/ACME_small.ddl).\\n GPT-4 is asked to generate a SQL query $Q_{SQL}$ to answer $Q_{NL}$.\\n Copy-pasted from the paper, these prompts look as follows:\\n ```\\n Given the database described by the following DDL:\\n \\n Write a SQL query that answers the following question. Do not explain the query. return just the query, so it can be run\\n verbatim from your response.\\n Here\u2019s the question:\\n \\n ```\\n2. Indirect SQL Generation via Graph Modeling/SPARQL: In this approach, instead of the relational schema of the database, the same\\n database is modeled as an *[OWL ontology](https://www.w3.org/OWL/)* (OWL is short for Web Ontology Language).\\n Ontology is another term for schema when modeling data as a graph, i.e., as classes and relationships between them. OWL is a W3C standard\\n and part of the RDF technology stack, so OWL ontologies are expressed as a set of RDF triples, such as:\\n ```\\n ...\\n in:Claim rdf:type owl:Class ;\\n rdfs:isDefinedBy ;\\n rdfs:label \\"Claim\\" .\\n in:claimOpenDate rdf:type owl:DatatypeProperty ;\\n rdfs:domain in:Claim ;\\n rdfs:range xsd:dateTime ;\\n rdfs:isDefinedBy ;\\n rdfs:label \\"Claim Open Date\\" .\\n in:hasCatastrophe rdf:type owl:ObjectProperty ;\\n rdfs:domain in:Claim ;\\n rdfs:range in:Catastrophe ;\\n rdfs:isDefinedBy ;\\n rdfs:label \\"has catastrophe\\" .\\n ...\\n ```\\n The full ontology can be found [here](https://github.com/datadotworld/cwd-benchmark-data/blob/main/ACME_Insurance/ontology/insurance.ttl).\\n GPT-4 is then asked to generate a SPARQL query $Q_{SPARQL}$, instead of SQL, for the same $Q_{NL}$. The full prompt, again copy-pasted\\n from the paper with some simplifications, looks like this:\\n ```\\n Given the OWL model described in the following TTL file:\\n \\n Write a SPARQL query that answers the question. Do not explain the query. return just the query, so it can be run verbatim from your response.\\n Here\u2019s the question:\\n \\n ```\\n As a last step, the authors have a direct mapping from $Q_{SPARQL}$ to a SQL query $Q_{SQL}$. This is a quite straightforward step,\\n as the ontology and the relational schema have direct translations from classes and properties to tables and columns.\\n\\nThis is an interesting comparison. There is some intuition for why one would be interested in the effectiveness of\\nquery generation through an ontology, because one of the well-known \\npre-LLM text-to-SQL papers, [ATHENA](https://www.vldb.org/pvldb/vol9/p1209-saha.pdf), did something similar.\\nInstead of SPARQL, they had another query language over an ontology, called Ontology Query Language, which\\nwas then mapped to SQL. \\n\\nThe results are even more interesting. The authors categorize their 43 questions into\\n4 quadrants based on 2 dimensions: \\n- Low vs high **question** complexity: Questions that require only simple projections\\nare low complexity. 
Those that require aggregations or math functions are high complexity.\\n- Low vs high **schema** complexity: Questions whose SQL queries require up to 4 tables are low schema complexity. Those that\\n require 5 or more joins are high schema complexity. \\n\\nThe accuracy results are shown below. Accuracy here is \\"execution accuracy\\", meaning that only the answers of the queries\\nare checked against the ground truth answer. That is, even if the SQL query GPT-4 generated was actually not correct \\nbut computed the correct answers by luck, the paper takes it as correct (this apparently happens very rarely in this study).\\n\\n| Overall: 16.7% vs 54.2% | Low Schema Complexity | High Schema Complexity |\\n| -------- | -------- | -------- |\\n| Low Question Complexity | 37.4% vs 66.9% | 0% vs 38.7% |\\n| High Question Complexity | 25.5% vs 71.1% | 0% vs 35.7% |\\n\\nOverall, the indirect SQL generation method through SPARQL is much more effective in this zero-shot setting.\\nNot surprisingly, questions that require 5 or more joins are harder regardless of the \\nmethod used, and direct SQL cannot get any of those questions right. These are interesting\\nresults for an initial study on the effects of data modeling on LLMs\' accuracy in generating database queries. \\nThese results should give many researchers and practitioners ideas about how to replicate\\nand validate/invalidate similar results under different settings, e.g., with few-shot\\nexamples and on different databases.\\n\\n**That said, one should ask, why?** In fact, we should all be suspicious that merely modeling the\\nsame set of records with a different abstraction should have any visible effect. After all, by modeling\\nthe same records differently, one does not obtain or lose information. So if and when LLMs are smart enough,\\nthey shouldn\'t care how the data was modeled. But for now, if a pound sign can make a difference,\\nwe should not be surprised that modeling choices can have large impacts. As such, it is healthy to be suspicious\\nand ask why. This motivates a few important questions I think are worth studying. My premise\\nis that if the differences are this large, it must be that the task somehow got simpler for GPT-4 when it was\\nasked to generate a SPARQL query. I can hypothesize about a few possible reasons for this: \\n- *Some queries require fewer tokens to write in SPARQL*: One difference the query languages\\n of GDBMSs often have is that certain equality conditions are implicit in the syntax, which\\n means their `WHERE` clauses are simpler for some queries. For example, if you wanted to return\\n the name of the Catastrophe that the Claim with ID Claim1 has, in SPARQL you can write it as:\\n ```\\n SELECT ?name\\n WHERE { in:hasCatastrophe ?catastrophe .\\n ?catastrophe in:catastropheName ?name }\\n ``` \\n In SQL you would write:\\n ```\\n SELECT Catastrophe_Name\\n FROM Claim, Catastrophe\\n WHERE Claim.Catastrophe_Identifier = Catastrophe.Catastrophe_Identifier AND\\n Claim.Claim_Identifier = Claim1\\n ```\\n Note that the `Claim.Claim_Identifier = Claim1` equality condition is implicit in the ` in:hasCatastrophe ?catastrophe` triple\\n and the `Claim.Catastrophe_Identifier = Catastrophe.Catastrophe_Identifier` condition is implicit in the fact that `?catastrophe` appears\\n both in the first and second triples in the SPARQL query. Such implicit equality conditions are common in\\n graph query languages, especially when expressing joins. 
For example, in Cypher you can omit all join conditions in WHERE clauses as long\\n as those joins have been pre-defined to the system as relationships. Instead, you join records through the `(a)-[e]->(b)` syntax.\\n It\'s unclear how much this could matter, but it is an immediate advantage of SPARQL that can explain why complex join queries are easier to generate\\n in SPARQL than in SQL. \\n\\n **Side note**: On the flip side, SPARQL can be more verbose in projections. For example, if you wanted to return the number, open date, and close\\n date of every claim, you\'d write the following SQL query:\\n ```\\n SELECT Claim_Number, Claim_Open_Date, Claim_Close_Date\\n FROM Claim\\n ```\\n In SPARQL, you\'d have to write the name of each property you want to project and give it an additional variable, as follows:\\n ```\\n SELECT ?number ?open_date ?close_date\\n WHERE { ?claim in:claimNumber ?number .\\n ?claim in:claimOpenDate ?open_date .\\n ?claim in:claimCloseDate ?close_date }\\n ```\\n- *Graph modeling gives explicit names to foreign keys*: There is a reason that database courses teach data modeling to students\\n using graph-based models, such as Entity-Relationship or UML models: humans think of the world\\n as objects/entities and their relationships. In some sense, these are higher-level models where relationships\\n between objects are denoted explicitly, with explicit names, instead of as less explicit foreign key constraints.\\n For example, the implicit connection between Claims and\\n Catastrophes through the `FOREIGN KEY (Catastrophe_Identifier) REFERENCES Catastrophe(Catastrophe_Identifier)`\\n constraint was given an explicit English name, `hasCatastrophe`, in the ontology. This explicitness may make\\n it easier for LLMs to understand the schema and generate SPARQL queries.\\n\\nBoth of these are qualitative hypotheses. However, there is a more immediate\\nreason why the authors of this paper may have obtained such major differences between the two approaches they tried.\\nIntentionally or unintentionally, their ontology is simplified significantly compared to the relational schema they have.\\nFor example, the Claim relation has `Claim_Reopen_Date` and `Claim_Status_Code` properties, which are removed from the ontology.\\nMany such properties from the relations seem to have been removed, and the ontology overall looks simpler.\\nThere are also several differences between the ontology and the relational schema that are confusing. For example,\\nthe [ontology](https://github.com/datadotworld/cwd-benchmark-data/blob/main/ACME_Insurance/ontology/insurance.ttl) \\nhas an `Agent` class, and `Policy` objects are `in:soldByAgent` by some `Agent` objects (see lines 20 and 92). I cannot\\nsee corresponding relations or columns in the [relational schema](https://github.com/datadotworld/cwd-benchmark-data/blob/main/ACME_Insurance/DDL/ACME_small.ddl). Unless I am missing something about how the prompts were given, \\nthese are also likely to have important effects on the results, and someone should fix these issues and obtain new results\\nin a fairer comparison.\\n\\nLet me next raise several high-level questions that I think are important:\\n\\n*Important Future Work 2: Rules of thumb in data modeling to make LLM-generated queries more accurate.* \\nI think the higher-level question of studying the effects of data modeling in more depth is a very good direction. \\nAs LLMs get smarter, I would expect that the presence/absence of a pound sign or the style of English \\nshould matter less. 
These look more like syntactic differences that can be automatically detected over time. \\nModeling choices are more fundamental and relate to the clarity and understandability of the records that will be queried by the LLM. \\nSo identifying some rules of thumb here looks like a promising path forward. Let me list a few immediate questions one can study:\\n\\n*Important Future Work 2.1: Effects of normalization/denormalization.* If the shortcoming of GPT-4 is \\ngenerating queries with many joins, one way to solve this is to denormalize the relations into fewer\\ntables and study the effects. Again, I\'m thinking of the same records, just modeled differently with fewer\\ntables. What happens if we reduce all data into a single table with dozens of columns and many value repetitions? \\nNow all possible joins would have been performed, so we\'d force the LLM to write a join-less query with\\nfilters, distincts, and aggregations. What happens if we normalize the tables step-by-step until we \\nget to a well-known form, such as [Boyce-Codd Normal Form](https://en.wikipedia.org/wiki/Boyce%E2%80%93Codd_normal_form)? Do we consistently get better or worse accuracy?\\n\\n*Important Future Work 2.2: Use of views.* In relational modeling, views are an effective way to obtain a higher-level \\nand simpler model of your records. Similar to a $Q_{NL}$ -[LLM]-> $Q_{SPARQL}$ -[Direct Mapping]-> $Q_{SQL}$ pipeline,\\none can test the effectiveness of a $Q_{NL}$ -[LLM]-> $Q_{SQL-over-Views}$ -[Direct Mapping]-> $Q_{SQL}$ pipeline.\\n\\n*Important Future Work 3: Use of Cypher as an intermediate query language to translate to SQL.* One reason to experiment with Cypher \\nin addition to SPARQL is that Cypher is, arguably, more similar to SQL than SPARQL is, but has the advantage that (common) join\\nconditions are implicit in the `(a)-[e]->(b)` node-arrow syntax. Yet Cypher does not have the verbosity of the SPARQL projections \\nI mentioned above (so you project properties the same way you project columns in SQL). In my world, all high-level query languages\\nlook very similar to SQL, so eventually when LLMs are smart enough, or even today, I think these language differences\\nshould have minor effects. However, graph query languages will likely continue to have major advantages when writing\\nrecursive queries, as they have specialized syntax (e.g., Cypher has the Kleene star syntax) to do so. For those queries,\\nexpressing the query first in Cypher and then mapping it to SQL could lead to an advantage. \\n\\n## Final Words\\nNeedless to say, in the next few years, the field will be flooded with work on how to \\nuse LLMs to solve the text-to-high-level-query problem. Many rules of thumb will emerge\\nabout how to prompt them correctly. The questions one can ask in this space are endless.\\nI can speculate about it a lot, but I think it\'s plausible that \\nmany of these rules of thumb, specifically the syntactic\\ndifferences in prompting, can become\\nobsolete very quickly as newer and more advanced LLMs that are better at speaking high-level database languages emerge.\\nFor example, it\'s plausible that people will stop showing LLMs example (question, query) pairs each time they ask them to generate\\nSQL once LLMs are better at speaking SQL.\\n\\nHowever, the harder question, how to model the data so that its meaning is clear and the\\nqueries that need to be written are simpler, is more likely to remain a challenge for a longer time. I would not be too optimistic\\nthat very clear answers to this question will emerge. 
How to model your data is part art and part science. \\nYet, some studiable questions, such as the effects of normalization, the use of views, or generating Cypher for recursive queries,\\ncan yield important best practices that can be useful to developers building these systems.\\n\\nIn the next post, I will cover what I learned about RAG over unstructured data. Graphs and knowledge graphs are playing\\na more interesting role in that space. Until then, happy new year to all!"},{"id":"kuzu-0.1.0-release","metadata":{"permalink":"/docusaurus/blog/kuzu-0.1.0-release","source":"@site/blog/2023-11-19-kuzu-v-0.1.0.md","title":"K\xf9zu 0.1.0 Release","description":"We are very happy to release K\xf9zu 0.1.0 today! This is a major release with the following set of new features and improvements:","date":"2023-11-19T00:00:00.000Z","formattedDate":"November 19, 2023","tags":[{"label":"release","permalink":"/docusaurus/blog/tags/release"}],"readingTime":9.075,"hasTruncateMarker":true,"authors":[{"name":"K\xf9zu Team","url":"https://github.com/kuzudb/","imageURL":"https://kuzudb.com/img/blog/team.jpg","key":"team"}],"frontMatter":{"slug":"kuzu-0.1.0-release","authors":["team"],"tags":["release"]},"prevItem":{"title":"RAG Using Structured Data: Overview & Important Questions","permalink":"/docusaurus/blog/llms-graphs-part-1"},"nextItem":{"title":"K\xf9zu 0.0.12 Release","permalink":"/docusaurus/blog/kuzu-0.0.12-release"}},"content":"We are very happy to release K\xf9zu 0.1.0 today! This is a major release with the following set of new features and improvements:\\n\\n\x3c!--truncate--\x3e\\n\\n## NodeGroup-Based Storage\\n\\nWith this release, we have completed the major features of our NodeGroup-based storage design,\\nwhich was outlined in this [issue](https://github.com/kuzudb/kuzu/issues/1474). The primary goal of this design was a\\nstorage layout that is conducive to implementing compression and the zone maps optimization.\\nConceptually, a NodeGroup is equivalent to a [Parquet RowGroup](https://parquet.apache.org/docs/concepts/), which\\nrepresents a horizontal partition of a table consisting of k nodes (k=64x2048 for now). The data of each group of k nodes is\\nmanaged and compressed as a unit in the on-disk files. In release v0.0.7, we had completed the first part of this design and changed our\\nnode table storage to use NodeGroups. In this release, we have completed the second part of this design, and now relationship\\ntables are also stored as NodeGroups. That means we now compress the relationships of k nodes together.\\n\\nWe also now store all column data in a single file, `data.kz`, which has significantly reduced the number of database files we maintain.\\n\\n### String Compression\\n\\nWe have extended our compression to compress strings in the database using dictionary compression.\\nIn each string \\"column chunk\\" (which is a partition of an entire column in a table,\\nstoring one NodeGroup\'s values), each string s is\\nstored once in a dictionary, and for each record that has value s, we store a pointer to s.\\nThis design applies when storing string properties on relationship tables.\\nThis is done by using 3 column chunks in total. Two column chunks store the dictionary as follows. One \\"raw strings\\" column chunk\\nstores all the unique strings in the column chunk one after another, and another \\"offsets\\" column chunk identifies\\nthe beginning index of each string. 
Then, one additional \\"index column chunk\\" stores the pointers to the strings\\nas indices to the \\"offsets\\" column to identify the strings.\\nThe offset and index columns are bitpacked in the manner of integer columns.\\n\\n**String Compression Benchmark**\\n\\nHere is a micro-benchmark using the Comment table in LDBC100. To compare the compression rate of each column individually,\\nwe construct a new table Tx for each string column x in the Comment table, e.g., `Browser Used`. Tx consists of the\\ncolumn x and a serial primary key, which allows us to avoid storing any materialized hash index. We report the size of the data.kz file\\nand compare against a previous version v0.0.10 of K\xf9zu.\\n\\n| Column | Version 0.0.10 | Version 0.1.0 | Difference |\\n|---------------|----------------|----------------|------------------|\\n| Browser Used | 4.2 GB | 272 MB | -93.5% |\\n| Content | 9.7 GB | 7.5 GB | -22.7% |\\n| Location IP | 5 GB | 1.6 GB | -68.0% |\\n\\nWe also report the entire LDBC100 database size, including all database files (data.kz, indices, metadata, catalog), of v0.1.0\\nand a slightly older version v0.0.8, which included compression of nodes. So this experiment reports\\nboth improvements that come from storing relationship tables in compressed form as well as\\nstoring strings of both node and relationship tables in compressed form.\\n\\n| Database | Version 0.0.8 | Version 0.1.0 | Difference |\\n|----------|----------------|--------------|----------------|\\n| LDBC100 | 127 GB | 94 GB | -26.0% |\\n\\n\\n### Data Ingestion Improvements\\nMoving our relationship table storage to a NodeGroup-based one also improved our\\ndata ingestion times. The following benchmark reports the loading time of the LDBC100 `likesComment.csv` relationship records.\\nThe file contains 242M records and takes 13 GB in raw CSV format. Below we compare v0.1.0 against v0.0.10 using a machine with\\n2 Intel Xeon Platinum 8175M CPUs, each of which has 48 physical CPU cores. We used 300 GB of the 380GB total RAM during this test.\\n\\n| | Version 0.0.10 | Version 0.1.0 | Difference |\\n|---------|----------------|----------------| ----------------|\\n| 8 threads | 266.8 s | 229.8 s | -13.9% |\\n| 4 threads | 312.5 s | 246.8 s | -21.0%\\n| 2 threads | 446.7 s | 335.6 s | -24.8%\\n| 1 threads | 700.8 s | 581.9 s | -17.0%\\n\\n\\n## New Features\\n\\n### Direct Scans of DataFrames\\nWe now support scanning Pandas DataFrames directly. 
Consider the following `person` DataFrame\\nthat contains two columns, `id` and `height` (only the latter will be used in the example):\\n\\n```\\nimport numpy as np\\nimport pandas as pd\\n\\nid = np.array([0, 2, 3, 5, 7, 11, 13], dtype=np.int64)\\nheight_in_cm = np.array([167, 172, 183, 199, 149, 154, 165], dtype=np.uint32)\\nperson = pd.DataFrame({\'id\': id, \'height\': height_in_cm})\\n```\\nThe query below finds all students who are taller than the average height of the records in the `person` DataFrame:\\n```\\nquery = \'CALL READ_PANDAS(\\"person\\")\\n WITH avg(height / 2.54) as height_in_inch\\n MATCH (s:student)\\n WHERE s.height > height_in_inch\\n RETURN s\'\\nresults = conn.execute(query)\\n```\\n\\nDetails of this feature can be found [here](/cypher/query-clauses/call#read_pandas).\\n\\n### Copy\\nThis release comes with several new features related to Cypher\'s `COPY` clause.\\n\\n#### Copy To Parquet Files\\nQuery results can now be exported to Parquet files.\\n```\\nCOPY ( MATCH (a:Person) RETURN a.* ) TO \\"person.parquet\\";\\n```\\n\\n#### Copy To CSV Files\\nWe added several configuration options when exporting to CSV files.\\n```\\nCOPY ( MATCH (a:Person) RETURN a.* ) TO \\"person.csv\\" (delim = \'|\', header=true);\\n```\\n\\nWe also improved the performance of the CSV writer. Below is a micro-benchmark of exporting the LDBC100 Comment table to CSV format.\\n```\\nCOPY (MATCH (p:Comment) RETURN p.*) TO \'comment.csv\';\\n```\\n\\n| | Version 0.0.10 | Version 0.1.0 |\\n|-------------|-----------|-----------|\\n| Runtime | 1239.3s | 104.56s |\\n\\n\\n#### Optional `column_names` Argument in Copy From Statements\\nUsers can now load data to a subset of the columns in a table. Previously, we required that if\\nusers are going to load an empty table T from a file F,\\ne.g., a CSV or Parquet file, then F must contain: (1) the same number of columns as T; and (2) those columns in the same order as in\\ntable T. Now users can optionally add a `column_names` argument in `COPY FROM` statements,\\nwhich relaxes both of these restrictions: (1) F can now contain a subset of the columns; and (2) the columns can appear in arbitrary\\norder, which is specified in the `column_names` argument. Here is an example:\\n```\\nCREATE NODE TABLE Person (id INT64, name STRING, comment STRING, PRIMARY KEY(id));\\nCOPY Person (name, id) FROM \\"person.csv\\";\\n```\\nThe code above first creates a `Person` table with three columns, and then loads two of its columns from a file\\nthat contains `name` and `id` values, in that order.\\nThe third `comment` column in the table will be set to `NULL` for all imported records. The details\\nof this feature can be found [here](/cypher/copy).\\n\\n### Updates\\n\\n#### Detach Delete\\n\\nK\xf9zu now supports Cypher\'s [DETACH DELETE](/cypher/data-manipulation-clauses/delete#detach-delete) clause,\\nwhich deletes a node and all of its relationships together.\\nPreviously, users could only use the `DELETE` command, which deleted nodes that had no relationships.\\nFor example, the following query deletes a `User` node with `name` Adam and all of its edges.\\n```\\nMATCH (u:User) WHERE u.name = \'Adam\' DETACH DELETE u;\\n```\\n\\n#### Return Deleted Rows\\n\\n`RETURN` clauses can now return variable bindings that were used in the `DELETE` command. 
For example,\\nyou can return nodes that were deleted in the previous DELETE statement as follows:\\n```\\nMATCH (a:Person) DELETE a RETURN a;\\n```\\n\\nDetails of this feature can be found [here](/cypher/data-manipulation-clauses/read-after-update).\\n\\n### Other Changes\\n\\n#### SQL-style Cast Function\\n\\nWe have implemented a SQL-style `cast` function `cast(input, target_type)` to cast values between different\\ntypes. The cast function will convert the `input` argument to the `target_type` if\\ncasting of the input value to the target type is defined. For example:\\n```\\nRETURN cast(\\"[1,2,3]\\", \\"INT[]\\");\\n--------------------------\\n| CAST([1,2,3], INT32[]) |\\n--------------------------\\n| [1,2,3] |\\n--------------------------\\n```\\nAlong with this, we are deprecating our previous way of doing casts with separate functions, e.g., `STRING(1.2)` or `to_int64(\\"32\\")`.\\nDetails of the `cast` function can be found [here](/cypher/expressions/casting).\\n\\n#### Recursive Relationship Node Filter\\n\\nSince v0.0.5, we have supported filtering the intermediate relationships that can bind to\\nrecursive relationships, based on the properties of these intermediate relationships.\\nWith the current release, we now support filtering the intermediate nodes that are bound to recursive relationships.\\nAs we did for filtering intermediate relationships, we adopt Memgraph\'s syntax for this feature as follows:\\n```\\nMATCH p = (a:User)-[:Follows*1..2 (r, n | WHERE n.age > 21)]->(b:User)\\nRETURN p;\\n```\\nThe first variable `r` that is inside the recursive relationship above binds to the intermediate relationships while\\nthe second variable `n` binds to the intermediate nodes. The `|` symbol can be followed by a `WHERE` clause\\nin which these variables can be used to express a filtering expression. This query finds all 1- to 2-hop paths between\\ntwo `User` nodes where the intermediate nodes of these paths have `age` properties greater than 21.\\nDetails of this feature can be found [here](/cypher/query-clauses/match#filter-variable-length-relationships).\\n\\n#### Count Subquery\\n\\nWe have added support for counting subqueries, which count the number of matches of the given pattern in the graph.\\nThe output of this counting can be bound to a variable with aliasing. 
For example, the following query counts the\\nnumber of followers of each user in the graph.\\n```\\nMATCH (a:User)\\nRETURN a.name, COUNT { MATCH (a)<-[:Follows]-(b:User) } AS num_follower\\nORDER BY num_follower;\\n```\\nThe details of count subqueries can be found [here](/cypher/subquery#count-subquery).\\n\\n\\n#### New INT128 Data Type\\n\\nFinally, we now have support for 16-byte signed huge integers.\\n\\n## Development\\n\\n### Nightly Build\\nWe have set up a nightly build pipeline for K\xf9zu users who want to access our latest feature set.\\nHere is how you can use the latest nightly version of K\xf9zu:\\n\\n- For the Python API, the latest nightly version can be installed with `pip install --pre kuzu`.\\n- For the Node.js API, the latest nightly version can be installed with `npm i kuzu@next`.\\n- For the Rust API, the latest nightly version can be found at [crates.io](https://crates.io/crates/kuzu/versions).\\n- For the CLI, C and C++ shared library, and the Java JAR, the latest nightly version can be downloaded from the latest run of [this GitHub Actions pipeline](https://github.com/kuzudb/kuzu/actions/workflows/build-and-deploy.yml).\\n\\n### Reduced Binary Size\\nWith this release, we removed our Apache Arrow dependency, which significantly reduces our binary size.\\nAdditionally, we now strip the shared library and CLI binaries of the symbols that are not needed by our\\nclient APIs. This further reduces our binary sizes.\\nFor example, on a macOS arm64 platform, these two improvements achieve the following cumulative binary size reductions:\\n\\n| | Version 0.0.10 | Version 0.1.0 |\\n|-------------|-----------|-----------|\\n| Binary Size | 27.2 MB | 10.3 MB |\\n\\nStripping of our other libraries (e.g., Python) is a work in progress.\\n\\n## Closing Remarks\\nAs usual, we would like to thank everyone in the K\xf9zu engineering team, especially our interns, for making this release possible.\\nWe look forward to your feedback!\\n\\nEnjoy K\xf9zu v0.1.0 and the upcoming holiday season, which in this part of the world \ud83c\udde8\ud83c\udde6\ud83c\udde8\ud83c\udde6 coincides with the\\ncoming of the cold and cozy winter \ud83e\udd17\ud83e\udd17."},{"id":"kuzu-0.0.12-release","metadata":{"permalink":"/docusaurus/blog/kuzu-0.0.12-release","source":"@site/blog/2023-10-31-kuzu-v-0.0.12.md","title":"K\xf9zu 0.0.12 Release","description":"We release K\xf9zu 0.0.12, another minor release. This release fixes a bug that prevented the database from being opened in read-only mode on a read-only file system. It also adds support for the INT128 data type.","date":"2023-10-31T00:00:00.000Z","formattedDate":"October 31, 2023","tags":[{"label":"release","permalink":"/docusaurus/blog/tags/release"}],"readingTime":0.24,"hasTruncateMarker":false,"authors":[{"name":"K\xf9zu Team","url":"https://github.com/kuzudb/","imageURL":"https://kuzudb.com/img/blog/team.jpg","key":"team"}],"frontMatter":{"slug":"kuzu-0.0.12-release","authors":["team"],"tags":["release"]},"prevItem":{"title":"K\xf9zu 0.1.0 Release","permalink":"/docusaurus/blog/kuzu-0.1.0-release"},"nextItem":{"title":"K\xf9zuExplorer: Visualizing Query Results and Schemas","permalink":"/docusaurus/blog/kuzuexplorer"}},"content":"We release K\xf9zu 0.0.12, another minor release. This release fixes a bug that prevented the database from being opened in read-only mode on a read-only file system. 
It also adds support for the INT128 data type.\\n\\nFor more detailed information about the changes in this release, please see [here](https://github.com/kuzudb/kuzu/releases/tag/v0.0.12)."},{"id":"kuzuexplorer","metadata":{"permalink":"/docusaurus/blog/kuzuexplorer","source":"@site/blog/2023-10-25-kuzuexplorer/index.md","title":"K\xf9zuExplorer: Visualizing Query Results and Schemas","description":"Today, we are happy to release K\xf9zuExplorer, which is K\xf9zu\'s browser-based frontend to","date":"2023-10-25T00:00:00.000Z","formattedDate":"October 25, 2023","tags":[{"label":"release","permalink":"/docusaurus/blog/tags/release"}],"readingTime":3.445,"hasTruncateMarker":true,"authors":[{"name":"Chang Liu","url":"https://www.linkedin.com/in/mewim/","imageURL":"https://kuzudb.com/img/blog/chang.gif","key":"chang"}],"frontMatter":{"slug":"kuzuexplorer","authors":["chang"],"tags":["release"]},"prevItem":{"title":"K\xf9zu 0.0.12 Release","permalink":"/docusaurus/blog/kuzu-0.0.12-release"},"nextItem":{"title":"K\xf9zu 0.0.11 Release","permalink":"/docusaurus/blog/kuzu-0.0.11-release"}},"content":"import DatasetsImage from \'./preexisting-datasets.png\';\\nimport SchemaPanelImage from \'./schema-panel.png\';\\nimport ShellPanelImage from \'./query-result-node-link-view.png\';\\n\\n\\nToday, we are happy to release K\xf9zuExplorer, which is K\xf9zu\'s browser-based frontend to\\nvisualize and explore database schemas and query results in the form of a graph, a table, or JSON.\\nThis is a very useful tool for exploring databases and debugging applications during the prototyping\\nphase. This post gives a brief overview of the main features of K\xf9zuExplorer with pointers to\\n[K\xf9zuExplorer documentation](/kuzuexplorer) for details.\\n\\n\x3c!--truncate--\x3e\\n\\n## Launching K\xf9zuExplorer\\n\\nK\xf9zuExplorer is a web application that is launched from a deployed Docker image. Assuming you have Docker\\ninstalled, you can launch K\xf9zuExplorer on an existing database you have or on an empty database.\\nDetails about how to launch K\xf9zuExplorer can be found [here](/kuzuexplorer/#launching-k\xf9zuexplorer).\\nFor example, to start K\xf9zuExplorer on an empty\\ndatabase, you can simply run the following command in your shell, and then access K\xf9zuExplorer by going to\\n`http://localhost:8000`:\\n\\n```\\ndocker run -p 8000:8000 --rm kuzudb/kuzu-ui:latest\\n```\\n\\nK\xf9zuExplorer comes bundled with several pre-existing databases, one of which you can use to get started.\\nTo load one of these databases, click the `Datasets` tab in the top right corner of your landing page\\nand then the `Load Dataset` button, as shown in the figure below.\\n\\n
<img src={DatasetsImage}/>\\n\\n
\\n\\n## Schema Panel: Schema Exploring and Editing\\n\\nOne of the two main functionalities of K\xf9zuExplorer is to explore and modify the schema of your database.\\nBy clicking the `Schema` tab in the top right corner, you\'ll get to a page that shows you the\\nNode and Relationship tables in your database in a node-link view on the left. Using the right panel,\\nyou can do several things to explore and modify your tables, such as adding new properties to your\\nnode/rel tables, creating new node/rel tables, or dropping node/rel tables. These changes can all be done\\ninteractively by clicking buttons, which automatically generate and run the corresponding Cypher queries\\n(unless you have launched K\xf9zuExplorer [in read-only mode](/kuzuexplorer/#access-mode)).\\n\\n
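For instance, adding a new property to an existing node table through the panel would run an `ALTER TABLE` statement behind the scenes, along the lines of the following sketch (the table and property names here are hypothetical; the exact query K\xf9zuExplorer generates depends on your schema):\\n```\\nALTER TABLE User ADD occupation STRING;\\n```\\n\\n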
<img src={SchemaPanelImage}/>\\n\\n
\\n\\nMore details\\nabout what can be done in the Schema panel can be found [here](/kuzuexplorer/schema-panel).\\n\\n## Shell Panel: Query Result Visualization\\n\\nUsing K\xf9zuExplorer, you can also issue Cypher queries, similar to K\xf9zu\'s\\n[command line interface](/installation#command-line), and\\nvisualize the results of these queries.\\nTo issue queries, go to the `Shell` tab in the top right corner and type a Cypher query.\\nAs you type your query, the K\xf9zuExplorer shell will suggest keyword completions, which can\\nhelp you write your queries. You can then click the green \\"play\\" icon on the left-hand\\nside of the shell panel, which will execute your query and display the results. The\\nresults can be displayed in three different modes: (i) a node-link graph view; (ii) a table; or (iii) JSON.\\nAs an example, the image below presents the results of the following query, which retrieves all nodes and edges\\nin the database, in a node-link graph view:\\n\\n```\\nMATCH (a)-[e]->(b)\\nRETURN *\\n```\\n\\n
<img src={ShellPanelImage}/>\\n\\n
\\n\\nYou can inspect individual nodes and edges in the query results by clicking on them. More details\\nabout what can be done in the Shell panel can be found [here](/kuzuexplorer/shell-panel).\\n\\n## Settings Panel: Configuring Visualizations\\n\\nThere is also a Settings tab in the top right corner, which can be used for several more advanced\\nsettings, e.g., changing the colors or sizes of nodes of a certain type (e.g., `User` nodes) or\\nthe maximum number of nodes to plot in node-link graph visualizations of query results.\\nDetails of these can be found [here](/kuzuexplorer/settings-panel).\\n\\n## Final Words\\n\\nK\xf9zuExplorer should be especially useful for exploration and debugging when developing your applications; e.g.,\\nyou can interactively debug why your queries do not return the results you expect by exploring the\\nactual nodes and relationships in your database.\\n\\nThis is our first version of K\xf9zuExplorer and we will be improving it over time.\\nWe hope you enjoy using K\xf9zuExplorer and help us make it better! Please send us any feature or documentation requests or\\nbug reports by opening an issue in [K\xf9zuExplorer\'s GitHub repo](https://github.com/kuzudb/explorer)!"},{"id":"kuzu-0.0.11-release","metadata":{"permalink":"/docusaurus/blog/kuzu-0.0.11-release","source":"@site/blog/2023-10-19-kuzu-v-0.0.11.md","title":"K\xf9zu 0.0.11 Release","description":"We release K\xf9zu 0.0.11, another minor release. The main new feature of this release is read-only access mode for the database on Linux. The read-only mode enables the upcoming K\xf9zu UI to optionally open a database in read-only mode while allowing other applications to access the same database concurrently.","date":"2023-10-19T00:00:00.000Z","formattedDate":"October 19, 2023","tags":[{"label":"release","permalink":"/docusaurus/blog/tags/release"}],"readingTime":0.31,"hasTruncateMarker":false,"authors":[{"name":"K\xf9zu Team","url":"https://github.com/kuzudb/","imageURL":"https://kuzudb.com/img/blog/team.jpg","key":"team"}],"frontMatter":{"slug":"kuzu-0.0.11-release","authors":["team"],"tags":["release"]},"prevItem":{"title":"K\xf9zuExplorer: Visualizing Query Results and Schemas","permalink":"/docusaurus/blog/kuzuexplorer"},"nextItem":{"title":"K\xf9zu 0.0.10 Release","permalink":"/docusaurus/blog/kuzu-0.0.10-release"}},"content":"We release K\xf9zu 0.0.11, another minor release. The main new feature of this release is read-only access mode for the database on Linux. 
The read-only mode enables the upcoming [K\xf9zu UI](https://github.com/kuzudb/kuzu-ui) to optionally open a database in read-only mode while allowing other applications to access the same database concurrently.\\n\\nFor more detailed information about the changes in this release, please see [here](https://github.com/kuzudb/kuzu/releases/tag/v0.0.11)."},{"id":"kuzu-0.0.10-release","metadata":{"permalink":"/docusaurus/blog/kuzu-0.0.10-release","source":"@site/blog/2023-10-14-kuzu-v-0.0.10.md","title":"K\xf9zu 0.0.10 Release","description":"We\'re happy to introduce K\xf9zu 0.0.10, which is a minor release with a bunch of bug fixes and improvements:","date":"2023-10-14T00:00:00.000Z","formattedDate":"October 14, 2023","tags":[{"label":"release","permalink":"/docusaurus/blog/tags/release"}],"readingTime":0.7,"hasTruncateMarker":false,"authors":[{"name":"K\xf9zu Team","url":"https://github.com/kuzudb/","imageURL":"https://kuzudb.com/img/blog/team.jpg","key":"team"}],"frontMatter":{"slug":"kuzu-0.0.10-release","authors":["team"],"tags":["release"]},"prevItem":{"title":"K\xf9zu 0.0.11 Release","permalink":"/docusaurus/blog/kuzu-0.0.11-release"},"nextItem":{"title":"K\xf9zu 0.0.9 Release","permalink":"/docusaurus/blog/kuzu-0.0.9-release"}},"content":"We\'re happy to introduce K\xf9zu 0.0.10, which is a minor release with a bunch of bug fixes and improvements:\\n- Added frame-of-reference encoding for integers. [PR 2140](https://github.com/kuzudb/kuzu/pull/2140)\\n- Fixed slicing of UTF-8 strings. [PR 2212](https://github.com/kuzudb/kuzu/pull/2212)\\n- Fixed copying of invalid UTF-8. [PR 2208](https://github.com/kuzudb/kuzu/pull/2208)\\n- Added more checks and better error messages during the binding phase. [PR 2206](https://github.com/kuzudb/kuzu/pull/2206)\\n- Fixed returning list literals with null values. [PR 2187](https://github.com/kuzudb/kuzu/pull/2187)\\n- Fixed bugs in scanning multi-label rel tables. [PR 2149](https://github.com/kuzudb/kuzu/pull/2149)\\n- Deprecated all functions for getting the table names and properties from the client APIs and the CLI; instead, `CALL` is introduced for the same functionality. [PR 2199](https://github.com/kuzudb/kuzu/pull/2199), [2207](https://github.com/kuzudb/kuzu/pull/2207)\\n- Added missing data type support in client APIs. [PR 2183](https://github.com/kuzudb/kuzu/pull/2183), [PR 2176](https://github.com/kuzudb/kuzu/pull/2176), [PR 2193](https://github.com/kuzudb/kuzu/pull/2193), [PR 2172](https://github.com/kuzudb/kuzu/pull/2172)\\n\\nFor more detailed information about the changes in this release, please see [here](https://github.com/kuzudb/kuzu/releases/tag/v0.0.10)."},{"id":"kuzu-0.0.9-release","metadata":{"permalink":"/docusaurus/blog/kuzu-0.0.9-release","source":"@site/blog/2023-10-02-kuzu-v-0.0.9.md","title":"K\xf9zu 0.0.9 Release","description":"We are very happy to release K\xf9zu 0.0.9 today! 
This release comes with the following new main features and improvements:","date":"2023-10-02T00:00:00.000Z","formattedDate":"October 2, 2023","tags":[{"label":"release","permalink":"/docusaurus/blog/tags/release"}],"readingTime":7.545,"hasTruncateMarker":true,"authors":[{"name":"K\xf9zu Team","url":"https://github.com/kuzudb/","imageURL":"https://kuzudb.com/img/blog/team.jpg","key":"team"}],"frontMatter":{"slug":"kuzu-0.0.9-release","authors":["team"],"tags":["release"]},"prevItem":{"title":"K\xf9zu 0.0.10 Release","permalink":"/docusaurus/blog/kuzu-0.0.10-release"},"nextItem":{"title":"K\xf9zu 0.0.8 Release","permalink":"/docusaurus/blog/kuzu-0.0.8-release"}},"content":"We are very happy to release K\xf9zu 0.0.9 today! This release comes with the following new main features and improvements:\\n\\n\x3c!--truncate--\x3e\\n\\n## New Features\\n\\n### Load From\\nK\xf9zu now supports loading directly from a file, without importing it into the database, through the `LOAD FROM` clause. For instance, the following query counts the number of rows whose first column starts with \'Adam\'.\\n\\n```\\nLOAD FROM \\"user.csv\\"\\nWHERE column0 =~ \'Adam*\'\\nRETURN COUNT(*)\\n```\\n`LOAD FROM` can also be used as the input source for a bulk update.\\n```\\nLOAD FROM \\"user.csv\\"\\nCREATE (:Person {name: column0, age: to_int64(column1)});\\n```\\n\\nDetails can be found in the [LOAD FROM documentation page](/cypher/query-clauses/load_from).\\n\\n#### Header Schema\\nBy default, K\xf9zu will read the header of the file to detect column names and types. If no header is available, it will use auto-generated names, and all columns will be strings. To manually specify the header, you can use `LOAD WITH HEADERS ... FROM ...`.\\n\\nFor example, the following query will load `name` as a string type for the first column and `age` as an INT64 type for the second column.\\n```\\nLOAD WITH HEADERS (name STRING, age INT64) FROM \\"user.csv\\"\\nWHERE name =~ \'Adam*\'\\nRETURN name, age;\\n```\\n\\nIf a header is manually specified, K\xf9zu will try to cast to the given type and throw an exception if casting fails. More information can be found [here](/cypher/query-clauses/load_from).\\n\\n### Transaction Statement\\nThis release replaces the `beginReadTransaction()`, `beginWriteTransaction()`, `commit()` and `rollback()` APIs in all language bindings with explicit statements.\\n```\\nBEGIN TRANSACTION;\\nCREATE (a:User {name: \'Alice\', age: 72});\\nMATCH (a:User) RETURN *;\\nCOMMIT;\\n```\\nThe above sequence of statements starts a write transaction, adds a new node, and within the same transaction also reads all of the tuples in the User table before committing the transaction. More info on the new transaction statement can be found [here](/cypher/transaction).\\n\\n### Comment on Table\\nYou can now add comments to a table using the `COMMENT ON TABLE` statement. The following query adds a comment to the `User` table.\\n```\\nCOMMENT ON TABLE User IS \'User information\';\\n```\\nComments can be extracted through the new `SHOW_TABLES()` function.\\n```\\nCALL SHOW_TABLES() RETURN *;\\n----------------------------------\\n| name | type | comment |\\n----------------------------------\\n| User | NODE | User information |\\n----------------------------------\\n| City | NODE | |\\n----------------------------------\\n```\\n\\n### Recursive Relationship Projection\\nThis release expands recursive relationship patterns and enables projection on intermediate nodes and relationships. 
Previously, K\xf9zu only supported returning all node and relationship properties on the path.\\n```\\nMATCH (a:User)-[e:Follows*1..2 (r, n | WHERE r.since > 2020)]->(b:User)\\nRETURN nodes(e), rels(e);\\n```\\nThis incurs a significant computational overhead when a user is only interested in a subset of properties on the path. Also, returning all properties makes the result harder to interpret.\\n\\nK\xf9zu now allows projection inside recursive relationship patterns using a list-comprehension-like syntax.\\n```\\nMATCH (a:User)-[e:Follows*1..2 (r, n | WHERE r.since > 2020 | {r.since}, {n.name})]->(b:User)\\nRETURN nodes(e), rels(e);\\n```\\nThe query above finds all paths between two users that are between 1 and 2 hops long and where the follows started after 2020. The query returns the `since` property of any `Follows` relationships and the names of any intermediate users.\\n\\nFor more information, check out [the new documentation](/cypher/query-clauses/match#project-intermediate-nodes-and-rels).\\n\\nThe performance improvements are shown in the [Performance Improvements](#performance-improvements) section.\\n\\n### CREATE REL TABLE GROUP[^1]\\n\\nWe have received a lot of feedback regarding the limitation that a relationship can only be defined over a single pair of node tables. This release introduces a `CREATE REL TABLE GROUP` statement which has a similar syntax to `CREATE REL TABLE`, but allows multiple `FROM ... TO ...` clauses. This statement will create a relationship table for each pair internally. When querying, a relationship group is simply syntactic sugar for any of the relationships in the group.\\n\\nFor example, the following statement creates a group containing a Knows_User_User relationship and a Knows_User_City relationship.\\n```\\nCREATE REL TABLE GROUP Knows (FROM User TO User, FROM User TO City, year INT64);\\n```\\nTo query with the group, simply treat it as any other relationship, so:\\n```\\nMATCH (a:User)-[:Knows]->(b) RETURN *;\\n```\\nThe query above is equivalent to\\n```\\nMATCH (a:User)-[:Knows_User_User|:Knows_User_City]->(b) RETURN *;\\n```\\n**Note**\\n- For `COPY FROM` and `CREATE`, we currently don\'t support using a relationship group, so you need to explicitly specify a single relationship table.\\n\\nSee [Create Table](/cypher/data-definition/create-table) for more information.\\n\\n### Data Types & Functions\\nWe introduced a few more numerical data types:\\n- INT8: 1-byte signed integer\\n- UINT64: 8-byte unsigned integer\\n- UINT32: 4-byte unsigned integer\\n- UINT16: 2-byte unsigned integer\\n- UINT8: 1-byte unsigned integer\\n\\nWe have also added several casting and list functions. See [functions](/cypher/expressions/) for more information.\\n\\n## Performance Improvements\\n\\n### New CSV and Parquet Reader\\nIn this release, we have started replacing Arrow\'s CSV and Parquet readers with our own lightweight and customized implementations.\\n\\nFollowing DuckDB\'s implementation, we\'ve replaced Arrow\'s streaming CSV reader with a parallel one. The parallel CSV reader assumes there are no multi-line strings and provides a large performance boost on multi-threaded machines.\\n\\nIf multi-line strings are present, the CSV reading will fail, and you will need to fall back to single-threaded mode by setting `parallel=false`. 
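For example, a copy statement could fall back to single-threaded reading as follows (a sketch, assuming the `parallel` option is passed alongside other CSV configuration options such as `header`):\\n```\\nCOPY User FROM \\"user.csv\\" (header=true, parallel=false);\\n```\\n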
See [Data Import from CSV Files](/data-import/csv-import).\\n\\nWe demonstrate the performance of our parallel CSV reader through the new [LOAD FROM](#load-from) feature as follows.\\n```\\nLOAD FROM \\"ldbc-100/comment_0_0.csv\\" (header = true, delim = \'|\') RETURN COUNT(*);\\n```\\n\\n| # Threads | 1 | 2 | 4 | 8 | 16 |\\n| --------- | ----- | ----- | ----- | ----- | ----- |\\n| Time (s) | 297.19 | 170.71 (1.7x) | 109.38 (2.7x) | 69.01 (4.3x) | 53.28 (5.6x) |\\n\\n### Bitpacking Compression\\nWith this release, we have implemented our first compression algorithm! We are introducing the bitpacking compression algorithm for integers. It is useful when using a large integer type (e.g., INT32 or INT64) for storing small integers, which can be encoded more compactly with fewer bits. This helps both storage and query processing times.\\n\\nTo show the difference, we take the `length` column from the LDBC `Comment` table as an example, which is of type `INT32` and whose values range from 2 to 1998.\\nTogether with an auto-increment `ID` column as the primary key, we create a node table `(ID INT64, length INT32, PRIMARY KEY(ID))`. The loaded data file size and loading time are listed in the table below. The data file size is greatly reduced from 2.6GB to 1.1GB (2.4x), while the data loading time stays the same (75.69s vs. 75.84s).\\n\\nReduced data file size also helps reduce disk I/O operations, which can improve query scan performance. We show this with a query that sums all the lengths.\\n```\\nMATCH (l:length) RETURN sum(l.length);\\n```\\nThe query time improved from 1.64s to 0.45s (3.6x)!\\n\\n| | Data size | Loading time | Query time |\\n| --------------- | --------- | -------------- | ------------ |\\n| Without compression | 2.6GB | 75.69s | 1.64s |\\n| With compression | **1.1GB (2.4x)** | **75.84s** | **0.45s (3.6x)** |\\n\\nMore compression schemes for integers, floats, and string values will be coming soon. Please stay tuned!\\n\\nNote: The compression is currently only done on node tables. It will be adapted to rel tables in our next release. By default, we turn on compression for all node tables. To disable it, we provide an option when starting the database. 
For example, starting our CLI with the `--nocompress` option disables compression on all write statements to node tables.\\n\\n### General Data Loading Improvement\\nData loading time is improved due to the following changes:\\n- Parallel CSV reader.\\n- Compression means we write less data to disk.\\n- Removed line counting when copying rel tables.\\n- Dedicated casting functions to avoid string copy.\\n- Reduced hash index file size.\\n\\n| Files | # Lines | CSV file size | v0.0.8 | v0.0.9 |\\n| ---------------- | ----------- | ------------- | ----------- | ----------- |\\n| comment.csv | 220M | 22.49 GB | 187.76s | **131.48s** |\\n| person.csv | 0.45M | 43.6 MB | 1.16s | **0.78s** |\\n| likesComment.csv | 242M | 13 GB | 250.64s | **210.72s** |\\n| knows.csv | 20M | 1.1 GB | 24.40s | **19.54s** |\\n\\n\\n### Projection Pushdown for Recursive Joins\\nThe following two queries both compute paths along the Knows relationship with 1 to 3 hops from a single starting point, and then return the firstName of all nodes along the paths.[^2]\\n\\nWithout Projection:\\n```\\nMATCH (a:Person)-[e:Knows*1..3]->(b:Person)\\nWHERE a.ID = 933\\nRETURN properties(nodes(e), \'firstName\');\\n```\\n\\nWith Projection:\\n```\\nMATCH (a:Person)-[e:Knows*1..3 (r, n | {}, {n.firstName})]->(b:Person)\\nWHERE a.ID = 933\\nRETURN properties(nodes(e), \'firstName\');\\n```\\n\\n| With projection | Without projection |\\n|---------------------- | ----------------------- |\\n| **471.9** ms | 3412.8 ms |\\n\\nWith projection, the optimizer can completely avoid materializing a hash table for relationship properties, which is a major bottleneck in computation.\\n\\n[^1]: This is an experimental feature and might be changed in the future.\\n[^2]: This experiment was carried out on an M1 MacBook Pro with 16 GB of memory and 8 threads. Sideways information passing is disabled."},{"id":"kuzu-0.0.8-release","metadata":{"permalink":"/docusaurus/blog/kuzu-0.0.8-release","source":"@site/blog/2023-08-28-kuzu-v-0.0.8.md","title":"K\xf9zu 0.0.8 Release","description":"We\'re here to introduce K\xf9zu 0.0.8, which is a minor bug-fix release together with some performance optimizations:","date":"2023-08-28T00:00:00.000Z","formattedDate":"August 28, 2023","tags":[{"label":"release","permalink":"/docusaurus/blog/tags/release"}],"readingTime":0.64,"hasTruncateMarker":false,"authors":[{"name":"K\xf9zu Team","url":"https://github.com/kuzudb/","imageURL":"https://kuzudb.com/img/blog/team.jpg","key":"team"}],"frontMatter":{"slug":"kuzu-0.0.8-release","authors":["team"],"tags":["release"]},"prevItem":{"title":"K\xf9zu 0.0.9 Release","permalink":"/docusaurus/blog/kuzu-0.0.9-release"},"nextItem":{"title":"K\xf9zu 0.0.7 Release","permalink":"/docusaurus/blog/kuzu-0.0.7-release"}},"content":"We\'re here to introduce K\xf9zu 0.0.8, which is a minor bug-fix release together with some performance optimizations:\\n- Fixed a major bug in COPY on large datasets. [PR 1963](https://github.com/kuzudb/kuzu/pull/1963)\\n- Implemented the [TopK optimization](https://github.com/kuzudb/kuzu/pull/1949), significantly enhancing the performance of queries that involve ORDER BY and LIMIT clauses. We will delve deeper into this optimization in a blog post. [PR 1949](https://github.com/kuzudb/kuzu/pull/1949)\\n- WITH clause (CTE) rewriter. We avoid the evaluation of nodes and rels in CTE projections if they are not needed for further processing. 
[PR 1956](https://github.com/kuzudb/kuzu/pull/1956)\\n- Updated our Rust doc with converting query results to Arrow arrays.\\n- Fixed the size allocated for boolean values to match the size of the bit-packed data. [PR 1953](https://github.com/kuzudb/kuzu/pull/1953/files)\\n\\nFor more detailed information about the changes in this release, please see [here](https://github.com/kuzudb/kuzu/releases/tag/v0.0.8)."},{"id":"kuzu-0.0.7-release","metadata":{"permalink":"/docusaurus/blog/kuzu-0.0.7-release","source":"@site/blog/2023-08-16-kuzu-v-0.0.7.md","title":"K\xf9zu 0.0.7 Release","description":"We are very happy to release K\xf9zu 0.0.7 today! This release comes with the following new main features and improvements:","date":"2023-08-16T00:00:00.000Z","formattedDate":"August 16, 2023","tags":[{"label":"release","permalink":"/docusaurus/blog/tags/release"}],"readingTime":7.53,"hasTruncateMarker":true,"authors":[{"name":"K\xf9zu Team","url":"https://github.com/kuzudb/","imageURL":"https://kuzudb.com/img/blog/team.jpg","key":"team"}],"frontMatter":{"slug":"kuzu-0.0.7-release","authors":["team"],"tags":["release"]},"prevItem":{"title":"K\xf9zu 0.0.8 Release","permalink":"/docusaurus/blog/kuzu-0.0.8-release"},"nextItem":{"title":"IAMGraphViz: Visualizing AWS IAM Permissions with K\xf9zu","permalink":"/docusaurus/blog/iamgraphviz"}},"content":"We are very happy to release K\xf9zu 0.0.7 today! This release comes with the following new main features and improvements: \\n\\n- [Macro and UDF](2023-08-16-kuzu-v-0.0.7.md#macro-and-udf)\\n - [Create Macro Statements](2023-08-16-kuzu-v-0.0.7.md#create-macro-statements)\\n - [C++ UDFs](2023-08-16-kuzu-v-0.0.7.md#c-udfs)\\n- [Data Update and Return Clauses](2023-08-16-kuzu-v-0.0.7.md#data-update-and-return-clauses)\\n - [Merge Clause](2023-08-16-kuzu-v-0.0.7.md#merge-clause)\\n - [Multi-label Set/Delete](2023-08-16-kuzu-v-0.0.7.md#multi-label-setdelete)\\n - [Return After Update](2023-08-16-kuzu-v-0.0.7.md#return-after-update)\\n - [Return with .\\\\*](2023-08-16-kuzu-v-0.0.7.md#return-with-)\\n- [Data Export](2023-08-16-kuzu-v-0.0.7.md#data-export)\\n- [New Data Types and APIs](2023-08-16-kuzu-v-0.0.7.md#new-data-types-and-apis)\\n - [MAP](2023-08-16-kuzu-v-0.0.7.md#map)\\n - [UNION](2023-08-16-kuzu-v-0.0.7.md#union)\\n - [Converting Query Results to Arrow](2023-08-16-kuzu-v-0.0.7.md#converting-query-results-to-arrow)\\n- [NodeGroup Based Node Table Storage](2023-08-16-kuzu-v-0.0.7.md#nodegroup-based-node-table-storage)\\n- [Unnesting Arbitrary Subqueries](2023-08-16-kuzu-v-0.0.7.md#unnesting-arbitrary-subqueries)\\n\\n\x3c!--truncate--\x3e\\n\\nTo install the new version, \\nplease visit the [download section of our website](https://kuzudb.com/#download) \\nand the [getting started guide](https://kuzudb.com/docusaurus/getting-started/). The full\\n[release notes are here](https://github.com/kuzudb/kuzu/releases). \\n\\n## Macro and UDF\\n### Create Macro Statements\\nIn this release, we\'ve added support for the `CREATE MACRO` statement to define custom scalar functions, i.e., those that return only a single value, through Cypher.\\n\\nHere is an example of defining a macro to add two input parameters. The second parameter `b:=3` is an example of how to provide a default value for a parameter in case the parameter is absent.\\n```Cypher\\n// Create a macro which adds two parameters. 
If the second parameter b is not provided, the default value of 3 will be used instead.\\ncreate macro addWithDefault(a,b:=3) as a + b;\\n// Executes the macro without the second parameter, so the default value 3 is used.\\nreturn addWithDefault(2); // returns 5 (2 + 3)\\n// Executes the macro with a value for the second parameter, which overrides the default.\\nreturn addWithDefault(4, 7); // returns 11 (4 + 7)\\n```\\nSee more details on supported macro expression types [here](./../cypher/macro).\\n\\n### C++ UDFs\\nWe are also introducing two C++ interfaces, `createScalarFunction` and `createVectorizedFunction` in the `Connection` class of the [C++ API](https://kuzudb.com/docusaurus/getting-started/cpp) to define both scalar and vectorized [UDFs](./../client-apis/cpp-api/udf).\\n\\n`createScalarFunction` provides a way for users to define scalar functions in C++ and use them in K\xf9zu as if they were built-in functions.\\nHere is an example of a unary scalar function that increments the input value by 5:\\n```cpp\\nstatic int32_t addFiveScalar(int32_t x) {\\n return x + 5;\\n}\\n// Register the unary scalar function using the createScalarFunction API.\\nconn->createScalarFunction(\\"addFiveScalar\\", &addFiveScalar);\\n// Issue a query using the UDF.\\nconn->query(\\"MATCH (p:person) return addFiveScalar(to_int32(p.age))\\");\\n```\\n\\nUsers familiar with the internals of our intermediate result representation can make use of `createVectorizedFunction` to create vectorized functions over our ValueVectors to achieve better performance.\\nSee [our doc here](./../client-apis/cpp-api/udf) for more details.\\n\\n## Data Update and Return Clauses\\n### Merge Clause\\nThis release implements the `MERGE` clause, which is an updating clause that will first try to match the given pattern and, if not found, create the pattern. At a high level, `MERGE` can be interpreted as `If MATCH then RETURN ELSE CREATE`. Additionally, one can further specify the `SET` operation based on whether the pattern is found or not through `ON CREATE` and `ON MATCH`.\\n\\nFor example, the following query tries to merge a user node with name \\"Adam\\". Suppose a node with name \\"Adam\\" exists in the database already. In this case, we update the same node\'s `age` property and return the node (so no new node gets inserted).\\n```\\nMERGE (n:User {name : \'Adam\'}) ON MATCH SET n.age = 35 RETURN n.*;\\n------------------\\n| n.name | n.age |\\n------------------\\n| Adam | 35 |\\n------------------\\n```\\nHere is another example where we try to merge a `Follows` edge with `since` property equal to 2022 between `Adam` and `Karissa`. Suppose no such edge exists in the database; then the statement creates the edge and sets the `since` property to 1999.\\n```\\nMATCH (a:User), (b:User) \\nWHERE a.name = \'Adam\' AND b.name = \'Karissa\' \\nMERGE (a)-[e:Follows {since:2022}]->(b) \\nON CREATE SET e.since = 1999\\nRETURN e;\\n---------------------------------------------------------\\n| e |\\n---------------------------------------------------------\\n| (0:0)-{_LABEL: Follows, _ID: 0:5, since: 1999}->(0:1) |\\n---------------------------------------------------------\\n```\\nSee [our doc here](./../cypher/data-manipulation-clauses/merge) for more details.\\n\\n### Multi-label Set/Delete\\n\\nK\xf9zu now allows set/delete on nodes and relationship variables that can bind to multiple labels. 
For example,\\nto delete all nodes in the database (assuming all edges have been deleted):\\n```\\nMATCH (n) DELETE n;\\n```\\nSimilarly, to set the `since` property of all relationships in the database:\\n```\\nMATCH ()-[f]->() SET f.since = 2023\\n```\\nNote that when evaluating this query, tuples in tables that don\'t have a `since` property will be ignored.\\n\\nSee our docs in [Set](./../cypher/data-manipulation-clauses/set) and [Delete](./../cypher/data-manipulation-clauses/delete) for more details.\\n\\n### Return After Update\\n\\nStarting from this release, we are also enabling `RETURN` after updating clauses. That is, updated values will be returned in queries that update them. Here are some examples:\\n\\n```\\nMATCH (u:User)\\nWHERE u.name = \'Adam\' SET u.age = NULL\\nRETURN u.*;\\n------------------\\n| u.name | u.age |\\n------------------\\n| Adam | |\\n------------------\\n\\nMATCH (u1:User), (u2:User)\\nWHERE u1.name = \'Adam\' AND u2.name = \'Noura\' \\nCREATE (u1)-[e:Follows {since: 2011}]->(u2)\\nRETURN e;\\n---------------------------------------------------------\\n| e |\\n---------------------------------------------------------\\n| (0:0)-{_LABEL: Follows, _ID: 0:5, since: 2011}->(0:3) |\\n---------------------------------------------------------\\n```\\n\\nSee our docs in [Set](./../cypher/data-manipulation-clauses/set) and [Delete](./../cypher/data-manipulation-clauses/delete) for more examples.\\n\\n### Return with .*\\nAs syntactic sugar, K\xf9zu now supports returning all properties of a node or rel with `.*`.\\n```\\nMATCH (a:User) RETURN a.*;\\n-------------------\\n| a.name | a.age |\\n-------------------\\n| Adam | 30 |\\n-------------------\\n| Karissa | 40 |\\n-------------------\\n| Zhang | 50 |\\n-------------------\\n| Noura | 25 |\\n-------------------\\n```\\n\\nSee [our doc here](./../cypher/query-clauses/return#returning-node-and-relationship-properties) for more details.\\n\\n## Data Export\\nK\xf9zu now supports exporting query results to CSV files using the `COPY TO` command. For example, the following\\n`COPY TO` statement produces the CSV file below.\\n```\\nCOPY (MATCH (u:User) RETURN u.*) TO \'user.csv\';\\n```\\nCSV file:\\n```\\nu.name,u.age\\n\\"Adam\\",30\\n\\"Karissa\\",40\\n\\"Zhang\\",50\\n\\"Noura\\",25\\n```\\nSee [Data Export](../data-export/) for more information.\\n\\n## New Data Types and APIs\\n### MAP\\nA `MAP` is a dictionary of key-value pairs where all keys have the same type and all values have the same type. Unlike `STRUCT`, `MAP` doesn\'t require the same key to be present in each row. Therefore, `MAP` is more suitable when the schema is not predetermined.\\n\\n```\\nRETURN map([1, 2], [\'a\', \'b\']) AS m;\\n--------------\\n| m |\\n--------------\\n| {1=a, 2=b} |\\n--------------\\n```\\n\\nSee [map](../cypher/data-types/map) for more information.\\n\\n### UNION\\nK\xf9zu\'s `UNION` is implemented by taking DuckDB\'s `UNION` type as a reference. Similar to C++ `std::variant`, `UNION` is a nested data type that is capable of holding multiple alternative values with different types. 
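For example, a column can be declared to hold either an integer or a string, as in the following sketch of the DDL (the table and field names here are hypothetical):\\n```\\nCREATE NODE TABLE demo(id INT64, u UNION(num INT64, str STRING), PRIMARY KEY(id));\\n```\\n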
The value under the \\"tag\\" key is considered the value currently held by the `UNION`.\\n\\nSee [union](../cypher/data-types/union) for more information.\\n\\n### Converting Query Results to Arrow\\nIn previous releases, we supported converting query results to Arrow tables in our [Python API](https://kuzudb.com/api-docs/python/kuzu/query_result.html#QueryResult.get_as_arrow).\\nIn this release, converting to Arrow arrays is now also available in the Rust, [C](https://kuzudb.com/api-docs/c/kuzu_8h.html) (see `kuzu_query_result_get_arrow_schema` and `kuzu_query_result_get_next_arrow_chunk`), and [C++](https://kuzudb.com/api-docs/cpp/classkuzu_1_1main_1_1_query_result.html) (see `getArrowSchema` and `getNextArrowChunk`) APIs.\\n\\n## NodeGroup Based Node Table Storage\\nThis release introduces changes to the storage layout of node tables.\\nBefore this release, we used to store each column in a node table contiguously in separate files.\\nEach column contains one data file (e.g., `n-1.col`) and one null file (e.g., `n-1.null`) if the column may contain null values.\\nThis design posed two problems: 1) it requires maintaining many files in the database directory, which may lead to a `too many open files` error; 2) it is not suitable for data compression. Although we haven\'t implemented compression yet (this will wait until the next few releases), this design would force us to adopt a single compression technique for the entire column. \\n\\nInstead, partitioning each column into multiple chunks can offer more flexibility as each column chunk can be compressed and decompressed independently.\\nIn this release, we introduced the concept of a [NodeGroup](https://github.com/kuzudb/kuzu/issues/1474), which is equivalent to a [RowGroup](https://parquet.apache.org/docs/concepts/) and represents a horizontal partition of a table.[^1] \\nWith the node-group-based storage design, we also store the data of all columns in a single file, `data.kz`.[^2]\\nThis will enable more powerful compression schemes, e.g., constant compression, bit-packing, and dictionary compression, in the coming releases.\\nFor details on our new design, please visit [this issue](https://github.com/kuzudb/kuzu/issues/1474).\\n\\n[^1]: We use the term NodeGroup mainly because we also partition rel tables based on their src/dst nodes instead of the number of rows.\\n[^2]: Primary key index files are still kept separately, but eventually they will also be merged into the `data.kz` file.\\n\\n## Unnesting Arbitrary Subqueries\\n\\nConsider the following query that finds the names of users `a` who follow at least one user `b` who is younger than `a`:\\n```\\nMATCH (a:User) \\nWHERE EXISTS { MATCH (a)-[:Follows]->(b:User) WHERE a.age > b.age} \\nRETURN a.name;\\n```\\nThe query inside `EXISTS` is a correlated subquery and very expensive to evaluate because the inner subquery needs to be evaluated for each `a` with a nested loop join operator (which is often an inefficient way to evaluate joins). In this release, we implemented an optimization that unnests correlated subqueries based on the techniques adopted from this paper [Unnesting Arbitrary Queries](https://cs.emis.de/LNI/Proceedings/Proceedings241/383.pdf) by Neumann and Kemper. This allows us to use hash joins instead of nested loop joins and execute these queries much faster. 
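Conceptually, the unnested plan evaluates something akin to the following hand-written join query (illustrative only; the rewrite happens inside the optimizer and preserves the original semantics):\\n```\\nMATCH (a:User)-[:Follows]->(b:User)\\nWHERE a.age > b.age\\nRETURN DISTINCT a.name;\\n```\\n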
More details will come in a separate blog post on both this technique and the gains we obtain."},{"id":"iamgraphviz","metadata":{"permalink":"/docusaurus/blog/iamgraphviz","source":"@site/blog/2023-07-19-iamgraphviz/index.md","title":"IAMGraphViz: Visualizing AWS IAM Permissions with K\xf9zu","description":"IAMGraphViz Overview","date":"2023-07-19T00:00:00.000Z","formattedDate":"July 19, 2023","tags":[{"label":"use-case","permalink":"/docusaurus/blog/tags/use-case"}],"readingTime":6.03,"hasTruncateMarker":true,"authors":[{"name":"Chris Norman","title":"Common Fate","url":"https://www.linkedin.com/in/chrnorm/?originalSubdomain=uk","image_url":"https://www.commonfate.io/_next/image?url=%2Fheadshots%2Fchris.jpg&w=3840&q=75","imageURL":"https://www.commonfate.io/_next/image?url=%2Fheadshots%2Fchris.jpg&w=3840&q=75"},{"name":"Chang Liu","url":"https://www.linkedin.com/in/mewim/","imageURL":"https://kuzudb.com/img/blog/chang.gif","key":"chang"}],"frontMatter":{"slug":"iamgraphviz","authors":[{"name":"Chris Norman","title":"Common Fate","url":"https://www.linkedin.com/in/chrnorm/?originalSubdomain=uk","image_url":"https://www.commonfate.io/_next/image?url=%2Fheadshots%2Fchris.jpg&w=3840&q=75","imageURL":"https://www.commonfate.io/_next/image?url=%2Fheadshots%2Fchris.jpg&w=3840&q=75"},"chang"],"tags":["use-case"]},"prevItem":{"title":"K\xf9zu 0.0.7 Release","permalink":"/docusaurus/blog/kuzu-0.0.7-release"},"nextItem":{"title":"K\xf9zu 0.0.6 Release","permalink":"/docusaurus/blog/kuzu-0.0.6-release"}},"content":"import SchemaImage from \'./schema.png\';\\nimport ReadOnlyVizImage from \'./readonlyviz.png\';\\nimport AdminVizImage from \'./adminviz.png\';\\n\\n\\n## IAMGraphViz Overview\\n\\n[Common Fate](https://www.commonfate.io/) is a framework for managing complex cloud permissions. They provide tools to simplify access at scale to AWS, Azure, and Google Cloud accounts. You can learn about what you can do with Common Fate on [their website](https://www.commonfate.io/). Here, we will talk about a recent proof-of-concept graph visualization tool called IAMGraphViz that [Chang Liu](https://www.linkedin.com/in/mewim/) (who is coauthoring this post) and I developed using K\xf9zu! IAMGraphViz is intended for infrastructure engineers to dig deep into the permission assignments in AWS IAM Identity Center using graph visualization. Using IAMGraphViz, one can easily visualize who has what type of access to different accounts on AWS as well as how they have access to these accounts. This is all done by analyzing the paths from users to accounts in a graph visualization, where the nodes and edges model users, accounts, groups, group memberships, permission sets and other entities in the AWS IAM Identity Center system.\\n\\n\x3c!--truncate--\x3e\\n\\nThe IAMGraphViz project is designed and implemented as a web application using a graph DBMS (GDBMS) to store and retrieve data. Before landing on K\xf9zu, we surveyed several other GDBMSs, such as Neo4j, but they were all harder to use. Neo4j, for example, requires hosting a separate database. We then discovered K\xf9zu, which only required a `pip install` and an import statement, and we could simply embed it into our application. In this project, our datasets could fit entirely onto a single compute node, and so K\xf9zu was far simpler for us to work with than alternatives. 
K\xf9zu is also far cheaper and more serverless-friendly than running a separate database.\\n\\nThis post follows the [Colab](https://colab.research.google.com/drive/1fotlNnOj1FGad6skBG7MRrHVdHd3jIl6) that Chang Liu created after we discussed this use case together.\\n\\nSo let\'s get to it!\\n\\n## Quick AWS IAM Overview\\n\\nWe will use the data model shown in the figure below that faithfully (but partially) models the\\ncore concepts of AWS IAM permission management. As background, let\'s first review these concepts, all\\nof which will be modeled as nodes in K\xf9zu.\\nWe will keep the definitions as simple as we can to keep the post short and provide links\\nto the necessary AWS IAM documentation: \\n\\n
<img src={SchemaImage}/>\\n\\n
\\n\\n1. **[User](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_users.html)** represents a \\nuser, e.g., an actual human user, who can get access to AWS accounts (and through accounts to AWS resources).\\n\\n2. **[Group](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_groups.html)** is a collection of IAM users and lets you specify permissions for multiple users at a time (e.g., you could have a user group called Admins with typical administrator permissions).\\nTo follow the APIs we use, instead of linking Users to Groups through a direct edge, we will do this (a bit redundantly) through a GroupMembership node.\\n\\n3. **[Account](https://docs.aws.amazon.com/organizations/latest/userguide/orgs_getting-started_concepts.html#account)**: An AWS account is the basic container for your AWS resources, such as S3 buckets,\\nAmazon Relational Database Service (Amazon RDS) databases, or Amazon Elastic Compute Cloud instances.\\nUsing multiple AWS accounts is a common practice for many reasons, e.g., providing a natural billing boundary for costs or isolating resources for security. Common Fate customers have hundreds of \\naccounts, which is not extreme.\\n\\n4. **[IAM Policy](https://docs.aws.amazon.com/IAM/latest/UserGuide/access_policies.html)**, and **[ManagedPolicy](https://docs.aws.amazon.com/IAM/latest/UserGuide/access_policies_managed-vs-inline.html#aws-managed-policies)**: An IAM policy contains permissions for using some AWS resources. An AWS managed policy is a policy with a unique Amazon Resource Name (ARN), e.g., `arn:aws:iam::aws:policy/IAMReadOnlyAccess`, that is administered by AWS. Managed policies are common policies used by many enterprises. Managed policies are simpler to use than writing your own custom policies. \\nFor simplicity, we will only model AWS managed policies in this post.\\n\\n5. **[PermissionSet](https://docs.aws.amazon.com/singlesignon/latest/userguide/permissionsetsconcept.html)** is a set of policies that can be attached to users or groups (through AccountAssignments, which we explain momentarily). For example, you can create a Database Admin permission set that includes policies for administering Amazon RDS, DynamoDB, and Aurora services, and use that single permission set to grant access to a list of target AWS accounts. Similar to GroupMembership nodes, to follow the APIs we use, instead of linking ManagedPolicy nodes to PermissionSet nodes through a direct edge, we will link them through a ManagedPolicyAttachment node.\\n\\n6. **[Account Assignment](https://aws.amazon.com/about-aws/whats-new/2020/09/aws-single-sign-on-adds-account-assignment-apis-and-aws-cloudformation-support-to-automate-multi-account-access-management/)**: We will connect users and/or groups to AWS accounts with a specific permission set through an `AccountAssignment` node (see the schema above). \\n\\n## Example Visualizations\\n\\n### Data Generation\\nIn the attached [Colab notebook](https://colab.research.google.com/drive/1fotlNnOj1FGad6skBG7MRrHVdHd3jIl6), we first generate some test data\\ncontaining Users, Groups, ManagedPolicies, PermissionSets, etc. For simplicity, we assume that there are three fixed groups: \\"Admins\\", \\"Developers\\", and \\"Auditors\\", and three ManagedPolicies: \\"AdministratorAccess\\", \\"PowerUserAccess\\", and \\"ReadOnlyAccess\\". 
Users, Accounts, \\nAccountAssignments, and PermissionSets are randomly generated and we randomly link different nodes to\\nother nodes according to our schema.\\n\\n### Visualization 1: Draw all users with direct or indirect `ReadOnlyAccess` access to an account\\n\\nIn our first query, we are given a particular account we would like to investigate and find\\nall users who have `ReadOnlyAccess` to the resources of this account. Let\'s assume\\nthe account\'s name is \\"account-db2071\\".\\n \\n``` cypher\\nMATCH (u:User)<-[l*1..3]-(aa:AccountAssignment)-[l5]-(a:Account),\\n(aa:AccountAssignment)-[aaps]->(ps:PermissionSet)<-[psmpa]-(mpa:ManagedPolicyAttachment)-[mpap]->(p:ManagedPolicy)\\nWHERE p.id = \\"arn:aws:iam::aws:policy/ReadOnlyAccess\\" AND a.sid = \\"account-db2071\\"\\nRETURN *;\\n```\\n\\nIn the actual IAMGraphViz implementation, we template this query with two parameters, one for the \\naccount ID, and one for the managed policy, which users pick interactively by selecting from\\na dropdown menu.\\nNote also that the `[l*1..3]` binding is a variable-length path because we want to find\\nboth the direct connections from a `User` to an `AccountAssignment` (that is further connected to\\n`ManagedPolicy`) as well as \\nindirect connections through a `Group` node. The visualization we generate is shown below:\\n\\n
<img src={ReadOnlyVizImage}/>\\n\\n
\\n\\nNote the presence of users both directly and indirectly connected to the account.\\nThe visualization in both the actual implementation and the [Colab notebook](https://colab.research.google.com/drive/1fotlNnOj1FGad6skBG7MRrHVdHd3jIl6) is generated simply \\nby converting the results of the query into the node and link objects of the graph visualization library,\\ne.g., pyvis in the case of the Colab notebook.\\n\\n### Visualization 2: Draw all accounts a user has `AdministratorAccess` to\\n\\nIn our second query, we are given a particular user we would like to investigate and find all accounts that the user has `AdministratorAccess` to. Let\'s assume the user\'s name is \\"Steven Rose\\". \\n\\nTo retrieve the accounts, we define a Cypher query very similar to the previous one. The only difference is that, instead of using the account as the query predicate, we now use the user. The query is as follows:\\n\\n``` cypher\\nMATCH (u:User)<-[l*1..3]-(aa:AccountAssignment)-[l5]-(a:Account),\\n(aa:AccountAssignment)-[aaps]->(ps:PermissionSet)<-[psmpa]-(mpa:ManagedPolicyAttachment)-[mpap]->(p:ManagedPolicy)\\nWHERE p.id = \\"arn:aws:iam::aws:policy/AdministratorAccess\\" AND u.name = \\"Steven Rose\\"\\nRETURN *;\\n```\\n\\nThe visualization we generate is shown below:\\n\\n
<img src={AdminVizImage}/>\\n\\n
\\n\\n## Closing Words\\nMany other graph visualizations can be helpful for infrastructure engineers to analyze the \\nIAM network of an enterprise. For example, to find inconsistent privileges given to users,\\nwe might want to *find and plot multiple paths from a user to an account with different privileges*.\\nOr we might want to extend our model with more fine-grained resources that are connected to accounts\\nand analyze paths from users to these resources (see the [PMapper](https://github.com/nccgroup/PMapper) project that models the IAM data in a more detailed way). The key takeaway is this: graph visualizations can be very powerful for analyzing cloud permission data, and embedding K\xf9zu into your applications\\nto develop tools like IAMGraphViz is extremely easy and fun \ud83e\udd73\ud83d\ude4c\ud83d\udcaa!"},{"id":"kuzu-0.0.6-release","metadata":{"permalink":"/docusaurus/blog/kuzu-0.0.6-release","source":"@site/blog/2023-07-17-kuzu-v-0.0.6.md","title":"K\xf9zu 0.0.6 Release","description":"We are thrilled to announce the release of K\xf9zu 0.0.6, which focuses on addressing bugs reported by our users. We addressed the following issues in this bug-fix release:","date":"2023-07-17T00:00:00.000Z","formattedDate":"July 17, 2023","tags":[{"label":"release","permalink":"/docusaurus/blog/tags/release"}],"readingTime":0.575,"hasTruncateMarker":false,"authors":[{"name":"K\xf9zu Team","url":"https://github.com/kuzudb/","imageURL":"https://kuzudb.com/img/blog/team.jpg","key":"team"}],"frontMatter":{"slug":"kuzu-0.0.6-release","authors":["team"],"tags":["release"]},"prevItem":{"title":"IAMGraphViz: Visualizing AWS IAM Permissions with K\xf9zu","permalink":"/docusaurus/blog/iamgraphviz"},"nextItem":{"title":"K\xf9zu 0.0.5 Release","permalink":"/docusaurus/blog/kuzu-0.0.5-release"}},"content":"We are thrilled to announce the release of K\xf9zu 0.0.6, which focuses on addressing bugs reported by our users. We addressed the following issues in this bug-fix release:\\n\\n1. Resolved a segmentation fault occurring while loading overflow data types with parallelism.\\n2. Fixed an out-of-bounds read in the LIST vector null buffer.\\n3. Implemented several missing data types in the C, Java, Rust, and Python API bindings.\\n\\nFor more detailed information about the changes in this release, please visit [this link](https://github.com/kuzudb/kuzu/releases/tag/v0.0.6). \\n\\nWe extend our sincere gratitude to all our users who reported these bugs, as well as to everyone who supported us throughout this process. Your feedback is instrumental in making K\xf9zu better!"},{"id":"kuzu-0.0.5-release","metadata":{"permalink":"/docusaurus/blog/kuzu-0.0.5-release","source":"@site/blog/2023-07-10-kuzu-v-0.0.5.md","title":"K\xf9zu 0.0.5 Release","description":"We are very happy to release K\xf9zu 0.0.5 today! 
This release comes with the following new main features and improvements:","date":"2023-07-10T00:00:00.000Z","formattedDate":"July 10, 2023","tags":[{"label":"release","permalink":"/docusaurus/blog/tags/release"}],"readingTime":4.14,"hasTruncateMarker":true,"authors":[{"name":"K\xf9zu Team","url":"https://github.com/kuzudb/","imageURL":"https://kuzudb.com/img/blog/team.jpg","key":"team"}],"frontMatter":{"slug":"kuzu-0.0.5-release","authors":["team"],"tags":["release"]},"prevItem":{"title":"K\xf9zu 0.0.6 Release","permalink":"/docusaurus/blog/kuzu-0.0.6-release"},"nextItem":{"title":"K\xf9zu 0.0.4 Release","permalink":"/docusaurus/blog/kuzu-0.0.4-release"}},"content":"We are very happy to release K\xf9zu 0.0.5 today! This release comes with the following new main features and improvements: \\n\\n- [Cypher Features](2023-07-10-kuzu-v-0.0.5.md#cypher-features)\\n - [Named Path](2023-07-10-kuzu-v-0.0.5.md#named-path)\\n - [Filters of Relationships in Recursive Patterns](2023-07-10-kuzu-v-0.0.5.md#filters-of-relationships-in-recursive-patterns)\\n - [All Shortest Paths](2023-07-10-kuzu-v-0.0.5.md#all-shortest-paths)\\n - [\`Call\` Clause](2023-07-10-kuzu-v-0.0.5.md#call-clause)\\n- [Modifying Database Configurations](2023-07-10-kuzu-v-0.0.5.md#modifying-database-configurations)\\n- [Data Types](2023-07-10-kuzu-v-0.0.5.md#data-types)\\n - [\`BLOB\`](2023-07-10-kuzu-v-0.0.5.md#blob)\\n- [Client APIs: Rust and Java](2023-07-10-kuzu-v-0.0.5.md#client-apis-rust-and-java)\\n- [Development: Testing Framework](2023-07-10-kuzu-v-0.0.5.md#development-testing-framework)\\n\\n\x3c!--truncate--\x3e\\n\\n## Cypher Features\\n\\n### Named Paths\\nThis release introduces named paths. Users can now assign a named variable to a connected graph pattern. For example, the following query returns all paths between \`Adam\` and \`Karissa\`.\\n\`\`\`\\nMATCH p = (a:User)-[:Follows]->(b:User) \\nWHERE a.name = \'Adam\' AND b.name = \'Karissa\' \\nRETURN p;\\n\`\`\`\\nNamed paths can also be assigned to recursive graph patterns as follows:\\n\`\`\`\\nMATCH p = (a:User)-[:Follows*1..2]->(:User)-[:LivesIn]->(:City) \\nWHERE a.name = \'Adam\' \\nRETURN p;\\n\`\`\`\\nOne can also assign multiple named paths in a \`MATCH\` clause:\\n\`\`\`\\nMATCH p1 = (a:User)-[:Follows]->(b:User), p2 = (b)-[:LivesIn]->(:City) \\nWHERE a.name = \'Adam\' \\nRETURN p1, p2;\\n\`\`\`\\nInternally, a path is processed as a \`STRUCT\` with two fields: a nodes field with key \`_NODES\` and type \`LIST[NODE]\`, and a rels field with key \`_RELS\` and type \`LIST[REL]\`. See [\`PATH\`](https://kuzudb.com/docusaurus/cypher/data-types/path) for details. Users can access the nodes and rels fields with \`nodes(p)\` and \`rels(p)\` function calls as follows:\\n\`\`\`\\nMATCH p = (a:User)-[:Follows*1..2]->(:User) \\nWHERE a.name = \'Adam\' \\nRETURN nodes(p), (rels(p)[1]).since;\\n\`\`\`\\n\\n### Filters of Relationships in Recursive Patterns\\nUsers can now put predicates on the relationships that will be \\"traversed/joined\\" in recursive patterns.\\nFor example, the following query finds the names of users that are followed by Adam directly or indirectly through 2 hops, where *the following started before 2022 (the r.since < 2022 predicate)*:\\n\`\`\`\\nMATCH p = (a:User)-[:Follows*1..2 (r, _ | WHERE r.since < 2022)]->(b:User)\\nWHERE a.name = \'Adam\' \\nRETURN DISTINCT b.name;\\n\`\`\`\\nOur filter grammar follows [Memgraph\'s syntax](https://memgraph.com/docs/memgraph/reference-guide/built-in-graph-algorithms). 
The first variable \`r\` in the \`(r, _ | WHERE r.since < 2022)\` predicate binds to the relationships in the recursive pattern and the \`_\` binds to the nodes. Since we currently don\'t allow filters on recursive nodes, the second variable must be \`_\` for now.\\n\\n### All Shortest Paths\\nK\xf9zu now supports all shortest paths semantics with the keyword \`ALL SHORTEST\`. The following query finds all shortest paths of up to length 3 between \`Zhang\` and \`Waterloo\`, considering relationships of all labels (i.e., this is an unlabeled query; you can restrict the labels by adding them as \`[:Follows* ALL SHORTEST 1..3]\`).\\n\`\`\`\\nMATCH p = (a)-[* ALL SHORTEST 1..3]-(b) \\nWHERE a.name = \'Zhang\' AND b.name = \'Waterloo\' \\nRETURN p;\\n\`\`\`\\nSee [All Shortest Paths](https://kuzudb.com/docusaurus/cypher/query-clauses/match#all-shortest-path) in our documentation for more information.\\n\\n### \`Call\` Clause\\n\\nThis release introduces \`Call\` as a reading clause. Similar to [Neo4j](https://neo4j.com/docs/cypher-manual/current/clauses/call/), the \`Call\` clause is used to execute procedures. The release also contains a set of predefined procedures that can be used to query the database schema. For example, the following query returns all metadata of the \`User\` table:\\n\`\`\`\\nCALL table_info(\'User\') RETURN *;\\n---------------------------------------------\\n| property id | name | type | primary key |\\n---------------------------------------------\\n| 0 | name | STRING | True |\\n---------------------------------------------\\n| 1 | age | INT64 | False |\\n---------------------------------------------\\n\`\`\`\\n\\n\`Call\` can be used together with other clauses in the same way as a reading clause:\\n\`\`\`\\nCALL table_info(\'User\') WITH * WHERE name STARTS WITH \'a\' RETURN name;\\n--------\\n| name |\\n--------\\n| age |\\n--------\\n\`\`\`\\n\\nMore built-in procedures can be found [here](https://kuzudb.com/docusaurus/cypher/query-clauses/call).\\n\\n## Modifying Database Configurations\\n\\n\`CALL\` has another usage: you can now modify database configurations through a \`Call param=x\` pattern. For example, the following sets the maximum number of threads for query execution to 5:\\n\`\`\`\\nCALL THREADS=5;\\n\`\`\`\\n\\nMore configuration options can be found [here](https://kuzudb.com/docusaurus/cypher/configuration).\\n\\n## Data Types\\n\\n### \`BLOB\`\\n\\nWe have also added the \`BLOB\` type to store arbitrary binary objects. Here is an example query returning a blob:\\n\\n\`\`\`\\nRETURN BLOB(\'\\\\\\\\xBC\\\\\\\\xBD\\\\\\\\xBA\\\\\\\\xAA\') as result;\\n---------------------------------------------\\n| result |\\n---------------------------------------------\\n| \\\\xBC\\\\xBD\\\\xBA\\\\xAA |\\n---------------------------------------------\\n\`\`\`\\n\\nMore information on the blob data type can be found [here](https://kuzudb.com/docusaurus/cypher/data-types/blob).\\n\\n## Client APIs: Rust and Java\\nIn this release, we\'re expanding the accessibility of K\xf9zu, bridging the gap with some of the most popular programming languages in the developer community. 
Specifically, we now have [Rust](https://kuzudb.com/docusaurus/client-apis/rust) and [Java](https://kuzudb.com/docusaurus/client-apis/java) APIs.\\n\\n## Development: Testing Framework\\nStarting with this release, we\'re adding some development guidelines to encourage and facilitate outside contributions from the broader open source community.\\n\\nTesting is a crucial part of K\xf9zu\'s development and ensures the correct functioning of the system.\\nIn this release, we\'ve implemented significant changes to our testing framework. Our approach to testing is rooted in the principle of end-to-end tests rather than individual unit tests.\\nWhenever possible, we express tests end-to-end as Cypher statements. \\nTo this end, we\'ve designed a custom testing framework that enables thorough end-to-end testing via Cypher statements.\\n\\nOur testing framework draws inspiration from [SQLLogicTest](https://www.sqlite.org/sqllogictest/doc/trunk/about.wiki), albeit with customized syntax tailored to our needs.\\nFor a more detailed overview of our testing framework, please visit [here](https://kuzudb.com/docusaurus/development/testing-framework)."},{"id":"kuzu-0.0.4-release","metadata":{"permalink":"/docusaurus/blog/kuzu-0.0.4-release","source":"@site/blog/2023-06-05-kuzu-v-0.0.4.md","title":"K\xf9zu 0.0.4 Release","description":"We are very happy to release K\xf9zu 0.0.4 today! This release comes with the following new main features and improvements:","date":"2023-06-05T00:00:00.000Z","formattedDate":"June 5, 2023","tags":[{"label":"release","permalink":"/docusaurus/blog/tags/release"}],"readingTime":7.01,"hasTruncateMarker":true,"authors":[{"name":"K\xf9zu Team","url":"https://github.com/kuzudb/","imageURL":"https://kuzudb.com/img/blog/team.jpg","key":"team"}],"frontMatter":{"slug":"kuzu-0.0.4-release","authors":["team"],"tags":["release"]},"prevItem":{"title":"K\xf9zu 0.0.5 Release","permalink":"/docusaurus/blog/kuzu-0.0.5-release"},"nextItem":{"title":"Scaling Pytorch Geometric GNNs With K\xf9zu","permalink":"/docusaurus/blog/kuzu-pyg-remote-backend"}},"content":"We are very happy to release K\xf9zu 0.0.4 today! This release comes with the following new main features and improvements: \\n- [Data Ingestion Improvements](2023-06-05-kuzu-v-0.0.4.md#data-ingestion-improvements)\\n- [New Cypher Features](2023-06-05-kuzu-v-0.0.4.md#new-cypher-features)\\n - [Undirected Relationships in Queries](2023-06-05-kuzu-v-0.0.4.md#undirected-relationships-in-queries)\\n - [Recursive Queries: Shortest Path Queries and Improved Variable-length Queries](2023-06-05-kuzu-v-0.0.4.md#recursive-queries-shortest-path-queries-and-improved-variable-length-queries)\\n- [New Data Types](2023-06-05-kuzu-v-0.0.4.md#new-data-types)\\n - [\`SERIAL\`](2023-06-05-kuzu-v-0.0.4.md#serial)\\n - [\`STRUCT\`](2023-06-05-kuzu-v-0.0.4.md#struct)\\n- [Client APIs](2023-06-05-kuzu-v-0.0.4.md#client-apis)\\n - [Windows compatibility](2023-06-05-kuzu-v-0.0.4.md#windows-compatibility)\\n - [C](2023-06-05-kuzu-v-0.0.4.md#c)\\n - [Node.js](2023-06-05-kuzu-v-0.0.4.md#nodejs)\\n\x3c!--truncate--\x3e\\n\\n\\n## Data Ingestion Improvements\\nWe continue to improve our data ingestion in this release. 
\\nWe still rely on Apache Arrow to parse parquet and csv files.\\nSeveral bottlenecks in our earlier implementation have now been identified and optimized, including copying from Arrow arrays and the construction of hash indexes.\\nWe now also store null bits separately, which simplifies our loading logic and makes it faster.\\n\\nHere are some benchmark numbers for loading two node and two rel tables that only contain primitive types or strings from the LDBC benchmark:\\n\\n- CPU: Apple M1 Max\\n- Disk: 2TB SSD\\n- System Memory: 32GB\\n- Dataset: LDBC-100\\n- Number of threads: 10\\n\\n| Files | # lines | file size | v0.0.3 | v0.0.4 |\\n| ----------- | ----------- | ----------- | ----------- | ----------- |\\n| comment.csv | 220M | 22.49 GB | 890s | **108s (8.2x)** |\\n| post.csv | 58M | 7.68 GB | 304s | **32s (9.5x)** |\\n| likesComment.csv | 242M | 13 GB | 772s | **142s (5.4x)** |\\n| knows.csv | 20M | 1.1 GB | 80s | **21s (3.8x)** |\\n\\nBesides performance improvements, we now also allow interrupting \`COPY\` statements in the shell.\\nYou can interrupt long-running \`COPY\` statements without crashing the shell.\\n\\nWe will continue to improve our data ingestion to make it more efficient and robust as we\'re moving to the [new storage design](https://github.com/kuzudb/kuzu/issues/1474) in the coming releases. Please stay tuned!\\n\\n## New Cypher Features\\n\\n### Undirected Relationships in Queries\\nK\xf9zu now supports undirected relationships in Cypher queries. An undirected relationship is the union of both incoming and outgoing relationships. This feature is mostly useful in the following two cases. \\n\\n**Case 1: Relationship is undirected by nature**\\nRelationships between nodes in K\xf9zu are currently directed (though we are internally debating adding a native undirected relationship type). A relationship file must contain \`FROM\` and \`TO\` columns, each of which refers to a primary key column of a node table. However, sometimes the nature of the relationships is undirected, e.g., \`isFriendOf\` relationships in a social network. \\n\\nCurrently, you have two options: (1) you can either store each friendship twice, e.g., \`Alice isFriendOf Bob\` and \`Bob isFriendOf Alice\`. This is a bad choice because internally K\xf9zu will index each edge twice (in the forward and backward directions), so this one fact ends up getting stored 4 times. Or (2) you can store it once, say \`Alice isFriendOf Bob\`. \\n\\nThe advantage of option (1) was that in K\xf9zu v0.0.3, if you wanted to find all friends of \`Alice\`, you could simply ask this query:\\n\`\`\`\\nMATCH (a:Person)-[:isFriendOf]->(b:Person)\\nWHERE a.name = \'Alice\' RETURN b;\\n\`\`\`\\nInstead, if you chose option (2), you would have to ask two queries, one to \`MATCH (a:Person)-[:isFriendOf]->(b:Person)\` and the other to \`MATCH (a:Person)<-[:isFriendOf]-(b:Person)\`, and \`UNION\` them, which gets messy if you want to do more with those neighbors (e.g., find their neighbors etc.). \\n\\nWith undirected edge support, you can now choose option (2) and find \`Alice\`\'s friends with:\\n\`\`\`\\nMATCH (a:Person)-[:isFriendOf]-(b:Person)\\nWHERE a.name = \'Alice\'\\nRETURN b;\\n\`\`\`\\nSo if you do not specify a direction in your relationships, K\xf9zu will automatically query both the forward and backward relationships for you.\\n\\n*Note from K\xf9zu developers: As noted above, we are debating a native undirected relationship type. That would solve the problem of picking a fake direction in which to save an undirected relationship: 
should it be \`Alice-[isFriendOf]->Bob\` or vice versa? Happy to hear your thoughts on this.*\\n\\n**Case 2: Relationship direction is not of interest**\\nAlthough relationships are stored in a directed way, their direction may not be of interest in the query. The following query tries to find all comments that have interacted with the comments authored by \`K\xf9zu\`. These comments could be either replying to or replied to by \`K\xf9zu\`\'s comments. The query can be asked naturally in an undirected way.\\n\\n\`\`\`\\nMATCH (c:Comment)-[:replyOf]-(other:Comment)\\nWHERE c.author = \'K\xf9zu\'\\nRETURN other;\\n\`\`\`\\n\\n### Recursive Queries: Shortest Path Queries and Improved Variable-length Queries\\nThis release brings in the beginnings of a series of major improvements we will be making to recursive joins.\\nThe two major changes in this release are: \\n\\n**Multi-labeled and Undirected Variable-length Join Queries**\\nPrior to this release, we supported variable-length join queries only in the restricted case when the variable-length relationship could have a single relationship label and was directed. For example, you could write this query:\\n\`\`\`\\nMATCH (a:Person)-[:knows*1..2]->(b:Person)\\nWHERE a.name = \'Alice\' \\nRETURN b\\n\`\`\`\\nBut you couldn\'t ask for arbitrarily labeled variable-length relationships between Persons \`a\` and \`b\` (though you\\ncould write the non-recursive version of that query: \`MATCH (a:Person)-[:knows]->(b:Person) ...\`). \\nSimilarly, we did not support the undirected version of the query: \`MATCH (a:Person)-[:knows*1..2]-(b:Person)\`.\\nK\xf9zu now supports multi-label as well as undirected variable-length relationships.\\nFor example, the following query finds all nodes that are reachable within 1 to 3 hops from \`Alice\`, irrespective\\nof the labels on the connections or destination \`b\` nodes:\\n\`\`\`\\nMATCH (a:Person)-[e:*1..3]-(b)\\nWHERE a.name = \'Alice\'\\nRETURN b;\\n\`\`\`\\n\\n**Shortest path**\\n\\nFinally, we got to implementing an initial version of shortest path queries. You can find (one of the) shortest paths between nodes by adding the \`SHORTEST\` keyword to a variable-length relationship. The following query asks for a shortest path between \`Alice\` and all active users that \`Alice\` follows within 10 hops, and returns these users, the paths, and the lengths of the shortest paths.\\n\\n\`\`\`\\nMATCH (a:User)-[p:Follows* SHORTEST 1..10]->(b:User)\\nWHERE a.name = \'Alice\' AND b.state = \'Active\'\\nRETURN b, p, length(p)\\n\`\`\`\\n\\nThe \`p\` in the query binds to the sequence of relationship, node, relationship, node, etc. Currently we only return the internal IDs of the relationships and nodes (soon, we will return all their properties).\\n\\n## New Data Types\\n\\n### \`SERIAL\`\\nThis release introduces the \`SERIAL\` data type. 
Similar to \`AUTO_INCREMENT\` supported by many other databases, \`SERIAL\` is mainly used to create \\nan incrementing sequence of unique identifiers in a column, which can serve as a primary key column.\\n\\nExample:\\n\\n\`person.csv\`\\n\`\`\`\\nAlice\\nBob\\nCarol\\n\`\`\`\\n\\n\`\`\`\\nCREATE NODE TABLE Person(ID SERIAL, name STRING, PRIMARY KEY(ID));\\nCOPY Person FROM \\"person.csv\\";\\nMATCH (a:Person) RETURN a;\\n\`\`\`\\nOutput:\\n\`\`\`\\n-------------------------------------------\\n| a |\\n-------------------------------------------\\n| (label:Person, 3:0, {ID:0, name:Alice}) |\\n-------------------------------------------\\n| (label:Person, 3:1, {ID:1, name:Bob}) |\\n-------------------------------------------\\n| (label:Person, 3:2, {ID:2, name:Carol}) |\\n-------------------------------------------\\n\`\`\`\\n\\nWhen the primary keys of your node tables are already consecutive integers starting from 0, you should omit the primary key column in the input file and make the primary key a \`SERIAL\` type. This will improve loading time significantly. Similarly, queries that need to scan the primary key will also get faster. That\'s because internally we will not store a hash index or primary key column, so any scan over the primary key will not trigger disk I/O.\\n\\n### \`STRUCT\`\\nK\xf9zu now supports the \`STRUCT\` data type, similar to [composite types](https://www.postgresql.org/docs/current/rowtypes.html) in Postgres. Here is an example:\\n\\n\`\`\`\\nWITH {name:\'University of Waterloo\', province:\'ON\'} AS institution\\nRETURN institution.name AS name;\\n\`\`\`\\nOutput:\\n\`\`\`\\n--------------------------\\n| name |\\n--------------------------\\n| University of Waterloo |\\n--------------------------\\n\`\`\`\\nWe support storing structs as node properties for now. For example, you can create: \`CREATE NODE TABLE Foo(name STRING, exStruct STRUCT(x INT16, y STRUCT(z INT64, w STRING)), PRIMARY KEY (name))\`. We will support storing structs on relationships soon. As shown in the \`CREATE NODE\` example above, you can store arbitrarily\\nnested structs, e.g., structs that contain structs as a field, on nodes. One feature still missing for now is storing and processing \`LIST\`s of composite types. \\n\\n**Note**: Updating \`STRUCT\` columns with update statements is not supported in this release but will come soon.\\n\\n## Client APIs\\n\\n### Windows compatibility\\nDevelopers can now build K\xf9zu from scratch on the Windows platform! Together with this release we also provide pre-built libraries and Python wheels for Windows.\\n\\n### C\\nWe provide an official C language binding in this release. Developers can now embed K\xf9zu with native C interfaces.\\n\\n### Node.js\\nWe provide an official Node.js language binding. With the Node.js API, developers can leverage K\xf9zu\'s analytical capabilities in their Node.js projects. 
We will\\nsoon follow this blog post with one (or a few) blog posts on developing some applications with Node.js."},{"id":"kuzu-pyg-remote-backend","metadata":{"permalink":"/docusaurus/blog/kuzu-pyg-remote-backend","source":"@site/blog/2023-05-10-kuzu-pyg-rb.md","title":"Scaling Pytorch Geometric GNNs With K\xf9zu","description":"In this post, we\'ll walk through how to use K\xf9zu as a Pytorch Geometric (PyG) Remote Backend to train a GNN model on very large graphs that do not fit on your machine\'s RAM.","date":"2023-05-10T00:00:00.000Z","formattedDate":"May 10, 2023","tags":[{"label":"use-case","permalink":"/docusaurus/blog/tags/use-case"}],"readingTime":12.39,"hasTruncateMarker":true,"authors":[{"name":"Chang Liu","url":"https://www.linkedin.com/in/mewim/","imageURL":"https://kuzudb.com/img/blog/chang.gif","key":"chang"},{"name":"Semih Saliho\u011flu","title":"CEO of K\xf9zu Inc & Associate Prof. at UWaterloo","url":"https://cs.uwaterloo.ca/~ssalihog/","imageURL":"https://kuzudb.com/img/blog/semih.jpg","key":"semih"}],"frontMatter":{"slug":"kuzu-pyg-remote-backend","authors":["chang","semih"],"tags":["use-case"]},"prevItem":{"title":"K\xf9zu 0.0.4 Release","permalink":"/docusaurus/blog/kuzu-0.0.4-release"},"nextItem":{"title":"K\xf9zu 0.0.3 Release","permalink":"/docusaurus/blog/kuzu-0.0.3-release"}},"content":"In this post, we\'ll walk through how to use K\xf9zu as a [Pytorch Geometric (PyG) _Remote Backend_](https://pytorch-geometric.readthedocs.io/en/latest/advanced/remote.html) to train a GNN model on very large graphs that do not fit on your machine\'s RAM. \\n\\n\\nLet\'s start with a quick overview of PyG Remote Backends: PyG Remote Backends are plug-in replacements for PyG\'s in-memory graph and feature stores, so they can be used seamlessly with the rest of the PyG interfaces to develop your GNN models. If a PyG Remote Backend is a disk-based storage system, such as K\xf9zu, PyG will fetch subgraphs from K\xf9zu, which stores and scans its data from disk, allowing you to train models on very large graphs for which PyG\'s in-memory storage would run out of memory and fail.\\n\\n\x3c!--truncate--\x3e\\n\\nAs you\'ll see, if you already have PyG models you have developed in Python, replacing PyG\'s default storage with K\xf9zu is extremely simple. ***It \\nconsists of loading your graph into K\xf9zu and then changing 1 line of code in your PyG model***. To demonstrate how simple this is and how it performs,\\nwe will follow this [Sample Code](https://github.com/pyg-team/pytorch_geometric/tree/master/examples/kuzu/papers_100M).\\nSo let\'s get to it!\\n\\n## Dataset, Predictive Task, and GNN Model\\n\\nLet\'s start by describing our graph dataset, our predictive task, and the GNN model we will use for the predictive task.\\n\\n**Dataset**: We will use the \`ogbn-papers100M\` dataset of ~100M nodes and ~2.5B edges from the [Open Graph Benchmark](https://ogb.stanford.edu/) (OGB). To find the dataset,\\nyou can search for \\"ogbn-papers100M\\" [here](https://ogb.stanford.edu/docs/nodeprop/). The dataset takes about 128GB of RAM when using PyG\'s default in-memory storage. 
The graph\'s nodes and edges model the following:\\n\\n_Nodes_ are papers that have these properties:\\n\\n- \`ID\`: an int64 node identifier\\n- \`year\`: the publication year of the paper (you can ignore this, as it will not be used in our example, but this property is part of the dataset)\\n- \`x\`: 128-dimensional node features (so 128-size float tensors)\\n- \`y\`: a numeric label indicating the category/field of the paper. These numbers indicate different [arXiv categories](https://arxiv.org/category_taxonomy) for\\n papers. Although the exact mapping is not important, you can think of these, for example, as 0 indicating \\"physics\\", 2 indicating \\"geometry\\", etc.\\n\\n_Edges/Relationships_ are citations between papers and do not contain any properties.\\n\\n**Predictive task:** Predict the \`y\` labels of nodes using the node features stored in the \`x\` properties.\\n\\n**GNN Model**: We will train a 3-layer GraphSage model that contains 5.6 million parameters to perform this predictive task. Our model is based on the implementation [here](https://github.com/mengyangniu/ogbn-papers100m-sage/tree/main). We picked this model because it was one of the better-performing models in the [PyG Leaderboard for the ogbn-papers100M dataset](https://ogb.stanford.edu/docs/leader_nodeprop/) (search \\"GraphSAGE_res_incep\\" under \\"Leaderboard for ogbn-papers100M\\") that we could develop using pre-existing layers in the PyG library (so we do not have to write any custom layers).\\n\\n## Step 1: Preliminaries and Loading ogbn-papers100M into K\xf9zu\\n\\nAs a preliminary, the [\`prepare_data.py\`](https://github.com/pyg-team/pytorch_geometric/blob/master/examples/kuzu/papers_100M/prepare_data.py) script in [Sample Code](https://github.com/pyg-team/pytorch_geometric/tree/master/examples/kuzu/papers_100M) generates four numpy files, one for each property of the papers: (i) \`./ids.npy\`; (ii) \`./node_feat.npy\` (storing \`x\` properties); (iii) \`./node_year.npy\`; and (iv) \`./node_label.npy\` (storing \`y\` labels). In addition, it will generate an \`./edge_index.csv\` file that stores the citation relationships. In the below code snippets, we will assume you have gone through those steps.\\n\\nLet\'s start with how you load the \`ogbn-papers100M\` dataset into K\xf9zu. 
You will first need to define a \`paper\` NODE TABLE and a \`cites\` REL TABLE, whose schemas exactly follow the structure of the dataset, and then use \`COPY FROM\` statements in K\xf9zu\'s version of Cypher to ingest those numpy and csv files into your \`paper\` and \`cites\` tables:\\n\\n\`\`\`\\n...\\nimport kuzu\\nimport numpy as np\\n...\\n\\nprint(\\"Creating an empty K\xf9zu database under the papers100M directory...\\")\\ndb = kuzu.Database(\'papers100M\')\\nconn = kuzu.Connection(db, num_threads=cpu_count())\\nprint(\\"Creating K\xf9zu tables...\\")\\nconn.execute(\\n \\"CREATE NODE TABLE paper(id INT64, x FLOAT[128], year INT64, y FLOAT, \\"\\n \\"PRIMARY KEY (id));\\")\\nconn.execute(\\"CREATE REL TABLE cites (FROM paper TO paper, MANY_MANY);\\")\\nprint(\\"Copying nodes to K\xf9zu tables...\\")\\nconn.execute(\'COPY paper FROM (\\"%s\\", \\"%s\\", \\"%s\\", \\"%s\\") BY COLUMN;\' %\\n (\'./ids.npy\', \'./node_feat.npy\', \'./node_year.npy\', \'./node_label.npy\'))\\nprint(\\"Copying edges to K\xf9zu tables...\\")\\nconn.execute(\'COPY cites FROM \\"%s\\";\' % (\'./edge_index.csv\'))\\nprint(\\"All done!\\")\\n\`\`\`\\n\\nThe one important note here is that you should store your node features using [K\xf9zu\'s FIXED-LIST data type](https://kuzudb.com/docs/cypher/data-types/list.html) with the \`FLOAT[128]\` syntax (instead of the less efficient VAR-LIST data type, which uses the \`FLOAT[]\` syntax for lists that can have different lengths). FIXED-LIST is a data type that we specifically added to K\xf9zu to efficiently store node features and embeddings in graph ML applications.\\n\\n## Step 2: Get K\xf9zu Remote Backend by Calling \`db.get_torch_geometric_remote_backend()\`\\n\\nAfter loading your data into K\xf9zu, the only thing you have to do is call the \`get_torch_geometric_remote_backend()\` function on your Database object \`db\`:\\n\\n\`\`\`\\nfeature_store, graph_store = db.get_torch_geometric_remote_backend(multiprocessing.cpu_count())\\n\`\`\`\\n\\nThis function returns two objects that implement PyG\'s Remote Backend interfaces: (i) \`feature_store\` is an instance of [\`torch_geometric.data.FeatureStore\`](https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.data.FeatureStore.html#torch_geometric.data.FeatureStore); and (ii) \`graph_store\` is an instance of [\`torch_geometric.data.GraphStore\`](https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.data.GraphStore.html#torch_geometric.data.GraphStore). These two handles are your K\xf9zu Remote Backends that you can pass to your PyG models/subgraph samplers, and they will make your existing PyG models work seamlessly with K\xf9zu! That\'s all\\nyou really have to know about how to use K\xf9zu as a Remote Backend. ***There are no more K\xf9zu functions you have to call in the rest of the demonstration. You only have\\nto make a 1-line code change in your regular PyG code.***\\nThe rest of the example contains standard code you normally write to develop your PyG models.\\n\\n## Step 3: Define & Pass K\xf9zu\'s \`feature_store\` and \`graph_store\` to your GNN Model\\n\\nFirst, we\'ll define the GraphSage model in PyG. We\'ll put \`...\`\'s here and there to shorten the example because, as we said above, this is your regular PyG code:\\n\\n\`\`\`\\n# Define the model for training. 
The model is ported from\\n# https://github.com/mengyangniu/ogbn-papers100m-sage\\nclass SAGE(nn.Module):\\n def __init__(self, in_feats, n_hidden, n_classes, n_layers, activation,\\n dropout):\\n super().__init__()\\n self.n_layers = n_layers\\n ...\\n\\n def forward(self, edge_list, x):\\n ...\\n for layer_index, layer in enumerate(self.layers):\\n ...\\n return self.mlp(collect)\\n\`\`\`\\n\\nNext, we will enable PyG to use K\xf9zu\'s Remote Backend when training. We create a [\`torch_geometric.loader.NeighborLoader\`](https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/loader/neighbor_loader.html), which is the subgraph sampler we will use, and pass the \`feature_store\` and \`graph_store\` we obtained from K\xf9zu to it. ***This is the 1-line change you have to make!***\\n\\n\`\`\`\\n# Plug the graph store and feature store into the NeighborLoader\\nkuzu_sampler = NeighborLoader(\\n data=(feature_store, graph_store),\\n num_neighbors={(\'paper\', \'cites\', \'paper\'): [12, 12, 12]},\\n batch_size=LOADER_BATCH_SIZE,\\n input_nodes=(\'paper\', input_nodes),\\n num_workers=4,\\n filter_per_worker=False,\\n)\\n\`\`\`\\n\\n**\`data=(feature_store, graph_store)\`** is the important line. When you use this sampler in training to construct mini-batches, it will perform subgraph sampling and load the required node features from K\xf9zu automatically and return a [\`torch_geometric.data.HeteroData\`](https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.data.HeteroData.html) object, which can be directly plugged into a GNN model. That training code looks like this (again abbreviated because this is all PyG code):\\n\\n\`\`\`\\nmodel = SAGE(128, 1024, 172, 3, torch.nn.functional.relu, 0.2)\\n...\\noptimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)\\ncriterion = torch.nn.CrossEntropyLoss()\\n\\nfor epoch in range(NUM_EPOCHS):\\n i = 0\\n start_time = time.time()\\n # The below for loop is where we ask the sampler to\\n # sample a mini-batch\\n for b in kuzu_sampler:\\n x = b[\'paper\'][\'x\']\\n y = b[\'paper\'][\'y\']\\n edge_index = b[\'paper\', \'cites\', \'paper\'].edge_index\\n ...\\n model.train()\\n optimizer.zero_grad()\\n out = model(edge_index, x)\\n loss = criterion(out, y)\\n loss.backward()\\n optimizer.step()\\n ...\\n i += 1\\n\`\`\`\\n\\n\`for b in kuzu_sampler:\` is the exact line where the sampler will end up calling on K\xf9zu to sample a subgraph and scan the features of the nodes in that subgraph. This all ends up using K\xf9zu\'s disk-based storage, allowing you to train GNNs on graphs that don\'t fit in your RAM. One distinct advantage of K\xf9zu is that, because it is an embeddable DBMS, \\nwe can do the conversion of scanned node features from K\xf9zu into PyG\'s tensors as a zero-copy operation. We simply write the scanned node features into a buffer array allocated in Python without any additional data transfer between the systems.\\n\\nCurrently, only the \`feature_store\` scans data from K\xf9zu\'s disk-based storage. For \`graph_store\`, our current implementation stores the entire graph topology in COO format in memory. 
This does limit how much you can scale, but in many models trained on large graphs, features take up more space than the graph topology, so scaling node features out of memory should still allow you to scale to very large graphs that won\'t fit in your RAM.\\n\\n### Adjusting K\xf9zu\'s Buffer Pool Size\\n\\nAs with most DBMSs, K\xf9zu has a buffer manager that maintains a buffer pool to keep parts of the database in memory. When you use K\xf9zu, you decide how much memory to allocate to it. The more memory you give to K\xf9zu, the less I/O it will perform on scans. So, in the context of this post, the larger the buffer manager size you set, the faster your training time will be when training large graphs out of memory. You set K\xf9zu\'s buffer pool size when you construct your \`Database\` object, before you call the \`get_torch_geometric_remote_backend()\` function. For example, the code below sets the BM size to \`40 * 1024**3\` bytes, which is equal to 40GB. For performance reasons, you should set it as high as possible without running out of memory.\\n\\n\`\`\`\\nKUZU_BM_SIZE = 40 * 1024**3\\n# Create kuzu database\\ndb = kuzu.Database(\\"papers100M\\", KUZU_BM_SIZE)\\nfeature_store, graph_store = db.get_torch_geometric_remote_backend(\\n mp.cpu_count())\\n\`\`\`\\n\\n## An Experiment Demonstrating Throughput Numbers With Different Buffer Pool Sizes\\n\\nLet\'s demonstrate what throughput numbers you can expect under different memory settings.\\nAs a baseline, we will first measure the throughput of training\\nas time/batch using PyG\'s default in-memory\\nstorage. This setting uses ~106GB of memory.\\nWe will then simulate limited memory settings by training the same\\nmodel using the K\xf9zu Remote Backend and limiting K\xf9zu\'s buffer pool size to\\ndifferent levels.\\nHere are the important configurations for the experiment:\\n\\n- Available RAM in the machine: 384GB\\n- CPU: Two Xeon Platinum 8175M (48 cores/96 threads)\\n- GPU: RTX 4090 with 24GB GPU memory\\n- SSD in the system for disk storage: 2TB Kingston KC3000 NVMe SSD\\n- Mini-batch size: 1152. Recall the \`kuzu_sampler = NeighborLoader(...)\` that we defined above. There we gave this argument\\n \`num_neighbors={(\'paper\', \'cites\', \'paper\'): [12, 12, 12]}\` to the \`NeighborLoader\`, which means that the sampler will sample 3-degree neighbors of these 1152 nodes,\\n sampling 12 neighbors at each degree.\\n We picked 1152 as our mini-batch size because this is the size at which we generate batches that take a peak of 23GB of GPU memory, so beyond this we would run out of GPU memory.[^1]\\n- \\\\# PyG Workers: 16 (we did a parameter sweep; setting this to 4, 8, or 16 performs very similarly)\\n- \\\\# K\xf9zu Query Processor Threads: 24 (48 and 96 also perform similarly)\\n\\nWe will run K\xf9zu with 60GB, 40GB, 20GB, and 10GB buffer pool sizes.\\nThe lower K\xf9zu\'s buffer pool size, the more\\ndisk I/Os K\xf9zu will perform. Note however that in this experiment K\xf9zu will use more memory than\\nthese sizes for two reasons: (i) K\xf9zu always stores some parts of the database in memory,\\nthough this is not very important in this setting; (ii) as we said, currently the\\nK\xf9zu Remote Backend uses in-memory storage for the graph topology (but not node features!),\\nwhich takes ~48GB of RAM. 
So you can roughly think of K\xf9zu as using 48GB + the BM size in these experiments.\\n\\nWe will do 500 batches of training and report the throughput number as the average end-to-end time/batch.\\nWe also report the time that\'s spent on the GPU for training as \`Training Time (s)\` and the\\ntime spent on copying data from CPU to GPU as \`CPU-to-GPU Copying Time (s)\`. For\\nK\xf9zu configurations, you can roughly\\ninterpret \`Per Batch Time (s) - Training Time (s) - CPU-to-GPU Copying Time (s)\`\\nas the time spent for scanning data from K\xf9zu into the CPU\'s memory. We expect that to increase\\nas we lower the BM size.\\n\\n| Configuration | Per Batch Time (s) | Training Time (s) | CPU-to-GPU Copying Time (s) | Time Scanning Data from K\xf9zu (s) | Memory Usage |\\n| ----------------------------- | ------------------ | ----------------- | ----------------------- | ---------------------------- | ------------ |\\n| PyG In-memory | 0.281 | 0.240 | 0.024 | --- | ~110 GB |\\n| K\xf9zu Remote Backend (bm=60GB) | 0.380 (1.35x) | 0.239 | 0.018 | 0.123 | ~110 GB |\\n| K\xf9zu Remote Backend (bm=40GB) | 0.513 (1.82x) | 0.239 | 0.022 | 0.251 | ~90 GB |\\n| K\xf9zu Remote Backend (bm=20GB) | 1.162 (4.13x) | 0.238 | 0.022 | 0.901 | ~70 GB |\\n| K\xf9zu Remote Backend (bm=10GB) | 1.190 (4.23x) | 0.238 | 0.022 | 0.930 | ~60 GB |\\n\\nSo, when we have enough memory, there is about a 1.35x slowdown (from 0.281s to 0.380s per batch)\\ncompared to using PyG\'s default storage. This\\nis the case when K\xf9zu has enough buffer memory (60GB) to store the features but we still incur the cost of\\nscanning them through K\xf9zu\'s buffer manager. So no disk I/O happens (except the first time\\nthe features are scanned into the buffer manager). When we use a 40GB buffer pool and below, we start doing some I/O,\\nand the average time per batch degrades to 0.513s, 1.162s, and 1.190s when using 40GB, 20GB, and 10GB, respectively.\\nWe seem to stabilize around a 4x degradation at the 10GB or 20GB levels, where most of the feature scans\\nare now happening from disk. These numbers hopefully look good for many settings!\\n\\n## Next Steps\\n\\nWe will be doing 2 immediate optimizations in the next few releases\\nrelated to K\xf9zu\'s PyG integration.\\nFirst, we will change our \`graph_store\` to use an in-DBMS subgraph sampler, so we can work at virtually any limited memory level.\\nSecond, in an even earlier release, we had a more basic PyG integration feature, the\\n[\`QueryResult.get_as_torch_geometric()\`](https://kuzudb.com/docs/client-apis/python-api/query-result.html#query_result.QueryResult.get_as_torch_geometric) function.\\nThis feature is more of an ETL feature. It is designed for cases where you want to filter\\na subset of your nodes and edges and convert them directly into PyG \`HeteroData\` objects (i.e., use PyG\'s default in-memory storage)\\nas you build PyG pipelines using graph databases you store in K\xf9zu.\\nIf you are converting a large graph, this can be quite slow, and we will be improving this so that such ETL pipelines\\nare much faster!\\n\\nWe are excited to hear your feedback on K\xf9zu\'s PyG integration features and get more ideas about\\nhow else we can help users who are building GNN pipelines. 
Please reach out to us over [K\xf9zu Slack](https://join.slack.com/t/kuzudb/shared_invite/zt-1w0thj6s7-0bLaU8Sb~4fDMKJ~oejG_g)\\nwith your questions and ideas!\\n\\n[^1]:\\n If you read our [v0.0.3 blog post](https://kuzudb.com/blog/kuzu-0.0.3-release.html#k%C3%B9zu-as-a-pyg-remote-backend),\\n which had a shorter section about the PyG interface, you will notice that we used a much larger batch size there (48000),\\n which was the size that saturated GPU memory. Although the example there was also on the \`ogbn-papers100M\` dataset, we used a much smaller model with ~200K parameters\\n and sampled subgraphs from 2-degree neighbors of these batches. Now we use a much larger model with 5.6 million parameters and sample from 3-degree neighbors."},{"id":"kuzu-0.0.3-release","metadata":{"permalink":"/docusaurus/blog/kuzu-0.0.3-release","source":"@site/blog/2023-04-06-kuzu-v-0.0.3.md","title":"K\xf9zu 0.0.3 Release","description":"We are happy to release K\xf9zu 0.0.3 today. This release comes with the following new main features and improvements:","date":"2023-04-06T00:00:00.000Z","formattedDate":"April 6, 2023","tags":[{"label":"release","permalink":"/docusaurus/blog/tags/release"}],"readingTime":10.44,"hasTruncateMarker":true,"authors":[{"name":"K\xf9zu Team","url":"https://github.com/kuzudb/","imageURL":"https://kuzudb.com/img/blog/team.jpg","key":"team"}],"frontMatter":{"slug":"kuzu-0.0.3-release","authors":["team"],"tags":["release"]},"prevItem":{"title":"Scaling Pytorch Geometric GNNs With K\xf9zu","permalink":"/docusaurus/blog/kuzu-pyg-remote-backend"},"nextItem":{"title":"Why (Graph) DBMSs Need New Join Algorithms: The Story of Worst-case Optimal Join Algorithms","permalink":"/docusaurus/blog/wcoj"}},"content":"We are happy to release K\xf9zu 0.0.3 today. This release comes with the following new main features and improvements:\\n- [K\xf9zu as a Pytorch Geometric (PyG) Remote Backend](2023-04-06-kuzu-v-0.0.3.md#k\xf9zu-as-a-pyg-remote-backend): You can now train PyG GNNs and other models directly using graphs (and node features) stored in K\xf9zu. See this [Colab notebook](https://colab.research.google.com/drive/12fOSqPm1HQTz_m9caRW7E_92vaeD9xq6)\\nfor a demonstrative example. \\n- [Data ingestion from multiple files and numpy files](2023-04-06-kuzu-v-0.0.3.md#data-ingestion-improvements): See below for details\\n- [Query optimizer improvements](2023-04-06-kuzu-v-0.0.3.md#query-optimizer-improvements): See below for details\\n- [New buffer manager](2023-04-06-kuzu-v-0.0.3.md#new-buffer-manager): A new state-of-the-art buffer manager based on [VMCache](https://www.cs.cit.tum.de/fileadmin/w00cfj/dis/_my_direct_uploads/vmcache.pdf).\\n- [INT32, INT16, FLOAT, and FIXED LIST data types](2023-04-06-kuzu-v-0.0.3.md#new-data-types) (the latter is particularly suitable for storing node features in graph ML applications)\\n- [Query timeout mechanism and interrupting queries from the CLI](2023-04-06-kuzu-v-0.0.3.md#other-system-functionalities).\\n\\n\x3c!--truncate--\x3e\\n\\nTo install the new version, \\nplease visit the [download section of our website](https://kuzudb.com/#download) \\nand the [getting started guide](https://kuzudb.com/docs/getting-started.html); the full\\n[release notes are here](https://github.com/kuzudb/kuzu/releases). Please visit\\nthe [Colab Notebooks](https://kuzudb.com/docs/getting-started/colab-notebooks) section of our\\ndocumentation website to play with our [Colab notebooks](https://kuzudb.com/docs/getting-started/colab-notebooks).\\n\\n\\nEnjoy! 
Please give us a try, [a GitHub \u2b50](https://github.com/kuzudb/kuzu), and send us your feedback and feature requests! Also follow\\nus on [Twitter](https://twitter.com/kuzudb)!\\n\\n## K\xf9zu as a PyG Remote Backend\\nK\xf9zu now implements PyG\'s Remote Backend interface. So you can directly \\ntrain GNNs using K\xf9zu as your backend storage. Quoting the [PyG documentation\'s](https://pytorch-geometric.readthedocs.io/en/latest/advanced/remote.html) description\\nof the Remote Backend feature:\\n\\n> ...[this feature enables] users to train GNNs on graphs far larger than the size of their\\nmachine\u2019s available memory. It does so by introducing simple, easy-to-use, and extensible abstractions of a \`torch_geometric.data.FeatureStore\` and a \`torch_geometric.data.GraphStore\` that plug directly into existing familiar PyG interfaces.\\n\\nWith our current release, once you store your graph and features in K\xf9zu,\\nPyG\'s samplers work seamlessly using K\xf9zu\'s implementation of the \`FeatureStore\` and \`GraphStore\` interfaces. For example, \\nthis enables your existing GNN models to work seamlessly by fetching both subgraph samples and node features\\nfrom K\xf9zu instead of PyG\'s in-memory storage. \\nTherefore you can train on graphs that do not\\nfit into your memory, since K\xf9zu, as a DBMS, stores its data on disk. Try this demonstrative [Colab notebook](https://colab.research.google.com/drive/12fOSqPm1HQTz_m9caRW7E_92vaeD9xq6) to \\nsee an example of how to do this. The current release comes with a limitation that we only truly implement the \`FeatureStore\` interface.\\nInside \`GraphStore\` we still store the graph topology in memory. \\nSo in reality only the features are stored and scanned from disk. We plan to address this limitation later on.\\n\\nHere is also a demonstrative experiment (but certainly not a comprehensive study) of the type of training performance \\nvs. memory usage tradeoff you can expect. \\nWe trained a simple 3-layer Graph Convolutional Network (GCN) model on the [ogbn-papers100M](https://ogb.stanford.edu/docs/nodeprop/#ogbn-papers100M) dataset, which contains about 111 million nodes\\nwith 128-dimensional node features and about 1.6 billion edges. \\nStoring the graph topology takes around 48GB[^1] and the features take 53GB. Given our current limitation,\\nwe can reduce the 53GB to something much smaller (we will limit it to as low as 10GB).\\nWe used a machine with one RTX 4090 GPU with 24 GB of memory, two Xeon Platinum 8175M CPUs, and 384 GB RAM, which \\nis enough for PyG\'s in-memory store to store the entire graph and all features in memory.\\n\\nDuring training, we use the \`NeighborLoader\` of PyG with a batch size of 48000 and set \`num_neighbors\` to \`[30] * 2\`, which means at each batch roughly 60 neighbor nodes of the 48000 nodes will be sampled from the \`GraphStore\` and the features of those nodes will be scanned\\nfrom K\xf9zu\'s storage. We picked this sample size because this gives us a peak GPU memory usage of approximately 22 GB, i.e.,\\nwe can saturate the GPU memory. We used 16 cores[^2] during the sampling process. 
We run each experiment in a Docker instance\\nand limit the memory systematically from 110GB, which is enough for PyG to run completely in memory, down to 90, 70, and 60GB.\\nAt each memory level we run the same experiment using K\xf9zu as a Remote Backend, where we \\nhave to use about 48GB to store the topology and give the remaining memory to K\xf9zu\'s buffer manager.\\nFor example, when the memory is 60GB, we can only give ~10GB to K\xf9zu.\\n\\n| Configuration | End to End Time (s) | Per Batch Time (s) | Time Spent on Training (s) | Time Spent on Copying to GPU (s) | Docker Memory |\\n|-------------------------------|-----------------|-----------------|------------------------|------------------------------|-------------|\\n| PyG In-memory | 140.17 | 1.4 | 6.62 | 31.25 | 110 GB |\\n| K\xf9zu Remote Backend (bm=60GB) | 392.6 | 3.93 | 6.29 | 34.18 | 110 GB |\\n| K\xf9zu Remote Backend (bm=40GB) | 589.0 | 5.89 | 6.8 | 32.6 | 90 GB |\\n| K\xf9zu Remote Backend (bm=20GB) | 1156.1 | 11.5 | 6.0 | 36 | 70 GB |\\n| K\xf9zu Remote Backend (bm=10GB) | 1121.92 | 11.21 | 6.88 | 35.03 | 60 GB |\\n\\nSo, when we have enough memory, there is about a 2.8x slowdown (from 1.4s to 3.93s per batch). This\\nis the case when K\xf9zu has enough buffer memory (60GB) to store the 53GB of features but we still incur the cost of \\nscanning them through K\xf9zu\'s buffer manager. So no or very little disk I/O happens (except the first time\\nthe features are scanned into the buffer manager). Then as we lower the memory, K\xf9zu can hold only part \\nof the node features in its buffer manager, so\\nwe force K\xf9zu to do more and more I/O. The per-batch time increases to 5.89s at 40GB of buffer manager size, \\nthen seems to stabilize around 11s (so around an 8.2x slowdown). \\n\\nThe slowdown is smaller if you use smaller batch sizes, but for the end-to-end training time, you\\nshould probably still prefer to use larger batch sizes. This is a place where we would need to\\ndo more research to see how much performance is on the table with further optimizations.\\n\\nBut in summary, if you have \\nlarge datasets that don\'t fit in your current system\'s memory and would like to easily train your PyG models \\noff of disk (plus get all the usability features of a GDBMS as you prepare your datasets for training), \\nthis feature can be very useful for you!\\n\\n## Data Ingestion Improvements\\n\\n**Ingest from multiple files**: You can now load data from multiple files of the same type into a node/rel table in two ways:\\n - **file list**: \`[\\"vPerson0.csv\\", \\"vPerson1.csv\\", \\"vPerson2.csv\\"]\`\\n - **glob pattern matching**: Similar to Linux [Glob](https://man7.org/linux/man-pages/man7/glob.7.html), this will load files that match the glob pattern.\\n\\n**Ingest from npy files**: We have started exploring how to enable data ingestion in a column-by-column fashion. 
Consider a \`Paper\` table defined in the following DDL.\\n\`\`\`\\nCREATE NODE TABLE Paper(id INT64, feat FLOAT[768], year INT64, label DOUBLE, PRIMARY KEY(id));\\n\`\`\`\\nSuppose your raw data is stored in npy format, where each column is represented as a numpy array on disk:\\n\\"node_id.npy\\", \\"node_feat_f32.npy\\", \\"node_year.npy\\", \\"node_label.npy\\".\\nYou can now directly copy from npy files, where each file is loaded into a column of the \`Paper\` table, as follows:\\n\`\`\`\\nCOPY Paper FROM (\\"node_id.npy\\", \\"node_feat_f32.npy\\", \\"node_year.npy\\", \\"node_label.npy\\") BY COLUMN;\\n\`\`\`\\n\\n**Reduce memory consumption when ingesting data into node tables:**\\nThis release further optimizes memory consumption during data ingestion into node tables.\\nWe no longer keep the whole node table in memory before flushing it to disk as a whole. Instead, we process a chunk of a file\\nand flush its corresponding pages, so we incur only the memory cost of ingesting a chunk (or as many chunks as there are threads running).\\nThis greatly reduces memory usage when the node table is very large.\\n\\n## Query Optimizer Improvements\\n\\n**Projection push-down for sink operators**:\\nWe now push projections down to the first sink operator \\nabove the last point in the query plan where they are needed.\\nConsider the following query:\\n\`\`\`\\nMATCH (a:person) WHERE a.age > 35 RETURN a.salary AS s ORDER BY s;\\n\`\`\`\\nThis query\'s (simplified) plan is: \`Scan->Filter->OrderBy->ResultCollector\`, where both \\n\`ORDER BY\` and the final \`ResultCollector\` are sink operators. \\n\`ResultCollector\` is where we accumulate the expressions in the \`RETURN\` clause. \\nThis is simplified because \`ORDER BY\` actually consists of several physical operators. \\nBoth columns \`age\` and \`salary\` are scanned initially, but only \`salary\` is needed in the \`ResultCollector\`. \\n\`age\`, which is needed by \`Filter\`, is projected out in the \`ResultCollector\`. We now push the projection of \`age\`\\ndown to \`ORDER BY\`, so \`ORDER BY\` does not have to materialize it.\\n\\n**Other optimizations:** We implemented several other optimizations: we reorder filter expressions so equality conditions\\nare evaluated first, we made several improvements to the cardinality estimator, and we improved sideways information passing for joins. For the latter, \\nin our core join operator, which we called ASP-Joins in our [CIDR paper](https://www.cidrdb.org/cidr2023/papers/p48-jin.pdf), we would blindly\\nperform sideways information passing (sip) from build to probe (or vice versa; \\nsee [our paper](https://www.cidrdb.org/cidr2023/papers/p48-jin.pdf) for details). Sometimes, if there are no \\nfilters on the probe and build sides, this is just overhead, as it won\'t decrease the amount of scanning on either side. \\nIn cases where we think sip won\'t help reduce scans, we now do vanilla hash joins.\\n\\n## New Buffer Manager\\n\\nBefore this release, we had two internal buffer pools with 2 different frame sizes of 4KB and 256KB,\\nso operators could only grab buffers of these two sizes. Plus, when you loaded your DB and wanted to allocate,\\nsay, a 10GB buffer pool, we manually gave a fixed percentage to the 4KB pool and the rest to the 256KB pool. \\nThis didn\'t give any flexibility for storing large objects and complicated the code that manages \\nbuffers when operators need them. Terrible design; \\njust don\'t do this!\\n\\nWe bit the bullet and decided to read the literature and pick a state-of-the-art buffer manager design that is\\nalso practical. 
We switched to the mmap-based approach described in the VMCache design from [this recent paper](https://www.cs.cit.tum.de/fileadmin/w00cfj/dis/_my_direct_uploads/vmcache.pdf) by Leis et al. \\nThis is a very nice design: \\nit makes it very easy to support multiple buffer sizes and only uses hardware locks (we used \\nsoftware locks in our previous buffer manager). It also supports optimistic reading,\\nwhich we verified improves our query performance a lot.\\n\\n## New Data Types\\n\\nWe now support several additional data types that were missing.\\n\\n**[FIXED-LIST](https://kuzudb.com/docs/cypher/data-types/list.html) data type:** This is important if you\'re doing graph ML and storing node features\\nin K\xf9zu. It is an efficient way to store fixed-length vectors. Here\'s a summary of how\\nto declare a node or rel property in your schemas to use the fixed-list data type.\\n\\n| Data Type | Description | DDL definition |\\n| --- | --- | --- | \\n| FIXED-LIST | a list of a fixed number of values of the same numerical type | INT64[8] |\\n\\nWhen possible, use FIXED-LIST instead of the regular [VAR-LIST](https://kuzudb.com/docs/cypher/data-types/list.html) data type\\nfor cases where you know the size of your lists/vectors. It\'s much more efficient.\\n\\nNote that FIXED-LIST is an experimental feature. Currently, only bulk loading (e.g., via the \`COPY\` statement) and reading are supported.\\n\\n**INT32, INT16, FLOAT data types:** The release also comes with support for the following data types:\\n\\n| Data Type | Size | Description |\\n| --- | --- | --- |\\n| INT32| 4 bytes | signed four-byte integer |\\n| INT16| 2 bytes | signed two-byte integer |\\n| FLOAT | 4 bytes | single precision floating-point number |\\n\\nFor our next release, our focus on data types will be on complex ones, STRUCT and MAP. So stay tuned for those!\\n\\n## Other System Functionalities\\n\\n**Query timeout**: We will now automatically stop any query that exceeds a specified timeout value (if one exists). \\nThe default query timeout value is set to -1, which signifies that the query timeout feature is initially disabled. \\nYou can activate the query timeout by configuring a positive timeout value through:\\n - C++ API: \`Connection::setQueryTimeOut(uint64_t timeoutInMS)\`\\n - CLI: \`:timeout [timeoutValue]\`\\n\\n**Interrupt:** You can also interrupt and stop your long-running queries manually. There\\nare two ways to do this:\\n - C++ API: \`Connection::interrupt()\`: interrupt all running queries within the current connection.\\n - CLI: interrupt through \`CTRL + C\`\\n\\nNote: The Interruption and Query Timeout features are not applicable to \`COPY\` commands in this release.\\n\\n[^1]: Internally, PyG converts the edge list to CSC format for sampling, which duplicates the graph structure in memory. When you download the graph topology, it actually takes about 24GB.\\n[^2]: We set \`num_workers\` to 16 when running the PyG in-memory setup. 
Since K\xf9zu does not currently work with multiple workers in Python, we limit \`num_workers\` to 1 when sampling from K\xf9zu, but internally K\xf9zu scans in parallel with 16 threads."},{"id":"wcoj","metadata":{"permalink":"/docusaurus/blog/wcoj","source":"@site/blog/2023-02-22-wcoj/index.md","title":"Why (Graph) DBMSs Need New Join Algorithms: The Story of Worst-case Optimal Join Algorithms","description":"Joining sets of records is objectively the most expensive operation in DBMSs.","date":"2023-02-22T00:00:00.000Z","formattedDate":"February 22, 2023","tags":[{"label":"internals","permalink":"/docusaurus/blog/tags/internals"}],"readingTime":20.76,"hasTruncateMarker":true,"authors":[{"name":"Semih Saliho\u011flu","title":"CEO of K\xf9zu Inc & Associate Prof. at UWaterloo","url":"https://cs.uwaterloo.ca/~ssalihog/","imageURL":"https://kuzudb.com/img/blog/semih.jpg","key":"semih"}],"frontMatter":{"slug":"wcoj","authors":["semih"],"tags":["internals"]},"prevItem":{"title":"K\xf9zu 0.0.3 Release","permalink":"/docusaurus/blog/kuzu-0.0.3-release"},"nextItem":{"title":"K\xf9zu 0.0.2 Release","permalink":"/docusaurus/blog/kuzu-0.0.2-release"}},"content":"import WcojRunningExDataImage from \'./wcoj-running-ex-data.png\';\\nimport WcojEdgeCoversImage from \'./wcoj-edge-covers.png\';\\nimport WcojGjSimulationImage from \'./wcoj-gj-simulation.png\';\\nimport WcojKuzuMultiwayHashJoinImage from \'./wcoj-kuzu-multiway-hash-join.png\';\\nimport Wcoj4CliqueImage from \'./wcoj-4-clique.png\';\\n\\n\\nJoining sets of records is objectively the most expensive operation in DBMSs.\\nIn my previous post on [factorization](../2023-01-20-factorization/index.md), I said that in the field of databases, once \\nin a while you run into a very simple idea that deviates from the norm and gets you very excited. \\nToday, I will discuss another such idea, worst-case optimal join (wcoj) algorithms. \\nWcoj algorithms and the theory around them, in one sentence, say this:\\n\\n - Queries involving complex \\"cyclic joins\\" over many-to-many relationships should be \\n evaluated column at a time instead of table at a time, which is the norm. \\n\\n\\nWcoj algorithms find their best applications when finding cyclic patterns on graphs, \\nsuch as cliques or cycles, which is common in the workloads of fraud detection and\\nrecommendation applications. As such, they should be integrated into every graph DBMS \\n(and possibly into RDBMSs), and I am convinced that they eventually will be.\\n\\n\x3c!--truncate--\x3e\\n\\n:::tip Tldr: The key takeaways are:\\n- **History of Wcoj Algorithms:** Research on wcoj algorithms started with a solution to an open question \\n about the maximum output sizes of join queries. This result made researchers realize this: the traditional \\n \\"binary join plans\\" paradigm of generating query plans that join 2 tables at a time\\n until all of the tables in the query are joined is provably\\n suboptimal for some queries. 
Specifically, when join queries are\\n cyclic, which in graph terms means when the searched graph pattern has cycles\\n in it, and the relationships between records are many-to-many, then this \\n paradigm can generate unnecessarily large amounts of intermediate results.\\n- **Core Algorithmic Step of Wcoj Algorithms:** Wcoj algorithms fix this sub-optimality by \\n performing the joins one column at a time (instead of 2 tables at a time) using multiway intersections.\\n- **How K\xf9zu Integrates Wcoj Algorithms:** K\xf9zu generates plans that seamlessly mix binary joins \\n and wcoj-style multiway intersections. Multiway intersections are performed by an operator called \\n \\"multiway HashJoin\\", which has one or more build phases that create one or more hash tables storing\\n sorted adjacency lists, and a probe phase that performs multi-way intersections using the sorted lists.\\n- **Yes, the Term \\"Worst-case Optimal\\" Is Confusing Even to Don Knuth:** I know, Don Knuth also found the term\\n \\"worst-case optimal\\" a bit confusing. See my [anecdote on this](#a-thank-you--an-anecdote-about-knuths-reaction-to-the-term-worst-case-optimal). \\n It basically means that the worst-case runtimes of these algorithms are asymptotically optimal.\\n:::\\n\\n## Joins, Running Example & Traditional Table-at-a-time Joins\\nJoins are objectively the most expensive and powerful operation in DBMSs.\\nIn SQL, you indicate them in the FROM clause by listing\\na set of table names; in Cypher, in the MATCH clause, where you draw a graph pattern\\nto describe how to join node records with each other.\\nAs a running example, consider a simple social network of users and followers, \\nwhose node-link diagram is shown below. I am also showing the tables that contain these records, \\na \`User\` table (ignore the \`name\` property for now) and a \`Follows\` table.\\n\\n\\n\\n
\\n\\n
\\n\\nConsider finding triangles, which is one of the simplest \\nforms of cycles and cliques, in this network. The SQL and Cypher \\nversions of this query are shown below. \\n\\n```\\nSQL:\\nSELECT *\\nFROM Follows f1, Follows f2, Follows f3\\nWHERE f1.dst=f2.src AND f2.dst=f3.src AND\\n f3.dst = f1.src\\n\\nCypher:\\nMATCH (a:User)-[f1:Follows]->(b:User)-[f2:Follows]->(c:User)-[f3:Follows]->(a)\\nRETURN *\\n```\\nThat long MATCH clause \\"draws\\" a triangle, and for our case here, this is equivalent\\nto joining three copies of the Follows table. \\n\\nNow, ever since the System R days and [Patricia Selinger\'s 1979 seminal paper](https://courses.cs.duke.edu/compsci516/cps216/spring03/papers/selinger-etal-1979.pdf) that \\ndescribed how System R compiled and optimized SQL queries, there has been an \\nunchallenged dogma in DBMSs that the joins specified in a query would be \\nevaluated pairwise, table at a time. \\nHere\'s a blurb from Selinger\'s paper, where one can see this \\nassumption: \\n\\"*In System R a user need not know how the\\ntuples are physically stored ... Nor does a user \\nspecify in what order joins are to be performed. The System\\nR optimizer chooses both join order and ...*\\"\\nTo this day, this is the norm. DBMSs pick a \\"join order\\", which is the order in \\nwhich the tables should be joined iteratively, 2 at a time. \\nIn our triangle example, \\nthere are three possible join orders. One way to represent these orders is by \\nwriting different parenthesizations of the joins: \\n- (i) $((F1 \\\\bowtie F2) \\\\bowtie F3)$; (ii) $(F1 \\\\bowtie (F2 \\\\bowtie F3))$; \\n and (iii) $((F1 \\\\bowtie F3) \\\\bowtie F2)$. \\n\\nThe optimization problem for a system is of course more complex than just \\nordering tables because the system also has to choose which\\nbinary join algorithm to use when joining each pair of tables, e.g., hash joins vs merge joins. \\nBut take any system you want, and they will all follow the same paradigm of \\njoining 2 base or intermediate tables iteratively, until all tables are joined: \\nhence the term *binary joins* to describe the plans of existing systems.\\n\\n\\n## A Math Puzzle That Started it All \\n\\nSo, what\'s the problem with binary join plans? When join queries are cyclic\\nand the relationships are many-to-many, they can generate provably large amounts\\nof (and so, in a formal sense, unnecessary) intermediate results. First, cyclicity for\\njoin queries has formal (and a bit intimidating) definitions, but if you think of\\ngraph patterns, it simply means that the searched pattern\'s undirected version has\\ncycles. Why do binary joins generate unnecessarily large intermediate results? I\'ll\\nget to this below but first a bit of history on the origins of this insight.\\nThe whole topic of \\"worst-case optimal joins\\" started with 2 papers, a [2007 SODA](https://arxiv.org/abs/1711.04506) \\nand a [2008 FOCS](https://arxiv.org/abs/1711.03860) \\npaper, which appeared in top venues in algorithms and theory. In these papers,\\nseveral theoreticians solved a fundamental open question \\nabout join queries. Suppose I give you:\\n\\n1. An arbitrary natural join query, say of $m$ relations. In DBMS literature we denote such \\n queries as $Q=R_1(a_{11}, ..., a_{1r_1}) \\\\bowtie ... \\\\bowtie R_m(a_{m1}, ..., a_{mr_m})$.\\n2. The sizes of $R_1, ..., R_m$; for simplicity, assume they each have $IN$ many tuples. \\n\\n\\"Natural\\" here means that the join predicates are equality predicates on identical column \\nnames. 
You, as the second person in this puzzle, are allowed to set the values inside these relations. \\n**The open question was: how large can you make the final output?** So for example, if I told you that there are\\n$IN$ many tuples in the `Follows` table, what is the maximum number of triangle outputs there can be?[^1]\\nEven more concretely for the triangle query, the question is: out of all possible graphs with $IN$ many edges, \\nwhat is the maximum number of triangles they can contain?\\n\\n
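As a sanity check on the question itself, here is a tiny, illustrative brute-force Python script (my own toy check, not from the papers) that searches over all directed graphs with a given number of edges and reports the maximum number of triangle outputs. With 3 edges the answer is 3: a single directed 3-cycle, matched once per rotation.\\n\\n```python\\nfrom itertools import combinations, permutations\\n\\ndef max_triangle_outputs(num_edges, num_nodes=4):\\n    # Brute force over all directed graphs (no self-loops) with num_edges edges.\\n    nodes = range(num_nodes)\\n    all_edges = [(u, v) for u in nodes for v in nodes if u != v]\\n    best = 0\\n    for edges in combinations(all_edges, num_edges):\\n        es = set(edges)\\n        # Count matches of the triangle pattern (a)->(b)->(c)->(a).\\n        count = sum((a, b) in es and (b, c) in es and (c, a) in es\\n                    for a, b, c in permutations(nodes, 3))\\n        best = max(best, count)\\n    return best\\n\\nprint(max_triangle_outputs(3))  # 3\\n```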
\\n\\n
\\n\\nIt still surprises me that the answer to this question was not known until 2008.\\nIt just looks like a fundamental question someone in databases must have answered before. \\nNow excuse me for bombarding your brains with some necessary math definitions.\\nThese two papers showed that the answer is: $IN^{\\\\rho^*}$, where $\\\\rho^*$ is a property \\nof $Q$ called the *fractional edge cover number* of $Q$. \\nThis is the solution to\\nan optimization problem and is best explained by thinking about the \\"join query graph\\",\\nwhich, for our purposes, is the triangle graph pattern (ignoring the edge directions), shown\\nin Fig 2a and 2b.\\n\\nThe optimization problem is this: \\nassign a weight between [0, 1] to\\neach \\"query edge\\" such that each \\"query node\\" is \\"covered\\", i.e., the sum of\\nthe weights of the query edges touching each query node is at least 1. Each such solution is called an\\nedge cover. The problem is to find the edge cover whose total weight is minimum; that minimum total weight is \\ncalled the fractional edge cover number of the query. For the triangle query, \\none edge cover, shown in Fig 2a, is [1, 1, 0], which has\\na total weight of 1 + 1 + 0 = 2. \\nThe minimum-weight edge cover is [1/2, 1/2, 1/2], shown in Fig 2b, \\nwith a total weight of 1.5. Therefore, the fractional edge cover number $\\\\rho^*$\\nof the triangle query is 1.5.\\nIn general, each edge cover gives an upper bound, but the FOCS paper showed\\nthat the fractional edge cover number gives the tight upper bound.\\nSo the maximum number of triangles there can be on a graph with $IN$ edges is $\\\\Theta(IN^{1.5})$, \\nand this is tight, i.e., there are such graphs. Nice scientific progress!\\nNowadays, the quantity $IN^{\\\\rho^*}$ is known as the `AGM bound` of a query,\\nafter the first letters of the last names of the authors of the FOCS paper.\\n\\n\\n## Problem With Table-at-a-time/Binary Joins\\nNow this immediately made the same researchers realize that binary join plans are \\nprovably sub-optimal because they can generate polynomially more intermediate results\\nthan the AGM bound of the query. This happens because on cyclic queries, \\nthe strategy of joining tables\\n2 at a time may lead to unnecessarily computing some acyclic sub-joins. \\nFor example, in the triangle query, the plan\\n$((F1 \\\\bowtie F2) \\\\bowtie F3)$ first computes the $(F1 \\\\bowtie F2)$ sub-join,\\nwhich in graph terms computes the 2-paths in the graph.\\nThis is a problem because often there can be many more of these acyclic sub-joins\\nthan there can be outputs for the cyclic join. \\nFor this plan, there can\\nbe $IN^2$ many 2-paths (which is the AGM bound of 2-paths),\\nwhich is polynomially larger than $IN^{1.5}$. \\nIn our running example, there are 1000\\\\*1000 = 1M many 2-paths,\\nbut on a graph with 2001 edges there can be at most ~89.5K triangles. (Ours\\nhas only 3 triangle outputs: because the triangle query we are using is symmetric, \\nthe sole triangle generates 3 outputs, one for each of its rotations.)\\n \\nAny other plan in this case would also have generated $IN^2$ many 2-paths, \\nso there is no good binary join plan here. I want to emphasize that this sub-optimality does not occur \\nwhen the queries are acyclic or when the dataset does not have \\nmany-to-many relationships. If the joins were primary-foreign key non-growing joins, \\nthen binary join plans would work just fine. 
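\\n\\nTo make the numbers above concrete, here is the arithmetic as a small, illustrative Python snippet (my own check; the quantities match the figures quoted in the text):\\n\\n```python\\nIN = 2001                   # number of Follows edges in the running example\\ntwo_path_bound = IN ** 2    # AGM bound of the acyclic 2-path sub-join (rho* = 2)\\ntriangle_bound = IN ** 1.5  # AGM bound of the triangle query (rho* = 1.5)\\nprint(f\'{two_path_bound:,} vs {triangle_bound:,.0f}\')  # 4,004,001 vs 89,510\\n```\\n\\nThe actual number of 2-paths in the running example (1M) is below the $IN^2$ bound, but it is still polynomially larger than the maximum possible number of triangles (~89.5K), which is why no binary join plan can avoid overpaying here.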
\\n\\n## Solution: Column-at-a-time \\"Worst-case Optimal\\" Join Algorithms\\n\\nSo the immediate\\nnext question is: are there algorithms whose runtimes can be bounded by \\n$O(IN^{1.5})$? If so, how are they different? The answer to this question\\nis a bit anti-climactic. The core idea existed in the 2007 SODA and 2008 FOCS papers,\\nthough it was refined further ~4 years later in some theoretical papers\\nby [Hung Ngo](https://hung-q-ngo.github.io/), [Ely Porat](https://u.cs.biu.ac.il/~porat/), \\n[Chris R\xe9](https://cs.stanford.edu/~chrismre/), and [Atri Rudra](https://cse.buffalo.edu/faculty/atri/) \\nin the database venues [PODS](https://dl.acm.org/doi/10.1145/2213556.2213565) and \\n[SIGMOD Record](https://dl.acm.org/doi/10.1145/2590989.2590991). The answer is simply\\nto perform the join column at a time, using multiway \\nintersections. \\"Intersections of what?\\" you should be asking. \\nFor joins over arbitrary relations, we need special indices, but I want to\\nskip this detail.\\nGDBMSs already\\nhave join indices (aka adjacency list indices), and for the common joins\\nthey perform, these will be enough for our purposes.\\n\\nI will next demonstrate a wcoj \\nalgorithm known as \\"Generic Join\\" from the [SIGMOD Record paper](https://dl.acm.org/doi/10.1145/2590989.2590991). \\nIt can be seen as the simplest of all wcoj algorithms.\\nAs the \\"join order\\", we will pick a \\"column order\\"\\ninstead of a Selinger-style table order. So in our triangle query,\\nthe order could be a,b,c. Then we will build indices over each relation\\nthat are consistent with this order. In our case there are conceptually three (identical)\\nrelations: `Follows1(a, b)`, `Follows2(b, c)`, `Follows3(c, a)`. For `Follows1`,\\nwe need to be able to read all `b` values for a given `a` value (e.g., `a=5`).\\nIn graph terms, this just means that we need a \\"forward join index\\".\\nFor `Follows3`, because `a` comes earlier than `c`, we will want an index\\nthat gives us `c` values for a given `a` value. This is equivalent to a\\n\\"backward join index\\". In graphs, because joins happen through the\\nrelationship records, which can, for the purpose of the joins, \\nbe thought of as a binary relation (src, dst), two indices are enough\\nfor our purposes. On general relations, one may need many more indices.\\n\\n
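Before the step-by-step simulation below, here is a minimal, illustrative Python sketch of Generic Join for the triangle query with column order a, b, c, using plain dicts as the forward and backward indices just described (a toy sketch, not any system\'s actual implementation):\\n\\n```python\\nfrom collections import defaultdict\\n\\n# A toy edge list containing the sole triangle from the running example.\\nfollows = [(1, 0), (0, 1001), (1001, 1)]\\nfwd, bwd = defaultdict(set), defaultdict(set)  # forward/backward join indices\\nfor src, dst in follows:\\n    fwd[src].add(dst)\\n    bwd[dst].add(src)\\n\\ntriangles = []\\nfor a in list(fwd):        # Step 1: candidate a values\\n    for b in fwd[a]:       # Step 2: extend each a to ab via the forward index\\n        # Step 3: extend ab to abc by intersecting b\'s forward list (b)->(c)\\n        # with a\'s backward list (c)->(a).\\n        for c in fwd[b] & bwd[a]:\\n            triangles.append((a, b, c))\\n\\nprint(triangles)  # [(1, 0, 1001), (0, 1001, 1), (1001, 1, 0)]\\n```\\n\\nThe key point is the intersection in Step 3: no standalone 2-path relation is ever materialized.\\n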
\\n\\n
\\n\\n\\nWe will iteratively find: (i) all `a` values\\nthat can be in the final triangles; (ii) all `ab`\'s that can be in the final\\ntriangles; and (iii) all `abc`\'s, which are the triangles. Let\'s simulate the computation:\\n - Step 1: Find all `a`\'s. Here we will just take\\nall nodes as possible a values. This is shown under \\"Step 1\\" in the above figure.\\n- Step 2: For each a value, e.g., a=1, we extend it to find all `ab`\'s that \\ncan be part of triangles: here we use the forward index to look up all\\n`b` values for the node with ID 1, and so on. This will generate the \\nsecond intermediate relation.\\n- Step 3: For each `ab` tuple, e.g., (a=1, b=0), we will\\nintersect all `c`\'s with `a`=1 and all `c`\'s with `b`=0. That is, we will intersect\\nthe backward adjacency list of the node with ID 1 and the forward adjacency list of \\nthe node with ID 0. If the intersection is non-empty, we produce some triangles.\\nIn this case, we will produce the triangle (`a`=1, `b`=0, `c`=1001).\\nThe result of this computation will be the third and final \\noutput table in the figure.\\n\\n\\nNote that this process did not produce the 2-paths as an intermediate step, \\nwhich is how wcoj algorithms fix the sub-optimality of binary join algorithms.\\nIf your query is more complex, then a wcoj algorithm can do k-way intersections where k > 2. \\nFor example, on the 4-clique query shown on the right, suppose the \\ncolumn order is abcd; then, given abc triangles, we would do a 3-way intersection of\\nthe forward index of a\'s, the backward index of b\'s, and the forward index of c\'s, to complete\\nthe triangles into 4-cliques. This type of multiway intersection is the necessary \\nalgorithmic step for being efficient on cyclic queries.\\n\\n\\n## How K\xf9zu Performs Worst-case Optimal Join Algorithms:\\n\\nOur [CIDR paper](https://www.cidrdb.org/cidr2023/papers/p48-jin.pdf) describes this in detail, so I will be brief here. \\nFirst, K\xf9zu mixes binary joins and wcoj-like multiway intersections\\nfollowing some principles that my PhD student [Amine Mhedhbi](http://amine.io/)\\nworked quite hard on early in his PhD. I recommend two papers \\non the several different ways people have proposed to mix binary joins and wcoj algorithms in query plans: \\none by [Amine and me](https://www.vldb.org/pvldb/vol12/p1692-mhedhbi.pdf),\\nand one by the [Umbra group](https://db.in.tum.de/~freitag/papers/p1891-freitag.pdf). \\nThe overall message of these studies is that wcoj algorithms are critical when the query has a very cyclic component\\nwhere multiway intersections can help. If the query does not have this property, \\nsystems should just use binary joins. \\nSo wcoj-like computations should be seen as complementing binary join plans.\\n
\\n\\n
\\n\\n\\n\\nSecond, K\xf9zu performs multiway intersections in a *Multiway HashJoin* operator.\\nIn our CIDR paper we call this operator Multiway ASPJoin. It can be thought \\nof as a modified hash-join operator where we use multiple hash tables and do \\nan intersection to produce outputs, as I will simulate. \\nLet me change the query a little and add a filter on `a.name = Noura`,\\nwhere `name` is the primary key of `User` records. You can see from Fig 1a\\nthat Noura is the primary key of the node with ID 1. In my simulation,\\nthe Multiway HashJoin operator will take `ab` tuples and extend them \\nto `abc` tuples through a 2-way intersection. In general, Multiway HashJoin\\nhas 3 kinds of phases: one accumulate phase, one build phase per hash table (one hash table \\nfor each adjacency list that will be intersected), and a probe phase. Here are the steps.\\n- Step 1 - Accumulate Phase: The operator receives the `ab` tuples which will be extended\\nto triangles. This allows the system to see exactly\\nwhich nodes\' forward/backward lists will be intersected. Then, the operator passes \\nthis information sideways so that only those lists are scanned. In this case,\\nbecause there is a primary key filter on Noura, the only `ab` tuple that will be read\\nis (a=1, b=0). This is stored in a temporary buffer that we call a \\"Factorized Table\\" in the system.\\n- Step 2 - Build Phase 1: In the first build step, Multiway HashJoin will pass a nodeID filter\\nto the `Scan Follows (a)<-(c)` operator, with true only for node ID 1 and false for every other node ID.\\nThe operator can do this because at this stage it knows exactly which backward\\nadjacency lists will be needed when we extend the tuple (in this case only the backward list of the node\\nwith ID 1 is needed). The Scan operator uses this node ID filter to scan only this backward list, \\n{1001}, and avoids\\nscanning the rest of the file that stores the backward Follows edges. This list is first sorted\\nby neighbor ID and stored in a hash table, denoted as \\"Hash Table (a)<-(c)\\"\\nin the figure.\\n- Step 3 - Build Phase 2: This is similar to Build Phase 1. Using a semijoin filter\\nwith node 0\'s ID, we scan only node 0\'s forward `Follows` list {1001, 1002, ..., 2000}, \\nsort it, and then store it in a hash table \\"Hash Table (b)->(c)\\".\\n- Step 4 - Probe: We re-scan the accumulated `ab` tuples from the factorized table.\\nFor each tuple, we first probe \\"Hash Table (a)<-(c)\\" \\nand then \\"Hash Table (b)->(c)\\" to fetch two lists, intersect them, and produce outputs.\\nIn this case there is only one tuple (a=1, b=0), so we will fetch a=1\'s backward list and b=0\'s forward list,\\nintersect these lists, and produce the triangle (a=1, b=0, c=1001).\\n\\nThis performs quite well. Our [CIDR paper](https://www.cidrdb.org/cidr2023/papers/p48-jin.pdf) has some performance numbers\\ncomparing against other types of wco join implementations (see the experiments in Table 3). Since I did not cover other ways to implement\\nwco join algorithms inside DBMSs, these experiments would be difficult to explain here.\\nInstead, let me just demonstrate some simple comparisons between using binary joins and wco joins\\nin K\xf9zu on a simple triangle query. On larger cyclic queries, e.g., 4- or 5-cliques, \\nthe differences are much larger, and often binary join plans do not finish on time.\\nYou can try this experiment too. \\n\\n
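As an aside, the probe-phase intersection of two sorted ID lists is just a standard two-pointer merge. Here is a tiny, illustrative Python version of that step (not K\xf9zu\'s actual code):\\n\\n```python\\ndef intersect_sorted(xs, ys):\\n    # Two-pointer merge intersection of two sorted ID lists.\\n    i = j = 0\\n    out = []\\n    while i < len(xs) and j < len(ys):\\n        if xs[i] == ys[j]:\\n            out.append(xs[i])\\n            i += 1\\n            j += 1\\n        elif xs[i] < ys[j]:\\n            i += 1\\n        else:\\n            j += 1\\n    return out\\n\\n# a=1\'s backward list intersected with b=0\'s forward list (truncated):\\nprint(intersect_sorted([1001], [1001, 1002, 1003]))  # [1001]\\n```\\n\\n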
Here is the configuration. The dataset I\'m using\\nis a popular web graph used in academic papers called [web-BerkStan](https://snap.stanford.edu/data/web-BerkStan.html).\\nIt has 685K nodes and 7.6M edges.\\nI modeled these as simple `Page` nodes and `Links` edges.\\n\\nI start K\xf9zu on my own laptop, which is a Macbook Air 2020 with an Apple M1 chip, 16GB memory,\\nand a 512GB SSD, and run the following two queries (by default, K\xf9zu uses all threads available, which is 8 in this case):\\n\\n```\\n- Q1: K\xf9zu-WCO\\nMATCH (a:Page)-[e1:Links]->(b:Page)-[e2:Links]->(c:Page)-[e3:Links]->(a)\\nRETURN count(*)\\n```\\nThis will compile into a plan that uses a wco Multiway HashJoin operator. I will refer to this\\nplan as K\xf9zu-WCO below. I am also running the following query:\\n```\\n- Q2: K\xf9zu-BJ\\nMATCH (a:Page)-[e1:Links]->(b:Page)\\nWITH *\\nMATCH (b:Page)-[e2:Links]->(c:Page)\\nWITH *\\nMATCH (c)-[e3:Links]->(a)\\nRETURN count(*)\\n```\\n\\nCurrently, K\xf9zu compiles each MATCH/WITH block separately, so this is a hack to force the system\\nto use a binary join plan. The plan will join `e1` `Links` with `e2` `Links` and then\\njoin the result of that with `e3` `Links`, all using binary HashJoin operators. I will\\nrefer to this as K\xf9zu-BJ. Here are the results:\\n\\n| Configuration | Time |\\n|----------|:-------------:|\\n| K\xf9zu-WCO | 1.62s |\\n| K\xf9zu-BJ | 51.17s |\\n\\nThere are ~41M triangles in the output. We see a **31.6x** performance improvement in this simple query. \\nOn larger, densely cyclic queries, binary join plans just don\'t work.\\n\\nTo try this locally, you can download our prepared CSV files from [here](https://github.com/kuzudb/kuzudb.github.io/tree/main/data/web-berkstan), and compile from our [latest master](https://github.com/kuzudb/kuzu)[^2] (`make clean && make release NUM_THREADS=8`).\\nThen start K\xf9zu\'s shell, and load the data into K\xf9zu:\\n```\\n./build/release/tools/shell/kuzu_shell -i web.db\\nkuzu> CREATE NODE TABLE Page (id INT64, PRIMARY KEY (id));\\nkuzu> CREATE REL TABLE Links (FROM Page TO Page, MANY_MANY);\\nkuzu> COPY Page FROM \'web-node.csv\';\\nkuzu> COPY Links FROM \'web-edge.csv\';\\n```\\nNow, run those two queries (K\xf9zu-WCO and K\xf9zu-BJ) to see the difference!\\n\\n## A Thank You & an Anecdote About Knuth\'s Reaction to the Term \\"Worst-case Optimal\\"\\n \\nBefore wrapping up, I want to say thank you to [Chris R\xe9](https://cs.stanford.edu/~chrismre/), who is a\\nco-inventor of the earliest wcoj algorithms. \\nIn the 5th year of my PhD, Chris introduced me to this area, and \\nwe wrote a paper together on the topic in the context of evaluating\\njoins in distributed systems, such as MapReduce and Spark. I ended up working on\\nthese algorithms and trying to make them performant in actual systems\\nfor many more years than I initially predicted. \\nI also want to say thank you to [Hung Ngo](https://hung-q-ngo.github.io/) and [Atri Rudra](https://cse.buffalo.edu/faculty/atri/),\\nwith whom I have had several conversations during those years on these algorithms.\\n\\nFinally, let me end with a fun story about the term \\"worst-case optimal\\": \\nSeveral years ago, [Don Knuth](https://uwaterloo.ca/computer-science/events/dls-donald-knuth-all-questions-answered) was visiting UWaterloo\\nto give a Distinguished Lecture Seminar, which is our department\'s most prestigious \\nlecture series. A colleague of mine and I had a 1-1 meeting with him. 
\\nKnuth will be known to anyone with a CS degree, but importantly, he is\\ncredited with founding the field of algorithm analysis (e.g., for popularizing\\nthe big-oh notation for analyzing algorithms\' performances). \\nIn our meeting, he asked me what I was working on,\\nand I told him about these new algorithms called \\"worst-case optimal join algorithms\\".\\nThe term was so confusing to him that his immediate interpretation \\nwas: \\"Are they so good that they are optimal even in their worst-case performances?\\" \\n\\nThe term actually means that the worst-case runtime of these algorithms\\nmeets a known lower bound for the worst-case runtime of any join algorithm,\\nwhich is $\\\\Omega(IN^{\\\\rho^*})$.\\nProbably a more standard choice would be to call them \\n\\"asymptotically optimal\\", just like people call merge sort an asymptotically optimal \\nsorting algorithm under the comparison model.\\n\\n\\n## Final Words\\nWhat other fundamental algorithmic developments have\\nbeen made in the field of join processing? It is surprising, but there are still major gaps\\nin the field\'s understanding of how fast joins can be processed. \\nThere has been some very interesting \\nwork in an area called *beyond worst-case optimal join algorithms*. These papers\\nask very fundamental questions about joins, such as: how can we prove that a join algorithm\\nis correct, i.e., that it produces the correct output given its input? \\nThe high-level answer is that each join algorithm implicitly produces a proof that its output is correct\\nthrough the comparison operations it makes.\\nThe goal of this line of research is to design practical algorithms whose implicit proofs are optimal,\\ni.e., as small as possible. This is \\nprobably the most ambitious level of optimality one can go for in algorithm design.\\nThere are already some algorithms, e.g., an algorithm called [Tetris](https://dl.acm.org/doi/pdf/10.1145/2967101). The area\\nis fascinating and has deep connections to computational geometry. I\\nadvised a [Master\'s thesis](https://arxiv.org/abs/1909.12102) on the topic once and learned quite a bit about\\ncomputational geometry that I never thought could be relevant to my work. The current\\nbeyond-worst-case-optimal join algorithms, however, are not yet practical. \\nSome brave souls need to get into the space and think hard about whether \\npractical versions of these algorithms can be developed. That would be very exciting.\\n\\nThis completes my 3-part blog series on the contents of our CIDR paper and the 2 core techniques,\\n[factorization](../2023-01-20-factorization/index.md) and worst-case optimal join algorithms, that we have integrated into\\nK\xf9zu to optimize for many-to-many joins. My goal in these blog\\nposts was to explain these ideas to a general CS/software engineering audience, and\\nI hope these posts have made this material more approachable. My other goal\\nwas to show the role of theory in advancing systems. Both of these ideas emerged from\\npen-and-paper theory papers that theoreticians wrote, yet they gave clear advice to DBMS developers.\\nAs I said many times, I\'m convinced that, among many other techniques, these two \\ntechniques need to be integral to any GDBMS that wants to be competitive in performance,\\nbecause queries with many-to-many joins are first-class citizens in the workloads of these systems.\\n\\nWe will keep writing more blog posts in the coming months about our new releases\\nand other technical topics. 
If there are things you\'d like us to write about,\\nplease reach out to us! Also, please give K\xf9zu a try, prototype applications with it,\\nbreak it, and let us know of your performance or other bugs, so we can continue improving\\nit. Give us a [GitHub star](https://github.com/kuzudb/kuzu) too, and take care until the next posts!\\n\\n\\n[^1]: The question is interesting under set semantics, where you cannot pick every column value of every tuple to be the same value; doing so would force a Cartesian product of all the relations.\\n[^2]: We found a minor bug in the latest release 0.0.2 when a node has a very large number of edges, which is fixed in the master branch; that\'s why we suggest using the master branch."},{"id":"kuzu-0.0.2-release","metadata":{"permalink":"/docusaurus/blog/kuzu-0.0.2-release","source":"@site/blog/2023-02-13-kuzu-v-0.0.2.md","title":"K\xf9zu 0.0.2 Release","description":"This post is about the second release of K\xf9zu. However, we want to start with something much more important:","date":"2023-02-13T00:00:00.000Z","formattedDate":"February 13, 2023","tags":[{"label":"release","permalink":"/docusaurus/blog/tags/release"}],"readingTime":6.33,"hasTruncateMarker":true,"authors":[{"name":"K\xf9zu Team","url":"https://github.com/kuzudb/","imageURL":"https://kuzudb.com/img/blog/team.jpg","key":"team"}],"frontMatter":{"slug":"kuzu-0.0.2-release","authors":["team"],"tags":["release"]},"prevItem":{"title":"Why (Graph) DBMSs Need New Join Algorithms: The Story of Worst-case Optimal Join Algorithms","permalink":"/docusaurus/blog/wcoj"},"nextItem":{"title":"Factorization & Great Ideas from Database Theory","permalink":"/docusaurus/blog/factorization"}},"content":"This post is about the second release of K\xf9zu. However, we want to start with something much more important:\\n\\n### Donate to the Victims of the [T\xfcrkiye-Syria Earthquake](https://www.bbc.com/news/world-middle-east-64590946):\\nOur hearts, thoughts, and prayers go to all the victims, those who survived and those who passed,\\nin Syria and T\xfcrkiye. \\nIt will be a very difficult winter for all those who survived, so everyone needs to help. \\nHere are two pointers to trustworthy organizations we know of that are trying to help\\nvictims on the ground. For T\xfcrkiye (where Semih is from), you can donate to [Ahbap](https://ahbap.org/bagis-kategorisi/5)\\n(please be aware that **the donation currency is TL**: 14 TL = 1 CAD; 19 TL = 1 USD); and for Syria, \\nyou can donate to the [White Helmets](https://www.whitehelmets.org/en/). Be generous! We\'ll leave pointers to several \\nother organizations in this footnote[^1].\\n\\n\x3c!--truncate--\x3e\\n\\n## Overview of K\xf9zu 0.0.2\\nBack to our release. The K\xf9zu codebase is changing fast, but this release still has a focus: since the last release, we \\nhave worked quite hard on enabling K\xf9zu to import data from\\nand export data to different formats. There are also several important \\nfeatures in the new Cypher clauses and queries we support, additional string \\nprocessing capabilities, and new DDL statement support. We will give a summary of each \\nof these below.\\n\\nTo install the new version, please visit the [installation guide](https://kuzudb.com/docs/getting-started.html); the full\\n[release notes are here](https://github.com/kuzudb/kuzu/releases). 
If you are eager to play with\\na few Colab notebooks, here are several links: \\n- [General K\xf9zu Demo](https://colab.research.google.com/drive/15OLPggnRSBmR_K9yzq6iAGE5MDzNwqoN)\\n- [Export Query Results to Pytorch Geometric: Node Property Prediction Example](https://colab.research.google.com/drive/1fzcwBwTY-M19p7OOTIaynfgHFcAQo9NK)\\n- [Export Query Results to Pytorch Geometric: Link Prediction Example](https://colab.research.google.com/drive/1QdX7CDdajIAb04lqaO5PfJlpKG-ljG28)\\n- [Export Query Results to NetworkX](https://colab.research.google.com/drive/1NDsnFDWcSGoaOl-mOgG0zrPG2VAr8Q6H)\\n\\n## Exporting Query Results to Pytorch Geometric and NetworkX\\nPerhaps most excitingly, we have added the first capabilities to integrate with 2 popular \\ngraph data science\\nlibraries: (i) [Pytorch Geometric](https://github.com/pyg-team/pytorch_geometric) (PyG) for performing \\ngraph machine learning; and (ii) [NetworkX](https://networkx.org/) for a variety of \\ngraph analytics tasks, including visualization. \\n\\n### Pytorch Geometric: `QueryResult.get_as_torch_geometric()` function\\nOur [Python API](https://kuzudb.com/api-docs/python/) now has a \\nnew [`QueryResult.get_as_torch_geometric()`](https://kuzudb.com/api-docs/python/kuzu/query_result.html#QueryResult.get_as_torch_geometric) function that \\nconverts results of queries to PyG\'s in-memory graph representation \\n[`torch_geometric.data`](https://pytorch-geometric.readthedocs.io/en/latest/modules/data.html).\\nIf your query result contains node and relationship objects, then the function uses \\nthose nodes and relationships to construct either `torch_geometric.data.Data` or \\n`torch_geometric.data.HeteroData` objects. The function also auto-converts any numeric or boolean property \\non the nodes into tensors that can be used as features in the `Data`/`HeteroData` objects.\\nAny property that cannot be auto-converted, as well as the edge properties, is also returned in case you\\nwant to manually put them into the `Data`/`HeteroData` objects.\\n\\n**Colab Demonstrations:**\\nHere are 2 Colab notebooks that you can play around with to see how you can develop graph learning\\npipelines using K\xf9zu as your GDBMS:\\n1. [Node property prediction](https://colab.research.google.com/drive/1fzcwBwTY-M19p7OOTIaynfgHFcAQo9NK)\\n2. [Link prediction](https://colab.research.google.com/drive/1QdX7CDdajIAb04lqaO5PfJlpKG-ljG28)\\n\\nThe examples demonstrate how to extract a subgraph,\\ntrain graph convolutional or neural networks (GCNs or GNNs), make node property\\nor link predictions, and save them back in K\xf9zu so you can query these predictions.\\n\\n### NetworkX: `QueryResult.get_as_networkx()` function\\nOur [Python API](https://kuzudb.com/docs/client-apis/python-api/overview.html) now has a \\nnew [`QueryResult.get_as_networkx()`](https://kuzudb.com/api-docs/python/kuzu/query_result.html#QueryResult.get_as_networkx) function that can convert query results\\nthat contain nodes and relationships into NetworkX directed or undirected graphs. Using this function, you can build pipelines\\nthat benefit from K\xf9zu\'s DBMS functionalities (e.g., querying, data extraction and transformations,\\nusing a high-level query language with very fast performance) and NetworkX\'s rich library of \\ngraph analytics algorithms.\\n\\n
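A minimal sketch of what such a pipeline can look like (the database path and query here are illustrative; see the linked API docs and the Colab notebook below for authoritative usage):\\n\\n```python\\nimport kuzu\\nimport networkx as nx\\n\\ndb = kuzu.Database(\'./testdb\')\\nconn = kuzu.Connection(db)\\nresult = conn.execute(\'MATCH (u:User)-[f:Follows]->(v:User) RETURN u, f, v;\')\\n\\ng = result.get_as_networkx(directed=True)  # a NetworkX DiGraph\\npageranks = nx.pagerank(g)                 # any NetworkX algorithm applies\\n```\\n\\n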
**Colab Demonstration:**\\nHere is a [Colab notebook](https://colab.research.google.com/drive/1NDsnFDWcSGoaOl-mOgG0zrPG2VAr8Q6H?usp=sharing#scrollTo=AkpBul7ZpUM5) \\nthat you can play around with, which shows how to do basic graph visualization of query results\\nand how to build a pipeline that computes PageRanks of a subgraph, stores those PageRank \\nvalues back as new node properties in K\xf9zu, and queries them.\\n\\n## Data Import from and Export to Parquet and Arrow\\nWe have removed our own CSV reader and instead now use [Arrow](https://arrow.apache.org/)\\nas our default library when bulk importing data through [`COPY FROM` statements](https://kuzudb.com/docs/data-import/csv-import.html). \\nUsing Arrow, we can bulk import not only\\nfrom CSV files but also from Arrow IPC and Parquet files. We detect the file type\\nfrom the suffix of the file; so if the query says `COPY user FROM ./user.parquet`,\\nwe infer that this is a Parquet file and parse it as such. See the details [here](/docusaurus/data-import/parquet-import).\\n\\n## Multi-labeled or Unlabeled Queries\\nA very useful feature of the query languages of GDBMSs is their\\nability to elegantly express unions of join queries. \\nWe had written about this feature of GDBMSs in this blog post about \\n[What Every Competent GDBMS Should Do](./2023-01-12-what-every-gdbms-should-do/index.md)\\n(see the last paragraph of Section `Feature 4: Schema Querying`).\\nIn Cypher, a good example\\nof this is to not bind the node and relationship variables to specific node/relationship\\nlabels/tables. Consider this query:\\n```\\nMATCH (a:User)-[e]->(b)\\nWHERE a.name = \'Karissa\'\\nRETURN a, e, b\\n```\\nThis query asks for all types of relationships that Karissa can have to any possible other\\nnode (not necessarily of label `User`) in the database. So if the database contains \\n`Likes` relationships from `Users` to `Comments`, `Follows` relationships\\nfrom `Users` to `Users`, and `LivesIn` relationships from `Users` to `Cities`, \\nthe variables e and b can bind to records from all of these\\nrelationship and node labels, respectively. \\n\\nYou can also restrict the labels of nodes/rels to a fixed set that contains\\nmore than one label using the `|` syntax.\\nFor example, you can do:\\n\\n```\\nMATCH (a:User)-[e:Likes|Follows]->(b)\\nWHERE a.name = \'Karissa\'\\nRETURN a, e, b\\n```\\nThis forces e to match only `Likes` or `Follows` relationship records (so it\\nexcludes the `LivesIn` records we mentioned above). The `|` syntax was originally adapted from\\nregexes and is also used in query languages that support `regular path queries`. \\n\\nK\xf9zu now supports such queries. Our query execution\\nis based on performing scans of each possible node/rel table and index, \\nand when a variable `x` can bind to multiple node/rel tables `L1, L2, ..., Lk`,\\nwe reserve one vector for each possible property of each node/rel table. \\nIf anyone has any optimizations to do something smarter, it would be very interesting\\nto hear!\\n\\n## Other Important Changes\\n\\n### Enhanced String Features\\nWe\'ve added two important features to enhance K\xf9zu\'s ability to store and process strings:\\n\\n1) Support for UTF-8 characters. 
With the help of [utf8proc](https://github.com/JuliaStrings/utf8proc), you can now store string node/relationship\\n properties in K\xf9zu that contain UTF-8 characters;\\n2) Support for [regex pattern matching](/docusaurus/cypher/expressions/pattern-matching) on strings. K\xf9zu now supports Cypher\'s `=~` operator for regex searches, which returns true if its pattern matches the entire input string. For example: `RETURN \'abc\' =~ \'.*(b|d).*\';`.\\n\\n### CASE Expression\\nWe\'ve added [CASE](/docusaurus/cypher/expressions/case-expression) for conditional expressions.\\nTwo forms ([Simple Form](/docusaurus/cypher/expressions/case-expression#simple-form) and [General Form](/docusaurus/cypher/expressions/case-expression#general-form)) of the CASE expression are supported.\\n\\n### ALTER/DROP/SET/DELETE\\nWe added [ALTER TABLE](/cypher/data-definition/alter) and [DROP TABLE](/cypher/data-definition/drop) DDL statements.\\nAfter creating a new node or relationship table, you can now drop it, rename it, or alter it by adding new columns/properties \\nor renaming or dropping existing columns/properties.\\n\\nBesides schema-level changes, you can change the properties of existing nodes/rels with [SET](/docusaurus/cypher/data-manipulation-clauses/set) statements, and remove existing nodes/rels with [DELETE](/docusaurus/cypher/data-manipulation-clauses/delete) statements.\\n\\n### Disabling Relationships with Multiple Source or Destination Labels\\nWe no longer support defining a relationship between multiple source or destination labels.\\nThis is to simplify our storage, but please let us know if you have strong use cases for this feature.\\n\\nEnjoy our new release, and don\'t forget to donate to the earthquake victims.\\n\\n[^1]: For T\xfcrkiye, two other organizations are [AFAD](https://en.afad.gov.tr/earthquake-campaign), which is the public institute for coordinating natural disaster response, and [Akut](https://www.akut.org.tr/en/donation), a volunteer-based and highly organized search and rescue group. For Syria, another campaign I can recommend is [Molham Team](https://molhamteam.com/en/campaigns/439?fbclid=IwAR3_t443XME9Gh0r75KM4VpQ58WLNPd8w8tyMV2JprdObwecPwhWAdX2FOQ), an organization founded by Syrian refugee students."},{"id":"factorization","metadata":{"permalink":"/docusaurus/blog/factorization","source":"@site/blog/2023-01-20-factorization/index.md","title":"Factorization & Great Ideas from Database Theory","description":"Many of the core principles of how to develop DBMSs are well understood.","date":"2023-01-20T00:00:00.000Z","formattedDate":"January 20, 2023","tags":[{"label":"internals","permalink":"/docusaurus/blog/tags/internals"}],"readingTime":22.71,"hasTruncateMarker":true,"authors":[{"name":"Semih Saliho\u011flu","title":"CEO of K\xf9zu Inc & Associate Prof. 
at UWaterloo","url":"https://cs.uwaterloo.ca/~ssalihog/","imageURL":"https://kuzudb.com/img/blog/semih.jpg","key":"semih"}],"frontMatter":{"slug":"factorization","authors":["semih"],"tags":["internals"]},"prevItem":{"title":"K\xf9zu 0.0.2 Release","permalink":"/docusaurus/blog/kuzu-0.0.2-release"},"nextItem":{"title":"What Every Competent GDBMS Should Do (aka The Goals & Vision of K\xf9zu","permalink":"/docusaurus/blog/what-every-gdbms-should-do-and-vision"}},"content":"import TwoHopDataImage from \'./2-hop-data.png\';\\nimport TwoHopQueryPlanHashJoinImage from \'./2-hop-query-plan-hash-join.png\';\\nimport TwoHopQueryPlanExtendImage from \'./2-hop-query-plan-extend.png\';\\nimport TwoHopFactorizationExperimentImage from \'./2-hop-factorization-experiment.png\';\\nimport FlatVsFactorizedImage from \'./flat-vs-factorized.png\';\\nimport FactorizedVectorsImage from \'./factorized-vectors.png\';\\nimport FactorizedExecutionSimulationImage from \'./factorized-execution-simulation.png\';\\n\\n\\nMany of the core principles of how to develop DBMSs are well understood.\\nFor example, a very good query compilation paradigm is to \\nmap high-level queries to a logical plan of relational operators, then optimize this plan,\\nand then further map it to an executable code often in the form of a physical query plan. \\nSimilarly, if you want updates to a DBMS to be atomic and durable,\\na good paradigm is to use a write-ahead log that serves as a source of truth\\nand can be used to undo or redo operations. Many systems adopt such common wisdom paradigms. \\nAs core DBMS researcher, once in a while however, you run into a very simple idea \\nthat deviates from the norm that gets you very excited. \\nToday, I want to write about one such idea called [factorization](https://www.cs.ox.ac.uk/dan.olteanu/papers/os-sigrec16.pdf). \\n\\n\x3c!--truncate--\x3e\\n\\n:::tip Tldr: The key takeaways are:\\n- **Overview of Factorization & Why Every GDBMS Must Adopt It**: Factorization\\n is a compression technique to compress the intermediate results\\n that query processors generate when evaluating many-to-many (m-n) joins. \\n Factorization can compress an intermediate result size exponentially \\n in the number m-n joins in the query.\\n- **Example Benefits of Factorization**: Benefits of keeping intermediate\\n results smaller reduces the computation processors perform \\n on many queries. Examples include reducing copies by keeping the output\\n data size small, reducing filter and expression evaluation computations exponentially,\\n and performing very fast aggregations.\\n- **How K\xf9zu Implements Factorization:** K\xf9zu\'s query processor\\n is designed to achieve 3 design goals: (i) factorize intermediate results;\\n (ii) always perform sequential scans of database files; and (iii) avoid\\n scanning large chunks of database files when possible. In addition, the processor is \\n vectorized as in modern columnar DBMSs. These design goals are achieved by passing \\n multiple *factorized vectors* between each other and using modified HashJoin operators \\n that do *sideways information passing* to avoid scans of entire files.\\n:::\\n\\nThis is a quite technical and long blog post and will appeal more to people who are interested\\nin internals of DBMSs. It\'s about a technique that\'s quite dear to my heart called factorization,\\nwhich is a very\\nsimple data compression technique. Probably all \\ncompression techniques you know are designed to compress database files that \\nare stored on disk. 
Think of run-length encoding, dictionary compression, or bitpacking.\\nIn contrast, you can\'t use factorization to compress your raw database files. \\nFactorization has a very unique property:\\nit is designed to compress the intermediate \\ndata that are generated when query processors of DBMSs evaluate \\nmany-to-many (m-n) growing joins. If you have read [my previous blog](../2023-01-12-what-every-gdbms-should-do/index.md),\\nefficiently handling m-n joins was one of the items on my list of properties \\nthat competent GDBMSs should excel in. This is because \\nthe workloads of GDBMSs commonly contain m-n joins\\nacross node records. Each user in a social network or account in a financial transaction network\\nwill have thousands of connections, and if you want\\na GDBMS to find patterns in your graphs, you are \\nasking queries with m-n joins. Factorization is directly designed\\nfor these workloads, and because of that, every competent GDBMS must develop \\na factorized query processor. In fact, if I were to try to write a new analytical RDBMS,\\nI would probably also integrate factorization into it.\\n\\nThis post forms the 2nd part of my 3-part posts on the contents of our [CIDR paper](https://www.cidrdb.org/cidr2023/papers/p48-jin.pdf)\\nwhere we introduced K\xf9zu. The 3rd piece will be on another technique called worst-case \\noptimal join algorithms, which is also designed for a specific class of m-n joins.\\nBoth in this post and the next, I have two goals. The first is to articulate these techniques \\nin language that is accessible to general software engineers. \\nThe second is to make people appreciate the role of \\npen-and-paper theory in advancing the field of DBMSs. Both of these techniques were first \\narticulated in a series of purely theoretical papers which gave excellent \\npractical advice on how to improve DBMS performance. \\nCredit goes to the great theoreticians who pioneered these techniques, whom I will cite\\nin these posts. Their work should be highly appreciated.\\n\\n## A Quick Background: Traditional Query Processing Using Flat Tuples\\nHere is a short background on the basics of\\nquery processors before I explain factorization. If you know about \\nquery plans and how to interpret them,\\nyou can skip to [here](#factorization-in-a-nutshell) after reading\\nmy running example.\\nConsider a database of Account node and Transfer edge records below.\\nThe two Accounts with `accID` fields L1 and L2 are owned by Liz and \\neach have 100 incoming and 100 outgoing Transfer edges.\\n\\n
\\n\\n
\\n\\nNow consider a 2-hop path query in Cypher returning the accIDs of the sources\\nand destinations of the money flows Liz\'s accounts are facilitating:\\n\\n``` \\nMATCH (a:Account)-[t1:Transfer]->(b:Account)-[t2:Transfer]->(c:Account)\\nWHERE b.name = \'Liz\' \\nRETURN a.accID, c.accID\\n```\\n\\nHere\'s the SQL version of the query if you modeled your records as relations.\\nSame query, different syntax:\\n```\\nSELECT a.accID, c.accID\\nFROM Account a, Transfer t1, Account b, Transfer t2, Account c\\nWHERE b.name = \'Liz\' AND\\n t1.src = a.accID AND t1.dst = b.accID AND\\n t2.src = b.accID AND t2.dst = c.accID\\n```\\n\\nA standard query plan for this query is shown on the left in Fig. 2. \\nThe plan contains some Scan operators to scan the raw node or edge records (edges could be \\nscanned from a join index), some hash join operators to perform the joins, and \\na final projection operator.\\nIn some GDBMSs, you might see \\"linear plans\\" that look as in Fig. 3.\\n\\n

\\n \\n \\n

\\n\\nThe linear plan is from our previous GraphflowDB system. Here\\nyou are seeing an operator called Extend, which joins node records with their Transfer relationships to \\nread the system-level IDs of the neighbors of those node records. \\nFollowing the Extend is another Join operator to join the accID properties of those neighbors \\n(specifically c.accID and a.accID). \\nIn Neo4j, you\'ll instead see an Expand(All) operator, which does GraphflowDB\'s Extend+Join\\nin a single operator[^1]. For very good reasons,\\nwe removed these Extend/Expand-type operators in K\xf9zu. I will come back to this.\\n\\nThe interpretation of these plans is that tuples flow from bottom to top, and\\neach operator takes in sets of tuples and produces sets of tuples (in a pipelined fashion). \\nThe key motivation for factorization is that what flows \\nbetween operators are **flat tuples**. When the joins are m-n, this \\nleads to many data repetitions, which one way or another lead to repeated\\ncomputation in the operators. For example,\\nthe final projection operator in our example would take the table shown in Figure 4 (left).\\n
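\\nAs a toy rendering of that flat input (with illustrative neighbor IDs; both of Liz\'s accounts get the same synthetic neighbor sets here):\\n\\n```python\\n# One flat tuple per 2-path through each of Liz\'s accounts.\\na_ids = [f\'a{i}\' for i in range(100)]  # 100 incoming neighbors\\nc_ids = [f\'c{i}\' for i in range(100)]  # 100 outgoing neighbors\\nflat = [(a, b, c) for b in (\'L1\', \'L2\') for a in a_ids for c in c_ids]\\nprint(len(flat))  # 20000 tuples flow into the projection operator\\n```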
\\n\\n
\\n\\n\\nThere are 20K tuples in the flat representation because both L1 and L2 are part of \\n100 incoming x 100 outgoing=10K many 2-paths. Notice the many repetitions in this relation:\\nL1, L2, or Liz values, or the values in a.accID and c.accID. \\nWhat gets replicated may change across systems. Some may replicate the actual values,\\nsome may replicate indices where these values are stored but overall exactly 20K\\ntuples would be generated. This redundancy leads to redundant computation here and there\\nduring query processing.\\n\\n## Factorization In a Nutshell\\nFactorization addresses exactly this problem. The core reason for the redundancy\\nis this observation: *given a fixed b value, all a\'s and c\'s are conditionally independent*.\\nMore concretely, once b is bound to node L1, each incoming neighbor `a` for L1 will join \\nwith each outgoing neighbor `c` of L1. If you took the first standard undergraduate course in DBMSs at a university\\nand you covered the theory of normalization, this is what is \\ncalled a [multi-valued dependency](https://en.wikipedia.org/wiki/Multivalued_dependency)\\nin relations. Factorization exploits such dependencies to compress\\nrelations using Cartesian products.\\nAbove in Figure 4 (right),\\nI\'m showing the same 20K tuples in a factorized format using only 400 values\\n(so 2\\\\*(100+100) instead of 2\\\\*100\\\\*100 values). \\n\\nThat\'s it! That\'s the core of the idea! Now of course, this simple observation leads to a ton of \\nhard and non-obvious questions that the entire theory on factorization answers. For example, \\ngiven a query, what are the \\"factorization structures\\", i.e., the Cartesian product structures\\nthat can be used to compress it? Consider a simple query that counts the number of\\npaths that are slightly longer:\\n```\\nMATCH (a:Account)-[:Wire]->(b:Account)-[:Deposit]>(c:Account)-[:ETransfer]->(d:Account)\\nRETURN count(*)\\n```\\nShould you condition on b and factor out \\na\'s from (c, d)\'s or condition on c and factor out (a, b)\'s from d\'s? \\nOr you could condition on (b, c) and factor out (a)\'s from (d)\'s?\\nTo make a choice, a system has to reason about the number of Wire, Deposit,\\nand ETransfer records in the database.\\nHow much and on which queries can you benefit from factorization?\\nThe theoretical questions are endless. \\nThe theory of factorization develops the formal foundation so that such questions can be answered and \\nprovides principled first answers to these questions. \\n[Dan Olteanu](https://www.ifi.uzh.ch/en/dast/people/Olteanu.html) and his \\ncolleagues, who lead this field, recently won the [ICDT test of time award](https://databasetheory.org/ICDT/test-of-time)\\nfor their work on factorization. ICDT is one of the two main \\nacademic venues for theoretical work on DBMSs.\\n\\nBut let\'s take a step back and appreciate this theory because it gives an excellent \\nadvice to system developers: *factorize your intermediate\\nresults if your queries contain many-to-many joins!* \\nRecall that GDBMSs most commonly evaluate many-to-many joins. So hence my point that \\nGDBMSs should develop factorized query processors.\\nThe great thing this theory shows us is that this can all be done by static analysis of the query \\nduring compilation time by only inspecting the dependencies between variables in\\nthe query! 
## Examples When Factorization Significantly Benefits:\\nFactorized intermediate relations can be exponentially smaller\\n(in terms of the number of joins in the query)\\nthan their flat versions, which \\ncan yield orders-of-magnitude speedups in query performance \\nfor many different reasons. I will discuss the three most obvious ones.\\n\\n### Less Data Copies/Movement \\nThe most obvious benefit is that factorization reduces\\nthe amount of data copied between the buffers used by operators\\nduring processing and into the final `QueryResult` structure\\nthat the application gets access to. For example, a very cool feature of K\xf9zu \\nis that it keeps final outputs in factorized format in its `QueryResult` class and \\nenumerates them one by one only when the user starts calling `QueryResult::getNext()`\\nto read the tuples.\\nIn our running example, throughout processing, K\xf9zu would copy roughly\\n400 data values instead of 20K to produce its `QueryResult`. \\nNeedless to say, I could have picked a more exaggerated query, say a \\"star\\" query\\nwith 6 relationships, and arbitrarily increased the difference in the copies done \\nbetween a flat vs factorized processor.\\n\\n### Fewer Predicate and Expression Evaluations\\nFactorization can also reduce the number of predicate or expression evaluations the system performs.\\nSuppose we modify our 2-hop query a bit and put two additional filters on the query:\\n```\\nMATCH (a:Account)-[e1:Transfer]->(b:Account)-[e2:Transfer]->(c:Account)\\nWHERE b.name = \'Liz\' AND a.balance > b.balance AND c.balance > b.balance\\nRETURN *\\n```\\nI\'m omitting a plan for this query, but a common plan would extend the plan in Figure 2 (or 3) above\\nto also scan the balance properties and to run two filter operations: \\n(i) above the join that joins a\'s and b\'s,\\nto run the predicate `a.balance > b.balance`; and (ii) after the final join in Figure 2,\\nto run the predicate `c.balance > b.balance`. Suppose the first filter did not eliminate any tuples.\\nThen, a flat processor would perform 20K predicate evaluations in the second filter.\\nIn contrast, the input to the second filter operator in a factorized processor \\nwould be the 2 factorized tuples \\nshown in Figure 4 (right), but extended with the `balance` properties\\nof a, b, and c. Therefore there would be only 200 predicate evaluations: (i) \\nfor the first factorized tuple, there are only\\n100 comparisons to evaluate `c.balance > b.balance`, since b is matched to a single\\nvalue and there are 100 c values; and (ii) similarly for the 2nd factorized tuple.\\nWe can obtain similar benefits when evaluating other expressions.\\n\\n### Aggregations\\nThis is perhaps where factorization yields the largest benefits.\\nOne can perform several aggregations directly on factorized tuples using the\\n algebraic properties of several aggregation functions. Let\'s\\nfor instance modify our above query to a count(\\\\*) query: find the number of 2-paths that Liz is \\nfacilitating. 
A factorized processor can simply count that there are 100\\\\*100 flat tuples in the first\\nfactorized tuple, and similarly in the second one, to compute that the answer is 20K.\\nOr consider doing min/max aggregations on factorized variables:\\n```\\nMATCH (a:Account)-[e1:Transfer]->(b:Account)-[e2:Transfer]->(c:Account)\\nWHERE b.accID = \'L1\'\\nRETURN max(a.balance), min(c.balance)\\n```\\nThis is asking: among the 2-path money flows that Liz\'s L1 account facilitates, find the highest\\nsource-account balance and the lowest destination-account balance (and only print the balances). If a processor \\nprocesses the 10K 2-paths that L1 is part of in factorized form, then \\nthe processor can compute the max and min aggregations\\nwith only 100 comparisons each (instead of 10K comparisons each). \\n\\nIn short, factorizing intermediate results \\nreduces computation and data copies here and there in many cases.\\nYou can try some of these queries on K\xf9zu and compare its performance on large \\ndatasets with non-factorized systems. \\n\\n## How Does K\xf9zu Perform Factorized Query Processing?\\nThe rest will be even more technical and forms part of the technical meat of our CIDR paper; \\nso continue reading if you are interested in database implementations.\\nWhen designing the query processor of K\xf9zu, we had 3 design goals: \\n1. Factorize intermediate growing join results. \\n2. Always perform sequential scans of database files from disk.\\n3. When possible, avoid scanning entire database files from disk.\\n\\nThe 3rd design goal requires some motivation, which I will provide below. Let\'s go one by one.\\n\\n### 1. Factorization \\nK\xf9zu has a vectorized query processor, which is the common wisdom\\nin analytical read-optimized systems. \\n\\nVectorization, in the context of DBMS query processors, \\nrefers to the design where operators pass a set of tuples, e.g., 1024 or 2048 of them, \\nbetween each other during processing[^2]. Existing vectorized query processors (in fact the \\nprocessors of all systems I\'m aware of) pass *a single vector of flat tuples*.\\nInstead, K\xf9zu\'s operators pass (possibly) multiple *factorized vectors of tuples* \\nbetween each other. Each vector can either be *flat* and represent a single value, or \\n*unflat* and represent a set of values, which is marked in a field called `curIdx`\\nassociated with each vector.\\nFor example, the first 10K tuples from my running example would be represented\\nwith the 3 factorized vectors shown on the left and would be passed to the final projection\\nin the query plan in Figure 2.\\nThe interpretation is this: what is passed is the Cartesian product of all the sets of\\ntuples in those vectors. Operators know at compilation time how many vector\\ngroups they will take in and how many they will output. Importantly, we still\\ndo vectorized processing, i.e., each primitive operator operates on a vector of values\\ninside tight for loops. \\nCredit where credit\'s due: this simple-to-implement design was proposed \\nby my PhD student [Amine Mhedhbi](http://amine.io/), with some feedback from \\nme, my ex-Master\'s student \\n[Pranjal Gupta](https://www.linkedin.com/in/g31pranjal/?originalSubdomain=in),\\nand [Xiyang Feng](https://www.linkedin.com/in/xingyang-feng-14198491/?originalSubdomain=ca), \\nwho is now a core developer of K\xf9zu. \\nWe directly adopted it in K\xf9zu. Amine has continued doing other excellent\\nwork on factorization, which we have not yet integrated, and you\\nwill need to wait until his PhD thesis is out.\\n\\n### 2. 
Ensuring Sequential Scans\\nI already told you above that \\nExtend/Expand-type join operators lead to non-sequential scans of database files.\\nThese operators are not robust, and if you are developing a disk-based system,\\nnon-sequential scans will kill you on many queries; relying on such operators is a mistake. Instead, \\nK\xf9zu uses (modified) HashJoins, which are much more robust. HashJoins do not perform any scans\\nas part of the actual join computation, so if the downstream scans\\nare sequential, you get sequential scans. I\'ll give a simulation momentarily.\\n\\n### 3. Avoiding Full Scans of Database Files\\nAlthough I don\'t like Extend/Expand-type join operators,\\nthey have a performance advantage. Suppose you had a simple 1-hop query that only asked for\\nthe names of the accounts that Liz\'s L1 account has transferred money to:\\n```\\nMATCH (a:Account)-[:Transfer]->(b:Account)\\nWHERE a.accID = \'L1\'\\nRETURN b.name\\n```\\nSuppose your database has billions of transfers but L1 has made only 3 transfers, to\\naccounts with system-level record/node IDs 107, 5, and 15. Then, if you had\\na linear plan like the one I showed in Figure 3, an Extend/Expand-type\\noperator could read these system-level IDs and then only scan\\nthe name properties of these 3 nodes, avoiding the full scan of the names\\nof all Accounts. If your query needs to read the neighborhoods of millions of nodes, \\nthis type of computation that \\"reads the properties of each node\'s neighbors\\"\\nwill degrade very quickly because: (i) each neighborhood \\nof each node will require reading\\ndifferent parts of the disk files that store those properties; and (ii)\\nthe system might repeatedly read the same properties over and over from disk,\\nas nodes share neighbors.\\nInstead, you want to\\nread all of the properties once, create a hash table, and read those properties\\nfrom memory. \\nHowever, if your query is accessing the neighborhoods of only a few nodes,\\nthen avoiding the scan of the entire database file is an advantage.\\nIn K\xf9zu, we wanted to use HashJoins, but we also wanted a mechanism to scan \\nonly the necessary parts of database files. We\\ndo this through a technique called *sideways information passing*[^3]. \\nI\'ll simulate this below.\\n\\n### A Simple Simulation\\nFor simplicity, we\'ll work on a simpler 1-hop query, so the benefits of factorization will not \\nbe impressive, but it will allow me to explain an entire query processing pipeline.\\nConsider this count(\\\\*) query that counts the number of transfers the L1 account has made:\\n```\\nMATCH (a:Account)-[t1:Transfer]->(b:Account)\\nWHERE a.accID = \'L1\'\\nRETURN count(*)\\n```\\nAn annotated query plan we generate is shown below. The figure shows step by step\\nthe computation that will be performed and the data that will be passed between operators.\\nFor this simulation, I am assuming that the record/nodeIDs of Accounts are as in \\nFigure 1a above.\\n\\n\\n\\n1. A Scan operator will scan the accID column and find the records of\\nnodes with accID=L1. There is only 1 tuple, (199, L1), that will be output.\\n2. This tuple will be passed to HashJoin\'s build side, which will create a hash table from it.\\n3. At this point, the processor knows exactly the IDs of the nodes whose Transfer edges need\\nto be scanned on the probe side: only the edges of the node with ID 199! This is where we \\ndo sideways information passing.\\nSpecifically, the HashJoin constructs and passes a \\"nodeID filter\\" (effectively a bitmap) \\nto the probe-side Scan operator. 
Here, I'm assuming the database has 1M Accounts, but as you
can see, only position 199 is set to 1 and the others are 0.
4. The probe-side Scan uses the filter to scan only
the edges of 199 and avoids
scanning the entire Transfers file.
Since Kùzu is a GDBMS, we store the edges of nodes (and their properties)
in a graph-optimized format called [CSR](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)).
Importantly, all of the edges of 199 are stored consecutively and we output them in factorized format as:
[(199) X {201, 202, ..., 300}].
5. The next step could be skipped in a more optimized system, but currently we probe the [(199) X {201, 202, ..., 300}]
 tuple in the hash table and produce [(199, L1) X {201, 202, ..., 300}], which is passed to the
 final aggregation operator.
6. The aggregation operator counts that there are 100 "flat" tuples in [(199, L1) X {201, 202, ..., 300}], simply
 by inspecting the size of the 2nd vector {201, 202, ..., 300} in the tuple.

As you can see, the processing was factorized, we only did sequential scans,
and we also avoided scanning the entire Transfer database file, achieving all 3 of our design goals.
This is a simplified example; there are many more complex queries for which we
have more advanced modified hash join operators. But the simulation presents all of the core techniques
in the system (a compact sketch of the whole pipeline follows below). You can read our [CIDR paper](https://www.cidrdb.org/cidr2023/papers/p48-jin.pdf)
if you are curious about the details!
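Here is a compact Python sketch of this pipeline. Everything is deliberately simplified and none of the names come from Kùzu's code: the nodeID filter is a plain Python list rather than a real bitmap, and the CSR is just an offsets list plus a neighbors list. The point is only to show how the build side's node IDs let the probe side skip all irrelevant CSR ranges while the final count is computed on the factorized tuple:

```
# Toy database: 1M Account nodes; only node 199 (accID "L1") has edges.
NUM_NODES = 1_000_000
acc_id_column = {199: "L1"}              # sparse stand-in for the accID column

# CSR: the edges of node v live in neighbors[offsets[v]:offsets[v + 1]].
neighbors = list(range(201, 301))        # node 199's 100 transfers, stored consecutively
offsets = [0] * 200 + [len(neighbors)] * (NUM_NODES - 199)

# Steps 1-2: the build side scans the accID column and hashes the matching tuple.
build_table = {nid: acc for nid, acc in acc_id_column.items() if acc == "L1"}

# Step 3: sideways information passing -- a nodeID filter for the probe side.
node_filter = [False] * NUM_NODES
for nid in build_table:
    node_filter[nid] = True

# Step 4: the probe-side Scan touches only the CSR ranges of filtered nodes
# and outputs factorized tuples such as [(199) X {201, ..., 300}].
factorized = [
    (nid, neighbors[offsets[nid]:offsets[nid + 1]])
    for nid in range(NUM_NODES)
    if node_filter[nid]
]

# Steps 5-6: probe the hash table, then count flat tuples by set size.
total = sum(len(nbrs) for nid, nbrs in factorized if nid in build_table)
print(total)                             # 100
```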
### Example Experiment
How does it all perform? Quite well! Specifically, this type of processing is quite robust.
Here's an experiment from our CIDR paper to give a sense of the behavior of
using modified hash joins and factorization on a micro-benchmark query. The benchmark
runs a 2-hop query with aggregations on every node variable, on
an [LDBC](https://ldbcouncil.org/benchmarks/snb/)
social network benchmark (SNB) dataset at scale factor 100 (so ~100GB of database). LDBC SNB
models a social network where users post comments and react to these comments.
```
MATCH (a:Comment)<-[:Likes]-(b:Person)-[:Likes]->(c:Comment)
WHERE b.ID < X
RETURN min(a.ID), min(b.ID), min(c.ID)
```
Needless to say, we are picking this because it is a simple query that can demonstrate
the benefits of all 3 of the techniques above. Also needless to say, we could have exaggerated
the benefits by picking
larger stars or branched tree patterns, but this will do.
In the experiment we change the selectivity of the predicate on the middle node, which
changes the output size. What we compare is the behavior of Kùzu, which integrates
the 3 techniques above, with: (i) Kùzu-Extend, a configuration of Kùzu that uses factorization but, instead of
our modified HashJoins, uses an Extend-like operator;
and (ii) [Umbra](https://umbra-db.com/)[^4], which represents the
state of the art in RDBMSs. Umbra is as fast as existing RDBMSs get. It probably integrates
every known low-level performance technique in the field.
Umbra, however, does not
do factorization or have a mechanism to avoid scanning entire database files, so we
expect it to perform poorly on the above query.

Here's the performance table.

*(Figure: performance table from the CIDR paper, comparing Kùzu, Kùzu-Extend, and Umbra as the selectivity of the predicate on b increases.)*

When the selectivity is very low, Extend-like operators + factorization do quite well
because they don't yet suffer much from non-sequential scans and they avoid several overheads
of our modified hash joins: no hash table creation and no semijoin filter mask creation.
But they are not robust and degrade quickly. We can also see that even if you're Umbra,
without factorization or a mechanism to avoid scanning entire files,
you will not perform very well on these queries with m-n joins (even if there are only 2 of them here).
We conducted several other experiments, all demonstrating the robustness and scalability
of factorized processing using modified hash join operators. I won't cover them here, but
they are all in [our CIDR paper](https://www.cidrdb.org/cidr2023/papers/p48-jin.pdf).

## Final Marks
I am convinced that modern GDBMSs have to be factorized systems to remain
competitive in performance. If your system assumes that most joins will be growing,
factorization is one of a handful of modern techniques for such workloads
whose principles are relatively well understood
and that one can actually implement in a system. I am sure different factorized query processors will
be proposed as more people attempt it. I was happy to see at CIDR that at least 2 systems
gurus told me they want to integrate factorization into their systems.
If someone proposes a technique that can, on some queries,
lead to exponential computation reductions even in pen-and-paper theory, it is a good sign
that for many queries it can make the difference between a system timing out vs. providing
an actual answer.

Finally, there is much more to the theory of factorization, which I did not cover. From my side,
most interestingly, there
are even more compressed ways to represent the intermediate results than the
vanilla Cartesian product scheme I covered in this post. Just to raise some curiosity: what I have
in mind is called
[d-representations](https://fdbresearch.github.io/principles.html), but that will have to wait
for another time. For now, I invite you to check out our performance on large queries
and let us know if we are slow on some queries! The Kùzu team says hi (👋 🙋‍♀️ 🙋🏽) and
is at your service to fix all performance bugs as we continue implementing the system!
My next post will be about the novel [worst-case optimal join algorithms](../2023-02-22-wcoj/index.md), which emerged
from another theoretical insight on m-n joins! Take care until then!

[^1]: If you come from a very graph-focused background and/or have been exposed to a ton of GDBMS marketing, you might react to my statement that what I am showing are standard plans that do joins. Maybe you expected to see graph-specific operators, such as a BFS or a DFS operator, because the data is a graph. Or maybe someone even dared to tell you that GDBMSs don't do joins but they do traversals. Stuff like that. These word tricks and this confusing jargon really have to stop; they help no one. If joins are in the nature of the computation you are asking a DBMS to do, calling it something else won't change the nature of the computation. Joins are joins. Every DBMS needs to join its records with each other.

[^2]: Vectorization emerged as a design in the context of columnar RDBMSs, which are analytical systems, about 15-20 years ago. It is still a very good idea. The prior design, called Volcano-style tuple-at-a-time processing, was to pass a single tuple between operators; it is quite easy to implement but quite inefficient on modern CPUs. If you have access to the following link, you can read all about it from the pioneers of [columnar RDBMSs](https://www.nowpublishers.com/article/Details/DBS-024).
[^3]: Note that GDBMSs are able to avoid scanning entire files because they do the join on internal record/node IDs, which mean something very specific. If a system needs to scan the name property of the node with record/node ID 75, it can often arithmetically compute the disk page and offset where this is stored, because record IDs are dense, i.e., start from 0, 1, 2, ..., and so can serve as pointers if the system's storage design exploits this. This is what I was referring to as "predefined/pointer-based joins" in my [previous blog post](../2023-01-12-what-every-gdbms-should-do/index.md). This is a good feature of GDBMSs that allows them to efficiently evaluate the joins of node records that happen along the "predefined" edges in the database. I don't know of a mechanism by which RDBMSs can do something similar, unless they develop a mechanism to convert value-based joins to pointer-based joins. See my student [Guodong's work last year in VLDB](https://www.vldb.org/pvldb/vol15/p1011-jin.pdf) on how this can be done. In Kùzu, our sideways information passing technique follows Guodong's design in this work.

[^4]: Umbra is being developed by [Thomas Neumann](https://www.professoren.tum.de/en/neumann-thomas) and his group. If Thomas's name does not ring a bell, let me explain his weight in the field like this. As the joke goes, in the field of DBMSs: there are gods at the top, then there is Thomas Neumann, and then other holy people, and then we mere mortals.

---

# What Every Competent GDBMS Should Do (aka The Goals & Vision of Kùzu)

*By [Semih Salihoğlu](https://cs.uwaterloo.ca/~ssalihog/), January 12, 2023*
As a co-implementor of the Kùzu GDBMS and
a professor at the University of Waterloo,
I have been thinking of GDBMSs day in and day out for many years now.
After years of understanding and publishing on the architectural principles
of graph data management ([1](http://www.vldb.org/pvldb/vol12/p1692-mhedhbi.pdf),
[2](https://www.vldb.org/pvldb/vol14/p2491-gupta.pdf),
[3](https://www.vldb.org/pvldb/vol15/p1011-jin.pdf),
[4](https://www.vldb.org/pvldb/vol15/p1533-chen.pdf)),
we decided to develop
[Kùzu](https://github.com/kuzudb/kuzu) as a state-of-the-art modern embeddable GDBMS.
This post covers my broad opinions on GDBMSs and the feature set they should
optimize for and why. In doing so, it also gives an overall vision of Kùzu!

:::tip Tldr: The key takeaways are:
- **Overview of GDBMSs**: GDBMSs are relational in their cores but offer an elegant graph model
  to model application data and SQL-like query languages with elegant
  graph-specific syntax. Many applications, e.g., in [fraud detection](https://tinyurl.com/3x89ceum),
  [recommendations](https://www.tigergraph.com/solutions/recommendation-engine/), and
  [personalization](https://tinyurl.com/3z9bckmm), benefit from such modeling and query language features.
- **Key Feature Set of GDBMSs**: Despite being relational, GDBMSs optimize (or at
  least they should!) for a distinct set of
  features/use cases that RDBMSs do not traditionally optimize for: (i) pre-defined/pointer-based joins;
  (ii) growing many-to-many joins;
  (iii) recursive joins;
  (iv) schema querying;
  (v) efficient storage of semi-structured data and URIs.
  GDBMSs that want to be competitive in terms of performance
  need to perfect this feature set, and that's exactly what Kùzu aims to do!
- **Kùzu as the GDBMS for Graph Data Science**:
  One example application domain the Kùzu team is excited about is
  to be a usable, efficient, and scalable GDBMS for graph data science in the Python graph analytics ecosystem.
  Here we are looking at how DuckDB revolutionized tabular data science and
  want to repeat it in graph data science!
:::

This week, I presented Kùzu to the database community at the [CIDR 2023](https://www.cidrdb.org/cidr2023/papers/p48-jin.pdf)
conference in Amsterdam. For those who are not familiar with academic database conferences,
CIDR brings together work from academia and industry to discuss recent research on
systems aspects of database technology.
Our paper was about Kùzu's
goals and vision and its core query processor design for evaluating complex growing joins.
We intentionally targeted CIDR for our paper because of its systems
focus and we thought many systems gurus would be there: the attendees included
creators of [MonetDB](https://www.monetdb.org/), [Vectorwise](https://en.wikipedia.org/wiki/Vectorwise),
[DuckDB](https://duckdb.org/),
[Snowflake](https://www.snowflake.com/en/), and [Databricks](https://www.databricks.com/), amongst others. It also meant a lot to share
our ambitious goal of developing a usable GDBMS from an academic setting at this CIDR because
it was organized locally by CWI. The late [Martin Kersten](https://en.wikipedia.org/wiki/Martin_L._Kersten)
founded the CWI database group and was a pioneer of this style of research project, and
his successors are continuing this tradition very successfully today.
CWI has created many successful DBMSs, including MonetDB (Martin's legacy), Vectorwise, and
most recently DuckDB. People paid their respects to Martin during an emotional memorial
on the first night of the conference.
As a surprise, [MemGraph](https://memgraph.com/) co-founder and CTO [Marko Budiselić](https://www.linkedin.com/in/markobudiselic/)
was also there (it was his first CIDR)! Marko is an extremely friendly
and humble person you should meet, and it was great to share our insights about where GDBMSs make a difference in
enterprise applications.

I want to start a 3-part blog series to cover the contents of our CIDR paper in a less academic language:
- Post 1: Kùzu's goals and vision as a system
- Post 2: [Factorization technique for compression](../2023-01-20-factorization/index.md)
- Post 3: [Worst-case optimal join algorithms](../2023-02-22-wcoj/index.md)

In this Post 1, I discuss the following:
(i) [an overview of GDBMSs](#overview-of-gdbms-and-a-bit-of-history);
(ii) [the features GDBMSs should optimize for and why](#features-every-competent-gdbms-should-optimize-for-); and
(iii) [an example application domain (graph data science!) we are immediately targeting with Kùzu](#kùzu-as-a-gdbms-for-graph-data-science-pipelines).
(ii) and (iii) should give you a good idea about the current goals and
vision of Kùzu. If you know GDBMSs well, you should skip over (i).

## Overview of GDBMSs and a Bit of History
In one sentence, GDBMSs are read-optimized analytical DBMSs for modeling and querying application
data as a graph. As a consequence, they are optimized for fast querying of node and
relationship records.
Modern GDBMSs, such as Neo4j, Tigergraph, MemGraph, or Kùzu,
adopt the [property graph data model](https://neo4j.com/developer/graph-database/#property-graph)
(or its variants), where you can model your records as a set of labeled nodes and
edges/relationships, with key-value properties on these nodes and relationships. When
I say GDBMSs in this post, I specifically refer to the systems that adopt this
model, but I will also discuss [RDF systems](https://en.wikipedia.org/wiki/Triplestore) (aka triplestores)
here and there, which are also DBMSs that adopt a graph-based model.

Here's a side comment that I have to make because I'm a professor and
professors are always ready to profess.
DBMSs based on graph models are anything but new.
They have existed even before the relational
model: DBMS die-hards love remembering
that the [IDS system](https://en.wikipedia.org/wiki/Integrated_Data_Store) from the 1960s was based on the "network model",
which is just another term for graph. IDS was led by the amazing
Charlie Bachmann ([1](https://amturing.acm.org/award_winners/bachman_9385610.cfm),
[2](https://youtu.be/iDVsNqFEkB0), [3](https://youtu.be/jByIpJNrm50)), who is credited for inventing DBMSs[^1].
If you click on [this 1962 ad of the IDS system](http://wp.sigmod.org/wp-content/uploads/2012/12/image4.jpg), you will see a graph of node and
edge records. Note that the 1960s are pre-relational times. Ever since, every decade has seen a surge of DBMSs
that adopted a graph-based model, with mixed levels of adoption success:
the hierarchical model, XML, and RDF are examples.
In my view, of these, current property GDBMSs have the most generic model, suitable
for modeling a very broad range of application data,
which is probably why they have established themselves most successfully.
There is a very fundamental reason why graph-based DBMSs have always existed and will
always exist: graphs and tables are the two most natural and generic abstract data structures
for modeling application data. It's no surprise they were the first two proposed data models
when the field of DBMSs was born; both have existed ever since and will continue to exist.

Back to property GDBMSs. How about their query languages? They support SQL-like high-level
query languages with graph-specific syntax.
I call them "graph-specific" SQL. Let's look at a query snippet. Assume this is
on a database that models a set of financial "accounts" and money "transfers"
between accounts:

```
MATCH (a:Account)-[e:Transfer]->(b:Account)
WHERE a.name = 'Alice'
RETURN b.ID
```
This is a query expressed in Cypher. Instead of SELECT/FROM/WHERE,
you are looking at MATCH/WHERE/RETURN.
If intelligent Martians saw Cypher and SQL, their immediate reaction
would not be to notice the minor syntactic differences but instead
the fundamental similarities: their clauses describe joins,
filters, projections, group bys and aggregates, and other relational
operations that process sets of tuples.
There are of course syntactic differences that are important. The query languages of
GDBMSs adopt graph-specific syntax that is often very elegant for expressing several computations.
For example, the arrow syntax ((a)-[e]->(b)) in Cypher describes joins between node records. This
is much more elegant than listing the names of tables that model
node records in a FROM clause, with a complex WHERE clause.
Much more importantly, they adopt very elegant and direct syntax,
such as the Kleene star "\*", to
express recursive queries. Expressing recursive computations with vanilla SQL is
objectively harder. I'll come to recursive queries later.

Now get ready for a blasphemous observation: *GDBMSs are relational at their cores!*[^2]
Well, OK, anyone who has studied the principles of DBMSs knows there is nothing
blasphemous here, because GDBMSs actually have to be relational
because of this simple fact:
*the only known practical way to implement declarative high-level
query languages is to compile them to relational operators that
take in and output sets of tuples*. Run "EXPLAIN" on any of your
queries in your favorite GDBMS (or RDF system) and look at the query plans, and
you will see joins, scans, filters, projections, group bys, unions,
intersections, etc. You might see some graph-specific operators,
but they will also be processing sets of tuples. That was the primary
observation of [Ted Codd](https://en.wikipedia.org/wiki/Edgar_F._Codd) when he proposed
that data management should be done by systems implementing
relational operators that process sets of tuples.
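If the Martian argument feels abstract, here is the 1-hop Cypher query above written out as the relational operations a processor would actually run: scan, filter, join, project. The toy tables and the nested-loop joins are inventions of mine for illustration (a real system would use hash joins over vectorized columns), but the shape of the computation is exactly this:

```
# Node and relationship records as plain tables (invented toy data).
accounts = [
    {"ID": 0, "name": "Alice"},
    {"ID": 1, "name": "Bob"},
    {"ID": 2, "name": "Carol"},
]
transfers = [
    {"src": 0, "dst": 1},   # Alice -> Bob
    {"src": 0, "dst": 2},   # Alice -> Carol
    {"src": 1, "dst": 2},   # Bob -> Carol
]

# MATCH (a:Account)-[e:Transfer]->(b:Account)
# WHERE a.name = 'Alice' RETURN b.ID
result = [
    b["ID"]
    for a in accounts if a["name"] == "Alice"   # scan + filter on a
    for e in transfers if e["src"] == a["ID"]   # join a with its Transfer edges
    for b in accounts if b["ID"] == e["dst"]    # join the edges with b
]
print(result)  # [1, 2]
```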
Type \\"Explain\\" to any of your\\nqueries in your favorite GDBMs (or RDF system) and look at their query plans and\\nyou will see joins, scans, filters, projections, group bys, unions,\\nintersections, etc. You might see some graph-specific operators\\nbut they will also be processing sets of tuples. That was the primary\\nobservation of [Ted Codd](https://en.wikipedia.org/wiki/Edgar_F._Codd) when he proposed\\nthat data management should be done by systems implementing\\nrelational operators that process sets of tuples. \\n\\nBut don\'t worry, I do love GDBMSs and you should too! The fact that at their cores \\nGDBMSs are relational doesn\'t mean they don\'t offer value beyond RDBMSs.\\nDBMSs are very complex software systems and they make a ton of design tradeoffs in terms of\\nwhat they optimize for. There is a very distinctive set of technical features that \\nGDBMSs should optimize for and excel in, where RDBMSs and SQL traditionally don\'t.\\nThis feature set is exactly what \\nK\xf9zu aims to perfect over time, which is what I hope to articulate in this post.\\nIn short: GDBMSs do offer a ton of value if \\nthey are architected correctly and every software engineer should know \\nabout GDBMSs[^3].\\n\\n## Features Every Competent GDBMS Should Optimize For [^4]\\nHere is a list of features that differentiate GDBMSs from RDBMSs and GDBMS should\\nhighly optimize for and support.\\n\\n### Feature 1: Pre-defined/pointer-based Joins\\nThis is perhaps the most ubiquitously adopted technique in GDBMSs that is ubiquitously missing in RDBMSs. \\nAlthough GDBMSs\\ncan join arbitrary node records with each other, most common user queries in GDBMSs\\njoin node records with their \\"neighbors\\". A GDBMS knows about these\\nneighbor node records because they are predefined to the system as relationships.\\nSo GDBMSs universally exploit this and optimize for these types of joins. For example,\\nalmost universally they all create a **join index** (aka an adjacency list index)[^5].\\nHere\'s a demonstrative example showing a \\"forward\\", i.e., from src to dst, join index:\\n\\n\\n\\n\\nNote that the join index does not store the actual data values, which\\nare strings (e.g., \\"Ali\\", \\"Noura\\", etc.) in the example. Instead, \\nit stores dense system level node record IDs.\\nAs a result, GDBMSs can be fast on these joins because they can use: (1) the join index;\\nand (2) dense integer IDs to joins (instead of, say running string equality conditions). \\n\\n### Feature 2: Many-to-many Growing Joins\\nIn many application data stored on GDBMSs, node records\\nhave many-to-many relationships with each other. Think of any data as a graph, \\nsay a network of financial transactions or who bought which items or\\nwho is friends with whom. In many of these datasets, an entity/node connects with \\nmany other nodes. In addition, many of the killer apps of GDBMSs search for complex patterns\\non these relationships. \\nA classic example we like using is a Twitter friend recommendation engine that is looking for diamond patterns to implement\\nthe following rule: If a user A follows two users B and C, who both follow D, recommend\\nD to A. This is the pattern:\\n\\n
### Feature 2: Many-to-many Growing Joins
In much of the application data stored in GDBMSs, node records
have many-to-many relationships with each other. Think of any data as a graph,
say a network of financial transactions, or who bought which items, or
who is friends with whom. In many of these datasets, an entity/node connects with
many other nodes. In addition, many of the killer apps of GDBMSs search for complex patterns
on these relationships.
A classic example we like using is a Twitter friend recommendation engine that looks for diamond patterns to implement
the following rule: if a user A follows two users B and C, who both follow D, recommend
D to A. This is the pattern:

*(Figure: diamond pattern.)*
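To see why such pattern searches are "growing" joins, here is a brute-force Python search for the diamond pattern over a tiny invented follower graph (a real system would evaluate this as a multi-way join; factorization and worst-case optimal joins, discussed below, exist precisely to tame the intermediate result sizes):

```
# Invented follower data: user -> set of users they follow.
follows = {
    "A": {"B", "C"},
    "B": {"D"},
    "C": {"D", "E"},
    "D": set(),
    "E": set(),
}

# Diamond: A follows B and C (B != C), and both B and C follow D.
diamonds = [
    (a, b, c, d)
    for a, outs in follows.items()
    for b in outs
    for c in outs if b < c                # avoid symmetric duplicates
    for d in follows[b] & follows[c]      # common followee of b and c
]
print(diamonds)  # [('A', 'B', 'C', 'D')] -> recommend D to A
```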
The whitepapers of existing GDBMSs are full of these patterns, e.g., branching trees, money laundering circles,
cliques of customers who buy similar items, etc. These correspond to complex
many-to-many joins, which by their nature are growing. If on average each of your nodes
connects with k other nodes and you have t relationships in the pattern you are searching for,
you are asking a system to search through k^t possible combinations, and guess what: exponential
functions are scary. We have been advocating the integration of 2 specific techniques
into the query processors of GDBMSs for several years now: (i) factorization; and (ii) worst-case optimal joins.
Both of these techniques are specifically designed for
many-to-many growing joins and we have integrated them in Kùzu. Stay tuned for my next two posts on this.

### Feature 3: Recursive Join Queries
This is probably the most obvious feature in which GDBMSs should excel. First, objectively,
the query languages of GDBMSs have much better support
for recursive join queries than SQL. Consider this query on our previous financial transaction network
example: "Give me all direct or indirect money flows into Alice's account from Canada." Now
look at this elegant way to ask this in Cypher using the Kleene star '\*':
```
MATCH (a:Account)-[:Transfer*]->(b:Account)
WHERE a.country = 'Canada' and b.name = 'Alice'
RETURN a.ID
```

Similar to regexes, '\*' represents 1 or more repetitions of the Transfer
edge in the join. So the join could be a direct join between (a) and (b), or a 2-hop one,
or a 3-hop one, etc. You can do this with SQL of course, but it's objectively harder. Recursion
was an afterthought when standardizing SQL. It came 20 years after SQL standardization started and is really a hack.
In contrast, recursion has been a first-class-citizen
feature in every graph-based DBMS's query language.
This distinction is even more visible
if you want to do other graph-specific recursive computation, such as finding shortest paths.
In Kùzu, we are starting to work on implementing
and optimizing recursive query support, and we hope to have first a basic version and
then optimized versions that hopefully work very well and contribute to the principles of how these
queries should be evaluated.
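What the Kleene star asks for underneath is a transitive closure, i.e., a fixpoint computation. Here is a hedged sketch, on invented toy data, of what the query above boils down to, written as a BFS over reversed Transfer edges (real engines evaluate this with specialized recursive operators rather than this literal traversal):

```
from collections import deque

# Invented toy data: Transfer edges (src -> [dst]) and a country per account.
edges = {"A1": ["B1"], "B1": ["ALICE"], "C1": ["ALICE"], "ALICE": []}
country = {"A1": "Canada", "B1": "Germany", "C1": "Canada", "ALICE": "UK"}

# Reverse the graph so we can walk *into* Alice's account.
reverse = {v: [] for v in edges}
for src, dsts in edges.items():
    for dst in dsts:
        reverse[dst].append(src)

# BFS from ALICE over reversed edges = accounts with a 1-or-more-hop
# Transfer path into ALICE, i.e., the -[:Transfer*]-> part of the query.
seen, queue = set(), deque(["ALICE"])
while queue:
    v = queue.popleft()
    for src in reverse[v]:
        if src not in seen:
            seen.add(src)
            queue.append(src)

# The WHERE clause: keep only the Canadian source accounts.
print(sorted(a for a in seen if country[a] == "Canada"))  # ['A1', 'C1']
```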
### Feature 4: Schema Querying
Another important feature of GDBMSs that cannot be replicated in
RDBMSs is that their query languages allow querying the schema of a database in addition
to the data in the database. Suppose that in a modified financial transaction network
there are three relationship types: Wire, Deposit, and ETransfer, and
you wanted to search for a path where the first edge's and the second edge's types
are different. Note that the predicate is *on the schema*, specifically on the type
of the nodes/relations. You can write the following query:
```
MATCH (a:Account)-[e1]->(b:Account)-[e2]->(c:Account)
WHERE type(e1) != type(e2)
RETURN *
```

Something akin to this cannot directly be done in SQL. One would have to write a query
that unions many sub-queries: one that joins node records over Wire and then Deposit,
another on Wire and ETransfer, another on Deposit and then Wire, etc. This will be
messy. The ability to *not* specify a label on relationships,
specifically on e1 and e2, is an
elegant way to effectively express such unions of join queries.
It says: "join a and b nodes over every possible relationship".
The `type()` function on these variables allows querying over the schema.

### Feature 5: Semi-structured Data and URI-heavy Datasets (e.g., "Knowledge Graphs")
An important application domain of GDBMSs
is "knowledge graphs". This term means different things
in different contexts, and I'll take it
to refer to highly heterogeneous datasets that are
often naturally modeled as RDF triples. Again, I don't want to go into the
details of this model, but I assume many readers will already be familiar with
RDF. RDF is a simple data model where data is represented as (subject, predicate, object)
triples that represent facts about a domain. A great application is modeling and
querying encyclopedic facts, such as those extracted from Wikipedia data.
For example, the following triple stores the fact
that Justin Trudeau is married to Sophie Trudeau:
(http://dbpedia.org/resource/Justin_Trudeau, http://dbpedia.org/ontology/spouse,
http://dbpedia.org/resource/Sophie_Grégoire_Trudeau).
There are 2 immediate challenges for a DBMS to manage
such data:
1. Structuring such datasets is very difficult. Structuring here
refers to designing a relational schema for the data.
Entities can have many types, e.g., Justin Trudeau has "rdf:type"
http://dbpedia.org/ontology/Person as well as
http://dbpedia.org/ontology/Politician. Further, within a single type, entities can have many different
and distinct properties, so good luck coming up with and maintaining a relational
schema for all that.
This is a direct result of
the overly ambitious domain the dataset is modeling: all encyclopedic human knowledge!
You need a data model that allows flexibility in what can be associated with entities
and their types[^6].

2. The long strings used to identify entities, e.g., Justin
Trudeau, are called URIs (uniform resource identifiers),
and queries will frequently access and specify them. So systems should
be competent in handling those.

GDBMSs tend to support semi-structured schemas, and RDF systems certainly
have good techniques to handle URIs.
These applications are directly in the realm of graph-based DBMSs.
Currently, they are directly targeted by RDF systems, but I'm convinced
GDBMSs should also implement techniques to efficiently support them[^7].

**Final note on the above feature set:** I referred to several classic applications, but
many other applications require and benefit
from the above feature set. One can
think of the datasets and workloads of these applications as the "beyond relational/SQL" datasets/workloads, which
often require modeling and querying in a graph-based DBMS, and
we want Kùzu to excel in and represent the state of the art in this feature set!
## Kùzu as a GDBMS for Graph Data Science Pipelines

Finally, let me tell you a little bit about
a particular application domain we are currently excited
about and want to see Kùzu used in: graph data science in the Python ecosystem!
This figure from my CIDR slides describes this vision pictorially:

![Kùzu as a GDBMS for Graph Data Science Pipelines](./kuzu-as-gdbms-of-gds.png)

Suppose you are building a graph analytics, machine learning, or visualization
pipeline from raw record files on disk. You will want to model your raw records
as nodes and edges, clean them, extract features, query them, transform them,
and then extract data to an upstream Python library, such as PyTorch Geometric, DGL,
NetworkX, or a graph visualization library.
You might even want a pipeline
that extracts regular tables from your graphs into a tabular data science library,
such as NumPy,
since the outputs of Cypher queries are tables of records.
We want people to use Kùzu as an embeddable library in their Python scripts,
to do their modeling, querying, feature extraction,
cleaning, and other transformations, all while benefiting from a high-level query language
and the state-of-the-art graph data management techniques
that we are implementing. This is exactly what DuckDB did for tabular data science/analytics.
We are looking at DuckDB here and want to fill the same gap for graph data science/analytics!
We are currently understanding the ecosystem better and appreciate feedback
and suggestions for features we should implement to enable your workloads.
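To make that pipeline concrete, here is roughly what the embedded usage looks like from a Python script. The schema, file names, and query are invented for illustration, and the API sketch follows Kùzu's Python client at the time of writing (Database/Connection/execute), so check the current docs before copying it:

```
import kuzu

# Embedded and serverless: the database is just a directory on disk.
db = kuzu.Database("./transfer_db")
conn = kuzu.Connection(db)

# Model raw files on disk as a graph...
conn.execute("CREATE NODE TABLE Account(ID INT64, name STRING, PRIMARY KEY (ID))")
conn.execute("CREATE REL TABLE Transfer(FROM Account TO Account, amount DOUBLE)")
conn.execute('COPY Account FROM "accounts.csv"')
conn.execute('COPY Transfer FROM "transfers.csv"')

# ...query it with Cypher, and hand the resulting table of records
# to the tabular/graph data science ecosystem.
result = conn.execute(
    "MATCH (a:Account)-[t:Transfer]->(b:Account) "
    "RETURN a.ID, b.ID, t.amount"
)
df = result.get_as_df()   # a pandas DataFrame, ready for NumPy & friends
print(df.head())
```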
OK, this is it for now. In the next two blog posts, I will discuss
factorization and worst-case optimal join algorithms and describe
some of the principles that we adopted in Kùzu's query processor.
Until then, happy new year from the cold but cozy winter of 🇨🇦,
and [pip install kuzu](https://github.com/kuzudb/kuzu)!

[^1]: Interestingly, Bachmann is one of a handful of Turing laureates without any academic career. If you love DBMSs, [listen to this talk](https://youtu.be/iDVsNqFEkB0) where he remembers his IDS days! Amusingly, he also talks about how he didn't know who Turing was when he got the Turing Award, and how he met Turing's mother in England for tea 😀.

[^2]: When I say GDBMSs here, I'm referring to the core engines that implement the high-level languages of these systems and not the analytics libraries (e.g., [1](https://neo4j.com/product/graph-data-science/), [2](https://memgraph.com/mage)) above these core engines that run iterative graph analytics computations, such as finding connected components, PageRank, or betweenness centrality. These computations are better understood through either direct graph formalisms or linear algebra (and not relational) operations.

[^3]: I am a strong supporter of devoting a few lectures to GDBMSs after covering the fundamental topics on the relational model and RDBMSs in core introductory DBMS courses in undergraduate curricula. Students should broaden their perspectives on the data models and query/programming languages available to them when they develop applications. GDBMSs are an obvious choice here. So are Datalog and RDF/SPARQL.

[^4]: We articulated this list of features in our CIDR 2023 paper. Incidentally, [a paper](https://www.cidrdb.org/cidr2023/papers/p66-wolde.pdf) written by CWI on a graph query extension to DuckDB had a 12-item list of "techniques" that GDBMSs should implement at their cores. Let me call this the CWI list. These items are not features in the sense I'm using the word, so I call them techniques. As you'll see, my features are higher-level system properties from the user's perspective. Peter Boncz, who is renowned in the field for having written or advised many successful DBMSs that spun off, presented the CWI paper. I highly recommend it as another reading if you want to know more about Peter and his co-authors' technical insights about how GDBMSs should be architected. Importantly, Kùzu has integrated or is in the process of integrating 11 of the 12 techniques in the CWI list (bulk path finding is the one we have to do more thinking on), and our prior publications had also articulated many of these insights, such as the fact that [GDBMSs should be columnar systems](https://www.vldb.org/pvldb/vol14/p2491-gupta.pdf) doing vectorized querying; of course, we also did a ton of work on [worst-case optimal joins](https://www.vldb.org/pvldb/vol12/p1692-mhedhbi.pdf) and [factorization](https://www.cidrdb.org/cidr2023/papers/p48-jin.pdf), which are also on the CWI list. I should acknowledge that Peter had been advocating for some of the techniques on the CWI list at least since 2018. I remember a presentation he gave in 2018 to GDBMS researchers and developers titled "Why are Existing GDBMSs Incompetent?", which listed some of the techniques in the CWI list and visibly inspired the title of this blog.

[^5]: Although some refer to these as an "adjacency list index" because that's a common term in graph terminology, I need to pay my respects to the giants in the field: these are plain old [1980s Valduriez join indices](https://dl.acm.org/doi/abs/10.1145/22952.22955). And no, they were invented in the context of RDBMSs. That said, they never found much adoption in RDBMSs. But they are almost universally adopted in GDBMSs.

[^6]: Designing the schema, i.e., defining the types of entities and relationships and the class structures and constraints of such complex domains, can be decades of work. What I'm referring to as a schema is called an "ontology" in the knowledge graph/semantic web space. If you ever thought you modeled a hard application domain, take a look at [SNOMED](https://en.wikipedia.org/wiki/SNOMED_CT), which is a decades-long effort to model and standardize human medical knowledge. Last term, I had a seminar on SNOMED in my graduate course on knowledge graphs, and students were baffled by the complexity of this "ontology", which describes the types of entities and their relationships and constraints, something the RDF technology stack is quite good at representing.

[^7]: Before we released Kùzu, we had support for adding arbitrary node/edge properties, but we removed a large chunk of code from the system to release a thinner code base. So currently you need to specify a schema for your nodes and relationships in Kùzu. We will wait and see if/when that demand comes and how strongly it comes. We know from our conversations with many users and developers of GDBMSs over the years that most datasets in enterprises are not this complex and can be structured. At least after a proof-of-concept phase of applications, developers structure their data.
---

# Meet Kùzu 🤗

*By the Kùzu Team, November 15, 2022*

Today we are very excited to make an initial version of [Kùzu public on github](https://github.com/kuzudb/kuzu)!
Kùzu is a new embeddable property graph database management system (GDBMS) that is
designed for high scalability and very fast querying. We are releasing
Kùzu today under a permissive MIT license. Through years of research on GDBMSs, we observed a lack of
highly efficient GDBMSs in the market that adopt state-of-the-art
querying and storage techniques and that can very easily integrate into applications,
similar to DuckDB or SQLite. Kùzu aims to fill this space and evolve into the
go-to open-source system for developing
graph database applications, e.g., to manage and query your knowledge graphs,
and for developing graph machine learning and analytics pipelines,
e.g., in the Python data science ecosystem.

Kùzu's core architecture is informed by 6 years of research we conducted
at the University of Waterloo on an earlier prototype GDBMS called [GraphflowDB](http://graphflow.io/).
Unlike GraphflowDB, which was intended to be a prototype for our research, Kùzu aims to be
a usable, feature-rich system. Some of the primary features of Kùzu's architecture are:
 - Flexible property graph data model and Cypher query language
 - Embeddable, serverless integration into applications
 - Columnar disk-based storage
 - Columnar sparse row-based (CSR) adjacency list/join indices
 - Vectorized and factorized query processor
 - Novel and very fast join algorithms
 - Multi-core query parallelism
 - Serializable ACID transactions

What we are releasing today includes many of the features of the core engine. This is what we
call "Phase 1" of the project. In the upcoming "Phase 2" of the project, as we continue adding
more features to the core engine, e.g., better support for ad-hoc properties, string compression,
and support for new recursive queries, we will also focus on developing around the core engine
to more easily ingest data into the system and output data to downstream data science/graph data science
libraries. You can keep an eye on our tentative [roadmap here](https://github.com/kuzudb/kuzu/issues/981).
You can also read more about some of our longer-term goals and vision as a system
in [our new CIDR 2023 paper](https://cs.uwaterloo.ca/~ssalihog/papers/kuzu-tr.pdf),
which we will present in Amsterdam next January.
*And most importantly, please start using Kùzu, tell us your feature requests and use cases, and report bugs.
We can evolve into a
more stable, usable, and feature-rich system only through your feedback!*

We are looking forward to your feedback and a long and exciting journey as we continue developing Kùzu 🤗.

*ps: For interested readers: the word kù-zu is the Sumerian (the oldest known human language) word for "wisdom".*
b9",6725:"12892875",6726:"6d69c0e6",6798:"2b56989c",6920:"600b14f4",6935:"f5fe6467",6980:"b288c6c0",7007:"c4f57f6b",7083:"3552a42a",7131:"8439a13f",7286:"b60b3870",7318:"c00cc653",7414:"393be207",7448:"15d439f7",7584:"45e84ef6",7670:"2fee3f72",7808:"c03b8e47",7812:"2454f074",7822:"74ce1caf",7832:"dfbda8c6",7911:"0ce32e27",7915:"54c0b49f",7918:"17896441",7920:"1a4e3797",7938:"4cd00e1e",7954:"2179fbec",7980:"9a80955e",7990:"eec169e3",8061:"52ae103f",8087:"95917782",8247:"84f491d1",8307:"1c414186",8314:"c261d968",8378:"4dbb5cef",8413:"014961b5",8457:"d98bb358",8481:"8dd38787",8506:"1c93e70c",8540:"db27caff",8543:"4ea2d68f",8610:"6875c492",8705:"136ef751",8740:"00420783",8792:"486eabc2",8808:"d83b94ca",8986:"7f9f8af7",8998:"c25224f3",9020:"62a2d561",9089:"a816b6ed",9099:"0a63d782",9163:"c67164f2",9202:"b2882d63",9236:"c8703b92",9295:"c138efc9",9308:"0d56c7e0",9321:"9e1b3fde",9330:"2202aca0",9342:"443a3e0b",9417:"87c056f1",9486:"bcc3466e",9514:"1be78505",9524:"9ef6215d",9545:"e3a34105",9574:"50f245fe",9600:"2ee28b18",9628:"fa4d8017",9642:"2a3755a4",9650:"6224aa6e",9679:"2ca2b673",9738:"4c663521",9817:"14eb3368",9867:"68afb7ee",9998:"9bfe107e"}[e]||e)+"."+{53:"31270f88",157:"0548357c",187:"63ee3ddf",413:"50e443c4",541:"79f7cc4a",568:"946a5aa8",743:"277527ba",768:"99c848f4",774:"7f9f2229",777:"8b6990b4",813:"873daa33",915:"b44bc145",969:"e32da5a1",1045:"04fea0ce",1130:"52b5bce3",1242:"43298e93",1267:"6551c492",1275:"22c2d533",1392:"eef7e8ae",1424:"343987b4",1426:"bd368343",1572:"f1303699",1593:"f3c2cf05",1613:"55f47980",1628:"2b24a8e2",1733:"ceb7c2ac",1871:"ce4ef45d",1881:"81b27725",1920:"22bace62",1965:"9482e834",2053:"9ef6e43a",2161:"0960a62e",2255:"35618ea1",2293:"c554096b",2392:"9be18417",2443:"7d630201",2529:"631df343",2535:"e0d6a87a",2563:"c280c55c",2611:"70d4c834",2618:"8380fee8",2753:"293dd476",2757:"9cbbf00b",2953:"5df4145d",2958:"0aa99093",3085:"529b20a1",3089:"2ef77735",3093:"70b2bccd",3128:"37538281",3137:"c4411917",3208:"66a90f38",3213:"3c2a739f",3217:"952c812d",3353:"96129409",3536:"356f328b",3608:"14fbac47",3717:"84edbc49",3766:"4025eb40",3793:"f62581f8",3943:"63d91510",4013:"7cef694c",4024:"e4b9accb",4071:"304e4ebc",4087:"09286f36",4195:"3c56c6d8",4201:"f69897ac",4281:"ec2f46ca",4299:"cd0a5eec",4325:"782137ac",4394:"03da8bf7",4428:"b773e38c",4435:"800c306c",4467:"7247b323",4517:"46ad6723",4668:"26b3370f",4699:"2507f939",4713:"c4c24fee",4816:"d6591275",4922:"83c9d7bd",4972:"81dca464",5003:"c7f1af6f",5042:"16660408",5149:"874f8cad",5419:"5110167a",5452:"e90ce975",5458:"9cc9fec0",5488:"415d092e",5496:"047b8e80",5608:"878482be",5618:"4eb5ea06",5621:"d5b8110e",5629:"a6cb49da",5721:"12dd0895",5730:"18e8c522",5873:"fccbbbc3",6007:"de586378",6100:"10a467a2",6103:"17c80ef1",6206:"e727cadb",6228:"24f53033",6245:"2b751631",6328:"310363f2",6329:"c8defe66",6338:"4d7e33fb",6339:"24af9556",6375:"b063dd9a",6499:"0250dd1d",6568:"43d5fc58",6575:"4e3d2db9",6648:"cdfc9d76",6672:"09a9911e",6725:"4a6841c1",6726:"ed90f9cf",6798:"7c0518fa",6920:"44bb60a5",6935:"b9ce7ba2",6945:"6b5ed935",6980:"0f8fcd4a",7007:"55f2c314",7083:"fd8a42f9",7131:"c6e535b4",7286:"51e728df",7318:"79506202",7414:"a95f6891",7448:"3d8dd7ed",7584:"a7b81da1",7670:"faaddfb9",7808:"4fd40e95",7812:"ad88893d",7822:"4a980b7d",7832:"5c97d8ad",7911:"0457add0",7915:"4fe4f984",7918:"d4d7071c",7920:"ebb58ea4",7938:"fd0eea33",7954:"c061f84b",7980:"87972128",7990:"54ec64d0",8061:"45a029f0",8087:"859b774e",8247:"73eec081",8307:"97326d83",8314:"7392c4c4",8378:"6c1b9298",8413:"301b52fd",8457:"82985411",8481:"8fa71c18",8506:"ec4f496f",8540:"9a311858",
8543:"2a5bf9c4",8610:"bf5bfe2b",8705:"b4425820",8718:"4664a626",8740:"c6603f1f",8792:"4a1f22e4",8808:"38e9fc21",8894:"bba49498",8986:"1d846aa7",8998:"5ac2e2dc",9020:"b381762f",9089:"21a120ea",9099:"ce9f5221",9163:"975d3b63",9202:"9763cd91",9236:"cd4ac66b",9295:"ce2a7863",9308:"a5ec27fb",9321:"5559e988",9330:"921ff3d4",9342:"f4cf1d1e",9417:"7555aa7b",9486:"51cdfdfe",9514:"57e490d5",9524:"365de8ae",9545:"879d0c09",9574:"3d1750e8",9600:"563fc4a2",9628:"54fe438e",9642:"1a091192",9650:"bd82cf1f",9679:"bdc8b605",9738:"13ab307e",9817:"a1667fa6",9867:"860398e8",9998:"c6fb4784"}[e]+".js",r.miniCssF=e=>{},r.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||new Function("return this")()}catch(e){if("object"==typeof window)return window}}(),r.o=(e,a)=>Object.prototype.hasOwnProperty.call(e,a),b={},f="kuzu-docs:",r.l=(e,a,c,d)=>{if(b[e])b[e].push(a);else{var t,o;if(void 0!==c)for(var n=document.getElementsByTagName("script"),i=0;i{t.onerror=t.onload=null,clearTimeout(s);var f=b[e];if(delete b[e],t.parentNode&&t.parentNode.removeChild(t),f&&f.forEach((e=>e(c))),a)return a(c)},s=setTimeout(l.bind(null,void 0,{type:"timeout",target:t}),12e4);t.onerror=l.bind(null,t.onerror),t.onload=l.bind(null,t.onload),o&&document.head.appendChild(t)}},r.r=e=>{"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},r.p="/docusaurus/",r.gca=function(e){return e={12892875:"6725",14394921:"187",17896441:"7918",76225974:"6100",95917782:"8087","935f2afb":"53",ba3a79a7:"157","6a9aaa03":"413",e1873644:"541","2bb74d03":"568",e3b75a28:"743","60322c79":"768","680c6196":"774",ac932941:"777","5a5243b8":"813","38f6e282":"915",cb288627:"969","733349e1":"1045","97ecaa70":"1130",e82e3918:"1242",b6b94609:"1267","421846b6":"1275","81ecf48f":"1392","1099da71":"1424","09e31552":"1426","5d9e877a":"1572","206b0986":"1593",bf001a2b:"1613","50a61d48":"1628","860cdaa1":"1733","46cdbd44":"1871","766ebba1":"1881","5e031480":"1920","01a12966":"1965",d5ac0750:"2053","7272fd22":"2161",ae022e6e:"2255",ef1edfe5:"2293","140a1541":"2392","96061b27":"2443","814f3328":"2535","7c1ce3b5":"2563",b48352fa:"2611",bf0d52dc:"2618","3fd57b1b":"2753",b5e2882b:"2757","521f3440":"2953",a59adfdb:"2958","1f391b9e":"3085",a6aa9e1f:"3089","50e4e898":"3093",f0d9f96b:"3128",ea9ac021:"3137","65db24e6":"3208","5b866286":"3213","3b8c55ea":"3217",cac7aedd:"3353",a40e6f53:"3536","9e4087bc":"3608","0ae8bb96":"3717","7298f630":"3766","48284b96":"3793",beadeff4:"3943","01a85c17":"4013",e0fb7a76:"4024","9c1d8c5d":"4071","8da1f996":"4087",c4f5d8e4:"4195","4feabb6b":"4201","0b62a8cb":"4281","6e4c703c":"4299",ca893e4c:"4325","17a2b2f4":"4394","377613b6":"4428",b7c79da1:"4435","3b6ebf87":"4467",a14cea2e:"4517","3eae62bb":"4668",ce9a3510:"4699","93e1f41b":"4713","7ca86fab":"4816",e506623f:"4922",f0c051b5:"5003",a83145c4:"5042","71e11f7b":"5149","35f763da":"5419",e34dde7f:"5452","72c409f7":"5458",d10979fd:"5488","9d2cf940":"5496","738e78f6":"5608","84488bce":"5618","8149678f":"5621","70f55a58":"5629",e9303b53:"5721","196c648d":"5730","292a88ab":"5873",a1f7b462:"6007",ccc49370:"6103",abc27f4f:"6206","170a097a":"6228",d3d33077:"6245",ab2d4c4e:"6328","54c82979":"6329","517ab754":"6338","9fd82ed5":"6339","24b17123":"6375",e760c2da:"6499",bc61cf7b:"6568","2c44959e":"6575",d950cab9:"6648","6d69c0e6":"6726","2b56989c":"6798","600b14f4":"6920",f5fe6467:"6935",b288c6c0:"6980",c4f57f6b:"7007","3552a42a":"7083","8439a13f":"7131",b60b3870:"7286",c00cc653:"7318","393be20
7":"7414","15d439f7":"7448","45e84ef6":"7584","2fee3f72":"7670",c03b8e47:"7808","2454f074":"7812","74ce1caf":"7822",dfbda8c6:"7832","0ce32e27":"7911","54c0b49f":"7915","1a4e3797":"7920","4cd00e1e":"7938","2179fbec":"7954","9a80955e":"7980",eec169e3:"7990","52ae103f":"8061","84f491d1":"8247","1c414186":"8307",c261d968:"8314","4dbb5cef":"8378","014961b5":"8413",d98bb358:"8457","8dd38787":"8481","1c93e70c":"8506",db27caff:"8540","4ea2d68f":"8543","6875c492":"8610","136ef751":"8705","00420783":"8740","486eabc2":"8792",d83b94ca:"8808","7f9f8af7":"8986",c25224f3:"8998","62a2d561":"9020",a816b6ed:"9089","0a63d782":"9099",c67164f2:"9163",b2882d63:"9202",c8703b92:"9236",c138efc9:"9295","0d56c7e0":"9308","9e1b3fde":"9321","2202aca0":"9330","443a3e0b":"9342","87c056f1":"9417",bcc3466e:"9486","1be78505":"9514","9ef6215d":"9524",e3a34105:"9545","50f245fe":"9574","2ee28b18":"9600",fa4d8017:"9628","2a3755a4":"9642","6224aa6e":"9650","2ca2b673":"9679","4c663521":"9738","14eb3368":"9817","68afb7ee":"9867","9bfe107e":"9998"}[e]||e,r.p+r.u(e)},(()=>{var e={1303:0,532:0};r.f.j=(a,c)=>{var b=r.o(e,a)?e[a]:void 0;if(0!==b)if(b)c.push(b[2]);else if(/^(1303|532)$/.test(a))e[a]=0;else{var f=new Promise(((c,f)=>b=e[a]=[c,f]));c.push(b[2]=f);var d=r.p+r.u(a),t=new Error;r.l(d,(c=>{if(r.o(e,a)&&(0!==(b=e[a])&&(e[a]=void 0),b)){var f=c&&("load"===c.type?"missing":c.type),d=c&&c.target&&c.target.src;t.message="Loading chunk "+a+" failed.\n("+f+": "+d+")",t.name="ChunkLoadError",t.type=f,t.request=d,b[1](t)}}),"chunk-"+a,a)}},r.O.j=a=>0===e[a];var a=(a,c)=>{var b,f,d=c[0],t=c[1],o=c[2],n=0;if(d.some((a=>0!==e[a]))){for(b in t)r.o(t,b)&&(r.m[b]=t[b]);if(o)var i=o(r)}for(a&&a(c);n{"use strict";var e,a,c,b,f,d={},t={};function r(e){var a=t[e];if(void 0!==a)return a.exports;var c=t[e]={exports:{}};return d[e].call(c.exports,c,c.exports,r),c.exports}r.m=d,e=[],r.O=(a,c,b,f)=>{if(!c){var d=1/0;for(i=0;i=f)&&Object.keys(r.O).every((e=>r.O[e](c[o])))?c.splice(o--,1):(t=!1,f0&&e[i-1][2]>f;i--)e[i]=e[i-1];e[i]=[c,b,f]},r.n=e=>{var a=e&&e.__esModule?()=>e.default:()=>e;return r.d(a,{a:a}),a},c=Object.getPrototypeOf?e=>Object.getPrototypeOf(e):e=>e.__proto__,r.t=function(e,b){if(1&b&&(e=this(e)),8&b)return e;if("object"==typeof e&&e){if(4&b&&e.__esModule)return e;if(16&b&&"function"==typeof e.then)return e}var f=Object.create(null);r.r(f);var d={};a=a||[null,c({}),c([]),c(c)];for(var t=2&b&&e;"object"==typeof t&&!~a.indexOf(t);t=c(t))Object.getOwnPropertyNames(t).forEach((a=>d[a]=()=>e[a]));return d.default=()=>e,r.d(f,d),f},r.d=(e,a)=>{for(var c in 
a)r.o(a,c)&&!r.o(e,c)&&Object.defineProperty(e,c,{enumerable:!0,get:a[c]})},r.f={},r.e=e=>Promise.all(Object.keys(r.f).reduce(((a,c)=>(r.f[c](e,a),a)),[])),r.u=e=>"assets/js/"+({53:"935f2afb",157:"ba3a79a7",187:"14394921",413:"6a9aaa03",541:"e1873644",568:"2bb74d03",743:"e3b75a28",768:"60322c79",774:"680c6196",777:"ac932941",813:"5a5243b8",915:"38f6e282",969:"cb288627",1045:"733349e1",1130:"97ecaa70",1242:"e82e3918",1267:"b6b94609",1275:"421846b6",1392:"81ecf48f",1424:"1099da71",1426:"09e31552",1572:"5d9e877a",1593:"206b0986",1613:"bf001a2b",1628:"50a61d48",1733:"860cdaa1",1871:"46cdbd44",1881:"766ebba1",1920:"5e031480",1965:"01a12966",2053:"d5ac0750",2161:"7272fd22",2255:"ae022e6e",2293:"ef1edfe5",2392:"140a1541",2443:"96061b27",2535:"814f3328",2563:"7c1ce3b5",2611:"b48352fa",2618:"bf0d52dc",2753:"3fd57b1b",2757:"b5e2882b",2953:"521f3440",2958:"a59adfdb",3085:"1f391b9e",3089:"a6aa9e1f",3093:"50e4e898",3128:"f0d9f96b",3137:"ea9ac021",3208:"65db24e6",3213:"5b866286",3217:"3b8c55ea",3353:"cac7aedd",3536:"a40e6f53",3608:"9e4087bc",3717:"0ae8bb96",3766:"7298f630",3793:"48284b96",3943:"beadeff4",4013:"01a85c17",4024:"e0fb7a76",4071:"9c1d8c5d",4087:"8da1f996",4195:"c4f5d8e4",4201:"4feabb6b",4281:"0b62a8cb",4299:"6e4c703c",4325:"ca893e4c",4394:"17a2b2f4",4428:"377613b6",4435:"b7c79da1",4467:"3b6ebf87",4517:"a14cea2e",4668:"3eae62bb",4699:"ce9a3510",4713:"93e1f41b",4816:"7ca86fab",4922:"e506623f",5003:"f0c051b5",5042:"a83145c4",5149:"71e11f7b",5419:"35f763da",5452:"e34dde7f",5458:"72c409f7",5488:"d10979fd",5496:"9d2cf940",5608:"738e78f6",5618:"84488bce",5621:"8149678f",5629:"70f55a58",5721:"e9303b53",5730:"196c648d",5873:"292a88ab",6007:"a1f7b462",6100:"76225974",6103:"ccc49370",6206:"abc27f4f",6228:"170a097a",6245:"d3d33077",6328:"ab2d4c4e",6329:"54c82979",6338:"517ab754",6339:"9fd82ed5",6375:"24b17123",6499:"e760c2da",6568:"bc61cf7b",6575:"2c44959e",6648:"d950cab9",6725:"12892875",6726:"6d69c0e6",6798:"2b56989c",6920:"600b14f4",6935:"f5fe6467",6980:"b288c6c0",7007:"c4f57f6b",7083:"3552a42a",7131:"8439a13f",7286:"b60b3870",7318:"c00cc653",7414:"393be207",7448:"15d439f7",7584:"45e84ef6",7670:"2fee3f72",7808:"c03b8e47",7812:"2454f074",7822:"74ce1caf",7832:"dfbda8c6",7911:"0ce32e27",7915:"54c0b49f",7918:"17896441",7920:"1a4e3797",7938:"4cd00e1e",7954:"2179fbec",7980:"9a80955e",7990:"eec169e3",8061:"52ae103f",8087:"95917782",8247:"84f491d1",8307:"1c414186",8314:"c261d968",8378:"4dbb5cef",8413:"014961b5",8457:"d98bb358",8481:"8dd38787",8506:"1c93e70c",8540:"db27caff",8543:"4ea2d68f",8610:"6875c492",8705:"136ef751",8740:"00420783",8792:"486eabc2",8808:"d83b94ca",8986:"7f9f8af7",8998:"c25224f3",9020:"62a2d561",9089:"a816b6ed",9099:"0a63d782",9163:"c67164f2",9202:"b2882d63",9236:"c8703b92",9295:"c138efc9",9308:"0d56c7e0",9321:"9e1b3fde",9330:"2202aca0",9342:"443a3e0b",9417:"87c056f1",9486:"bcc3466e",9514:"1be78505",9524:"9ef6215d",9545:"e3a34105",9574:"50f245fe",9600:"2ee28b18",9628:"fa4d8017",9642:"2a3755a4",9650:"6224aa6e",9679:"2ca2b673",9738:"4c663521",9817:"14eb3368",9867:"68afb7ee",9998:"9bfe107e"}[e]||e)+"."+{53:"31270f88",157:"0548357c",187:"63ee3ddf",413:"50e443c4",541:"79f7cc4a",568:"946a5aa8",743:"277527ba",768:"99c848f4",774:"7f9f2229",777:"8b6990b4",813:"873daa33",915:"b44bc145",969:"e32da5a1",1045:"04fea0ce",1130:"52b5bce3",1242:"43298e93",1267:"6551c492",1275:"22c2d533",1392:"eef7e8ae",1424:"343987b4",1426:"bd368343",1572:"f1303699",1593:"f3c2cf05",1613:"55f47980",1628:"2b24a8e2",1733:"ceb7c2ac",1871:"ce4ef45d",1881:"81b27725",1920:"22bace62",1965:"9482e834",2053:"9ef6e43a",2161:"0960a62e",
2255:"35618ea1",2293:"c554096b",2392:"9be18417",2443:"7d630201",2529:"631df343",2535:"e0d6a87a",2563:"c280c55c",2611:"70d4c834",2618:"8380fee8",2753:"293dd476",2757:"9cbbf00b",2953:"5df4145d",2958:"0aa99093",3085:"529b20a1",3089:"2ef77735",3093:"70b2bccd",3128:"37538281",3137:"c4411917",3208:"66a90f38",3213:"3c2a739f",3217:"952c812d",3353:"96129409",3536:"356f328b",3608:"14fbac47",3717:"84edbc49",3766:"4025eb40",3793:"f62581f8",3943:"2e3dfba8",4013:"7cef694c",4024:"e4b9accb",4071:"304e4ebc",4087:"09286f36",4195:"3c56c6d8",4201:"f69897ac",4281:"ec2f46ca",4299:"cd0a5eec",4325:"782137ac",4394:"03da8bf7",4428:"b773e38c",4435:"800c306c",4467:"7247b323",4517:"46ad6723",4668:"26b3370f",4699:"2507f939",4713:"c4c24fee",4816:"d6591275",4922:"9ae97683",4972:"81dca464",5003:"c7f1af6f",5042:"16660408",5149:"874f8cad",5419:"5110167a",5452:"e90ce975",5458:"9cc9fec0",5488:"415d092e",5496:"047b8e80",5608:"878482be",5618:"4eb5ea06",5621:"d5b8110e",5629:"a6cb49da",5721:"12dd0895",5730:"18e8c522",5873:"fccbbbc3",6007:"de586378",6100:"10a467a2",6103:"17c80ef1",6206:"e727cadb",6228:"24f53033",6245:"2b751631",6328:"310363f2",6329:"c8defe66",6338:"4d7e33fb",6339:"24af9556",6375:"b063dd9a",6499:"0250dd1d",6568:"43d5fc58",6575:"4e3d2db9",6648:"cdfc9d76",6672:"09a9911e",6725:"4a6841c1",6726:"ed90f9cf",6798:"7c0518fa",6920:"44bb60a5",6935:"b9ce7ba2",6945:"6b5ed935",6980:"0f8fcd4a",7007:"55f2c314",7083:"fd8a42f9",7131:"c361c36e",7286:"51e728df",7318:"79506202",7414:"a95f6891",7448:"3d8dd7ed",7584:"a7b81da1",7670:"faaddfb9",7808:"4fd40e95",7812:"ad88893d",7822:"4a980b7d",7832:"5c97d8ad",7911:"0457add0",7915:"4fe4f984",7918:"d4d7071c",7920:"ebb58ea4",7938:"fd0eea33",7954:"c061f84b",7980:"87972128",7990:"54ec64d0",8061:"45a029f0",8087:"859b774e",8247:"73eec081",8307:"97326d83",8314:"7392c4c4",8378:"6c1b9298",8413:"301b52fd",8457:"82985411",8481:"8fa71c18",8506:"ec4f496f",8540:"9a311858",8543:"2a5bf9c4",8610:"bf5bfe2b",8705:"b4425820",8718:"4664a626",8740:"c6603f1f",8792:"4a1f22e4",8808:"38e9fc21",8894:"bba49498",8986:"1d846aa7",8998:"5ac2e2dc",9020:"b381762f",9089:"21a120ea",9099:"ce9f5221",9163:"975d3b63",9202:"9763cd91",9236:"cd4ac66b",9295:"ce2a7863",9308:"a5ec27fb",9321:"5559e988",9330:"921ff3d4",9342:"f4cf1d1e",9417:"7555aa7b",9486:"51cdfdfe",9514:"57e490d5",9524:"365de8ae",9545:"879d0c09",9574:"3d1750e8",9600:"563fc4a2",9628:"54fe438e",9642:"1a091192",9650:"bd82cf1f",9679:"bdc8b605",9738:"13ab307e",9817:"a1667fa6",9867:"860398e8",9998:"c6fb4784"}[e]+".js",r.miniCssF=e=>{},r.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||new Function("return this")()}catch(e){if("object"==typeof window)return window}}(),r.o=(e,a)=>Object.prototype.hasOwnProperty.call(e,a),b={},f="kuzu-docs:",r.l=(e,a,c,d)=>{if(b[e])b[e].push(a);else{var t,o;if(void 0!==c)for(var n=document.getElementsByTagName("script"),i=0;i{t.onerror=t.onload=null,clearTimeout(s);var f=b[e];if(delete b[e],t.parentNode&&t.parentNode.removeChild(t),f&&f.forEach((e=>e(c))),a)return a(c)},s=setTimeout(l.bind(null,void 0,{type:"timeout",target:t}),12e4);t.onerror=l.bind(null,t.onerror),t.onload=l.bind(null,t.onload),o&&document.head.appendChild(t)}},r.r=e=>{"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},r.p="/docusaurus/",r.gca=function(e){return 
e={12892875:"6725",14394921:"187",17896441:"7918",76225974:"6100",95917782:"8087","935f2afb":"53",ba3a79a7:"157","6a9aaa03":"413",e1873644:"541","2bb74d03":"568",e3b75a28:"743","60322c79":"768","680c6196":"774",ac932941:"777","5a5243b8":"813","38f6e282":"915",cb288627:"969","733349e1":"1045","97ecaa70":"1130",e82e3918:"1242",b6b94609:"1267","421846b6":"1275","81ecf48f":"1392","1099da71":"1424","09e31552":"1426","5d9e877a":"1572","206b0986":"1593",bf001a2b:"1613","50a61d48":"1628","860cdaa1":"1733","46cdbd44":"1871","766ebba1":"1881","5e031480":"1920","01a12966":"1965",d5ac0750:"2053","7272fd22":"2161",ae022e6e:"2255",ef1edfe5:"2293","140a1541":"2392","96061b27":"2443","814f3328":"2535","7c1ce3b5":"2563",b48352fa:"2611",bf0d52dc:"2618","3fd57b1b":"2753",b5e2882b:"2757","521f3440":"2953",a59adfdb:"2958","1f391b9e":"3085",a6aa9e1f:"3089","50e4e898":"3093",f0d9f96b:"3128",ea9ac021:"3137","65db24e6":"3208","5b866286":"3213","3b8c55ea":"3217",cac7aedd:"3353",a40e6f53:"3536","9e4087bc":"3608","0ae8bb96":"3717","7298f630":"3766","48284b96":"3793",beadeff4:"3943","01a85c17":"4013",e0fb7a76:"4024","9c1d8c5d":"4071","8da1f996":"4087",c4f5d8e4:"4195","4feabb6b":"4201","0b62a8cb":"4281","6e4c703c":"4299",ca893e4c:"4325","17a2b2f4":"4394","377613b6":"4428",b7c79da1:"4435","3b6ebf87":"4467",a14cea2e:"4517","3eae62bb":"4668",ce9a3510:"4699","93e1f41b":"4713","7ca86fab":"4816",e506623f:"4922",f0c051b5:"5003",a83145c4:"5042","71e11f7b":"5149","35f763da":"5419",e34dde7f:"5452","72c409f7":"5458",d10979fd:"5488","9d2cf940":"5496","738e78f6":"5608","84488bce":"5618","8149678f":"5621","70f55a58":"5629",e9303b53:"5721","196c648d":"5730","292a88ab":"5873",a1f7b462:"6007",ccc49370:"6103",abc27f4f:"6206","170a097a":"6228",d3d33077:"6245",ab2d4c4e:"6328","54c82979":"6329","517ab754":"6338","9fd82ed5":"6339","24b17123":"6375",e760c2da:"6499",bc61cf7b:"6568","2c44959e":"6575",d950cab9:"6648","6d69c0e6":"6726","2b56989c":"6798","600b14f4":"6920",f5fe6467:"6935",b288c6c0:"6980",c4f57f6b:"7007","3552a42a":"7083","8439a13f":"7131",b60b3870:"7286",c00cc653:"7318","393be207":"7414","15d439f7":"7448","45e84ef6":"7584","2fee3f72":"7670",c03b8e47:"7808","2454f074":"7812","74ce1caf":"7822",dfbda8c6:"7832","0ce32e27":"7911","54c0b49f":"7915","1a4e3797":"7920","4cd00e1e":"7938","2179fbec":"7954","9a80955e":"7980",eec169e3:"7990","52ae103f":"8061","84f491d1":"8247","1c414186":"8307",c261d968:"8314","4dbb5cef":"8378","014961b5":"8413",d98bb358:"8457","8dd38787":"8481","1c93e70c":"8506",db27caff:"8540","4ea2d68f":"8543","6875c492":"8610","136ef751":"8705","00420783":"8740","486eabc2":"8792",d83b94ca:"8808","7f9f8af7":"8986",c25224f3:"8998","62a2d561":"9020",a816b6ed:"9089","0a63d782":"9099",c67164f2:"9163",b2882d63:"9202",c8703b92:"9236",c138efc9:"9295","0d56c7e0":"9308","9e1b3fde":"9321","2202aca0":"9330","443a3e0b":"9342","87c056f1":"9417",bcc3466e:"9486","1be78505":"9514","9ef6215d":"9524",e3a34105:"9545","50f245fe":"9574","2ee28b18":"9600",fa4d8017:"9628","2a3755a4":"9642","6224aa6e":"9650","2ca2b673":"9679","4c663521":"9738","14eb3368":"9817","68afb7ee":"9867","9bfe107e":"9998"}[e]||e,r.p+r.u(e)},(()=>{var e={1303:0,532:0};r.f.j=(a,c)=>{var b=r.o(e,a)?e[a]:void 0;if(0!==b)if(b)c.push(b[2]);else if(/^(1303|532)$/.test(a))e[a]=0;else{var f=new Promise(((c,f)=>b=e[a]=[c,f]));c.push(b[2]=f);var d=r.p+r.u(a),t=new Error;r.l(d,(c=>{if(r.o(e,a)&&(0!==(b=e[a])&&(e[a]=void 0),b)){var f=c&&("load"===c.type?"missing":c.type),d=c&&c.target&&c.target.src;t.message="Loading chunk "+a+" failed.\n("+f+": 
"+d+")",t.name="ChunkLoadError",t.type=f,t.request=d,b[1](t)}}),"chunk-"+a,a)}},r.O.j=a=>0===e[a];var a=(a,c)=>{var b,f,d=c[0],t=c[1],o=c[2],n=0;if(d.some((a=>0!==e[a]))){for(b in t)r.o(t,b)&&(r.m[b]=t[b]);if(o)var i=o(r)}for(a&&a(c);n