From b455626a14f115ba1061c5565c6fa2b5df760e8b Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Tue, 8 Oct 2024 14:59:34 +0200 Subject: [PATCH 01/30] Add the required workflow files... Signed-off-by: Johannes Kalmbach --- .github/workflows/sparql-conformance.yml | 91 +++++++++++++++++++ .../workflows/upload-sparql-conformance.yml | 61 +++++++++++++ 2 files changed, 152 insertions(+) create mode 100644 .github/workflows/sparql-conformance.yml create mode 100644 .github/workflows/upload-sparql-conformance.yml diff --git a/.github/workflows/sparql-conformance.yml b/.github/workflows/sparql-conformance.yml new file mode 100644 index 0000000000..53c747f49c --- /dev/null +++ b/.github/workflows/sparql-conformance.yml @@ -0,0 +1,91 @@ +name: sparql-test-suite + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + merge_group: + +jobs: + build: + env: + compiler: clang + compiler-version: 16 + build-type: Release + cmake-flags: "-DCMAKE_C_COMPILER=clang-16 -DCMAKE_CXX_COMPILER=clang++-16" + + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v3 + with: + submodules: "recursive" + path: qlever-code + - name: Checkout sparql-test-suite-files + uses: actions/checkout@v3 + with: + repository: "w3c/rdf-tests" + path: sparql-test-suite + - name: Checkout qlever-test-suite + uses: actions/checkout@v3 + with: + repository: "SIRDNARch/qlever-conformance-tests" + token: ${{ secrets.CONFORMANCE_REPO_ACCESS_TOKEN }} + path: qlever-test-suite + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.10" + - name: Install python dependencies + run: | + python -m pip install --upgrade pip + pip install requests + pip install rdflib + - name: Install dependencies + uses: ./qlever-code/.github/workflows/install-dependencies-ubuntu + - name: Install compiler + uses: ./qlever-code/.github/workflows/install-compiler-ubuntu + with: + compiler: "clang" + compiler-version: "16" + - name: Create build directory + run: mkdir 
${{github.workspace}}/qlever-code/build + - name: Configure CMake + run: cmake -S ${{github.workspace}}/qlever-code/ -B ${{github.workspace}}/qlever-code/build ${{env.cmake-flags}} -DCMAKE_BUILD_TYPE=${{env.build-type}} -DLOGLEVEL=INFO -DUSE_PARALLEL=false + - name: Build + run: cmake --build ${{github.workspace}}/qlever-code/build --config ${{env.build-type}} -- -j $(nproc) + - name: Execute test suite + run: | + cd qlever-test-suite + python testsuite.py config http://0.0.0.0 7001 ${{github.workspace}}/sparql-test-suite/sparql/sparql11/ ${{github.workspace}}/qlever-code/build/ localhost sparql sparql + python testsuite.py extract + python testsuite.py ${{ github.sha }} + cd .. + # Only upload directly if this is not a pull request. In this + # case we are on the master branch and have access to the token. + - name: "Submit data to server" + if: github.event_name != 'pull_request' + env: + SERVER_URL: ${{ secrets.SPARQL_CONFORMANCE_SERVER_URL }} + API_KEY: ${{ secrets.SPARQL_CONFORMANCE_SERVER_KEY }} + run: | + curl -H "x-api-key: $API_KEY" -H "event: ${{github.event_name}}" -H "sha: ${{github.sha}}" -F "file=@${{github.workspace}}/qlever-test-suite/results/${{ github.sha }}.json.bz2" $SERVER_URL/upload + + # For a pull request we store the file as well as some information + # about this PR (number, how to check it out, etc.) 
and upload it as an artifact + - name: Save PR number and coverage file in same directory + if: github.event_name == 'pull_request' + # Note: If you change any of the filenames here, you also have to change them in `upload-sparql-conformance.yml` + run : | + mkdir -p conformance-report + echo ${{ github.event.number }} > ./conformance-report/pr + echo ${{ github.repository }} > ./conformance-report/github_repository + echo ${GITHUB_REF} > ./conformance-report/github_ref + echo ${{github.event.pull_request.head.sha}} > ./conformance-report/sha + mv ${{ github.workspace}}/qlever-test-suite/results/${{ github.sha }}.json.bz2 conformance-report/${{ github.event.pull_request.head.sha }}.json.bz2 + - name: Upload coverage artifact + if: github.event_name == 'pull_request' + uses: actions/upload-artifact@v3 + with: + name: conformance-report + path: conformance-report/ \ No newline at end of file diff --git a/.github/workflows/upload-sparql-conformance.yml b/.github/workflows/upload-sparql-conformance.yml new file mode 100644 index 0000000000..fe9be1580c --- /dev/null +++ b/.github/workflows/upload-sparql-conformance.yml @@ -0,0 +1,61 @@ +name: Upload conformance tests result + +on: + workflow_run: + # This has to be the `name:` of the workflow in `sparql-conformance.yml`. + # Start when this workflow has finished successfully. + workflows: [sparql-test-suite] + types: + - completed + +jobs: + upload: + runs-on: ubuntu-latest + if: > + github.event.workflow_run.event == 'pull_request' && + github.event.workflow_run.conclusion == 'success' + steps: + - name: 'Download artifact' + uses: actions/github-script@v6 + # The following script is taken from the link stated at the + # beginning of this file. It manually downloads an artifact + # from another workflow. 
+ with: + script: | + var artifacts = await github.rest.actions.listWorkflowRunArtifacts({ + owner: context.repo.owner, + repo: context.repo.repo, + run_id: ${{github.event.workflow_run.id }}, + }); + var matchArtifact = artifacts.data.artifacts.filter((artifact) => { + return artifact.name == "conformance-report" + })[0]; + var download = await github.rest.actions.downloadArtifact({ + owner: context.repo.owner, + repo: context.repo.repo, + artifact_id: matchArtifact.id, + archive_format: 'zip', + }); + var fs = require('fs'); + fs.writeFileSync('${{github.workspace}}/conformance-report.zip', Buffer.from(download.data)); + - run: unzip conformance-report.zip + # Read the metadata into environment variables. + - name: "Read PR number" + run: echo "pr_number=`cat pr`" >> $GITHUB_ENV + - name: "Read Github Ref" + run: echo "original_github_ref=`cat github_ref`" >> $GITHUB_ENV; + - name: "Read Github SHA" + run: echo "commit_sha=`cat sha`" >> $GITHUB_ENV; + - name: "Read Github Repository" + run: echo "original_github_repository=`cat github_repository`" >> $GITHUB_ENV; + # Upload the conformance report together with the metadata of the PR + # (event, commit SHA, PR number, repository) to the conformance + # server. Unlike the workflow that ran on the pull request, this + # workflow has access to the required secrets. NOTE: The metadata + # comes from the PR's artifact and must be treated as untrusted. 
+ - name: "Submit data to server" + env: + SERVER_URL: ${{ secrets.SPARQL_CONFORMANCE_SERVER_URL }} + API_KEY: ${{ secrets.SPARQL_CONFORMANCE_SERVER_KEY }} + run: | + curl -H "x-api-key: $API_KEY" -H "event: ${{github.event.workflow_run.event}}" -H "sha: ${{env.commit_sha}}" -H "pr-number: ${{env.pr_number}}" -H "repo: ${{env.original_github_repository}}" -F "file=@${{env.commit_sha}}.json.bz2" $SERVER_URL/upload \ No newline at end of file From 3205fa2da872a5fb861669009e5d0f4b7794c1d2 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Tue, 8 Oct 2024 15:13:37 +0200 Subject: [PATCH 02/30] A dummy file for the workflow run thingy. Signed-off-by: Johannes Kalmbach --- .github/workflows/sparql-conformance.yml | 50 ++----------------- .../workflows/upload-sparql-conformance.yml | 2 +- 2 files changed, 5 insertions(+), 47 deletions(-) diff --git a/.github/workflows/sparql-conformance.yml b/.github/workflows/sparql-conformance.yml index 53c747f49c..cfb2552670 100644 --- a/.github/workflows/sparql-conformance.yml +++ b/.github/workflows/sparql-conformance.yml @@ -17,50 +17,8 @@ jobs: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v3 - with: - submodules: "recursive" - path: qlever-code - - name: Checkout sparql-test-suite-files - uses: actions/checkout@v3 - with: - repository: "w3c/rdf-tests" - path: sparql-test-suite - - name: Checkout qlever-test-suite - uses: actions/checkout@v3 - with: - repository: "SIRDNARch/qlever-conformance-tests" - token: ${{ secrets.CONFORMANCE_REPO_ACCESS_TOKEN }} - path: qlever-test-suite - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: "3.10" - - name: Install python dependencies - run: | - python -m pip install --upgrade pip - pip install requests - pip install rdflib - - name: Install dependencies - uses: ./qlever-code/.github/workflows/install-dependencies-ubuntu - - name: Install compiler - uses: ./qlever-code/.github/workflows/install-compiler-ubuntu - with: - compiler: "clang" - 
compiler-version: "16" - - name: Create build directory - run: mkdir ${{github.workspace}}/qlever-code/build - - name: Configure CMake - run: cmake -S ${{github.workspace}}/qlever-code/ -B ${{github.workspace}}/qlever-code/build ${{env.cmake-flags}} -DCMAKE_BUILD_TYPE=${{env.build-type}} -DLOGLEVEL=INFO -DUSE_PARALLEL=false - - name: Build - run: cmake --build ${{github.workspace}}/qlever-code/build --config ${{env.build-type}} -- -j $(nproc) - - name: Execute test suite - run: | - cd qlever-test-suite - python testsuite.py config http://0.0.0.0 7001 ${{github.workspace}}/sparql-test-suite/sparql/sparql11/ ${{github.workspace}}/qlever-code/build/ localhost sparql sparql - python testsuite.py extract - python testsuite.py ${{ github.sha }} - cd .. + - name: "spoof a json file for experimenting" + run: echo "{}" > dummyResults.json # Only upload directly if this is not a pull request. In this # case we are on the master branch and have access to the token. - name: "Submit data to server" @@ -69,7 +27,7 @@ jobs: SERVER_URL: ${{ secrets.SPARQL_CONFORMANCE_SERVER_URL }} API_KEY: ${{ secrets.SPARQL_CONFORMANCE_SERVER_KEY }} run: | - curl -H "x-api-key: $API_KEY" -H "event: ${{github.event_name}}" -H "sha: ${{github.sha}}" -F "file=@${{github.workspace}}/qlever-test-suite/results/${{ github.sha }}.json.bz2" $SERVER_URL/upload + curl -H "x-api-key: $API_KEY" -H "event: ${{github.event_name}}" -H "sha: ${{github.sha}}" -F "file=@${{github.workspace}}/dummyResults.json" $SERVER_URL/upload # For a pull request we store the file as well as some information # about this PR (number, how to check it out, etc.) 
and upload it as an artifact @@ -82,7 +40,7 @@ jobs: echo ${{ github.repository }} > ./conformance-report/github_repository echo ${GITHUB_REF} > ./conformance-report/github_ref echo ${{github.event.pull_request.head.sha}} > ./conformance-report/sha - mv ${{ github.workspace}}/qlever-test-suite/results/${{ github.sha }}.json.bz2 conformance-report/${{ github.event.pull_request.head.sha }}.json.bz2 + mv ${{ github.workspace}}/dummyResults.json conformance-report/${{ github.event.pull_request.head.sha }}.json - name: Upload coverage artifact if: github.event_name == 'pull_request' uses: actions/upload-artifact@v3 diff --git a/.github/workflows/upload-sparql-conformance.yml b/.github/workflows/upload-sparql-conformance.yml index fe9be1580c..3c249b7eee 100644 --- a/.github/workflows/upload-sparql-conformance.yml +++ b/.github/workflows/upload-sparql-conformance.yml @@ -58,4 +58,4 @@ jobs: SERVER_URL: ${{ secrets.SPARQL_CONFORMANCE_SERVER_URL }} API_KEY: ${{ secrets.SPARQL_CONFORMANCE_SERVER_KEY }} run: | - curl -H "x-api-key: $API_KEY" -H "event: ${{github.event.workflow_run.event}}" -H "sha: ${{env.commit_sha}}" -H "pr-number: ${{env.pr_number}}" -H "repo: ${{env.original_github_repository}}" -F "file=@${{env.commit_sha}}.json.bz2" $SERVER_URL/upload \ No newline at end of file + curl -H "x-api-key: $API_KEY" -H "event: ${{github.event.workflow_run.event}}" -H "sha: ${{env.commit_sha}}" -H "pr-number: ${{env.pr_number}}" -H "repo: ${{env.original_github_repository}}" -F "file=@${{env.commit_sha}}.json" $SERVER_URL/upload \ No newline at end of file From b7bedba82609b89570fa730964c25011617f2970 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Tue, 8 Oct 2024 15:19:59 +0200 Subject: [PATCH 03/30] Another test... 
Signed-off-by: Johannes Kalmbach --- .github/workflows/sparql-conformance.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/sparql-conformance.yml b/.github/workflows/sparql-conformance.yml index cfb2552670..528abce449 100644 --- a/.github/workflows/sparql-conformance.yml +++ b/.github/workflows/sparql-conformance.yml @@ -27,6 +27,7 @@ jobs: SERVER_URL: ${{ secrets.SPARQL_CONFORMANCE_SERVER_URL }} API_KEY: ${{ secrets.SPARQL_CONFORMANCE_SERVER_KEY }} run: | + echo "x-api-key: $API_KEY" -H "event: ${{github.event_name}}" -H "sha: ${{github.sha}}" -F "file=@${{github.workspace}}/dummyResults.json" $SERVER_URL/upload curl -H "x-api-key: $API_KEY" -H "event: ${{github.event_name}}" -H "sha: ${{github.sha}}" -F "file=@${{github.workspace}}/dummyResults.json" $SERVER_URL/upload # For a pull request we store the file as well as some information From 8bd62997d2d06740180fc8e11ce0737732b79132 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Tue, 8 Oct 2024 16:03:46 +0200 Subject: [PATCH 04/30] More sparql conformance stuff... 
Signed-off-by: Johannes Kalmbach --- .github/workflows/sparql-conformance.yml | 51 +++++++++++++++++-- .../workflows/upload-sparql-conformance.yml | 2 +- 2 files changed, 47 insertions(+), 6 deletions(-) diff --git a/.github/workflows/sparql-conformance.yml b/.github/workflows/sparql-conformance.yml index 528abce449..160de04ca1 100644 --- a/.github/workflows/sparql-conformance.yml +++ b/.github/workflows/sparql-conformance.yml @@ -17,8 +17,50 @@ jobs: runs-on: ubuntu-22.04 steps: - - name: "spoof a json file for experimenting" - run: echo "{}" > dummyResults.json + - name: Checkout qlever-test-suite + uses: actions/checkout@v3 + with: + repository: "SIRDNARch/qlever-conformance-tests" + token: ${{ secrets.CONFORMANCE_REPO_ACCESS_TOKEN }} + path: qlever-test-suite + - uses: actions/checkout@v3 + with: + submodules: "recursive" + path: qlever-code + - name: Checkout sparql-test-suite-files + uses: actions/checkout@v3 + with: + repository: "w3c/rdf-tests" + path: sparql-test-suite + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.10" + - name: Install python dependencies + run: | + python -m pip install --upgrade pip + pip install requests + pip install rdflib + - name: Install dependencies + uses: ./qlever-code/.github/workflows/install-dependencies-ubuntu + - name: Install compiler + uses: ./qlever-code/.github/workflows/install-compiler-ubuntu + with: + compiler: "clang" + compiler-version: "16" + - name: Create build directory + run: mkdir ${{github.workspace}}/qlever-code/build + - name: Configure CMake + run: cmake -S ${{github.workspace}}/qlever-code/ -B ${{github.workspace}}/qlever-code/build ${{env.cmake-flags}} -DCMAKE_BUILD_TYPE=${{env.build-type}} -DLOGLEVEL=INFO -DUSE_PARALLEL=false + - name: Build + run: cmake --build ${{github.workspace}}/qlever-code/build --config ${{env.build-type}} -- -j $(nproc) + - name: Execute test suite + run: | + cd qlever-test-suite + python testsuite.py config http://0.0.0.0 7001 
${{github.workspace}}/sparql-test-suite/sparql/sparql11/ ${{github.workspace}}/qlever-code/build/ localhost sparql sparql + python testsuite.py extract + python testsuite.py ${{ github.sha }} + cd .. # Only upload directly if this is not a pull request. In this # case we are on the master branch and have access to the token. - name: "Submit data to server" @@ -27,8 +69,7 @@ jobs: SERVER_URL: ${{ secrets.SPARQL_CONFORMANCE_SERVER_URL }} API_KEY: ${{ secrets.SPARQL_CONFORMANCE_SERVER_KEY }} run: | - echo "x-api-key: $API_KEY" -H "event: ${{github.event_name}}" -H "sha: ${{github.sha}}" -F "file=@${{github.workspace}}/dummyResults.json" $SERVER_URL/upload - curl -H "x-api-key: $API_KEY" -H "event: ${{github.event_name}}" -H "sha: ${{github.sha}}" -F "file=@${{github.workspace}}/dummyResults.json" $SERVER_URL/upload + curl -H "x-api-key: $API_KEY" -H "event: ${{github.event_name}}" -H "sha: ${{github.sha}}" -F "file=@${{github.workspace}}/qlever-test-suite/results/${{ github.sha }}.json.bz2" $SERVER_URL/upload # For a pull request we store the file as well as some information # about this PR (number, how to check it out, etc.) 
and upload it as an artifact @@ -41,7 +82,7 @@ jobs: echo ${{ github.repository }} > ./conformance-report/github_repository echo ${GITHUB_REF} > ./conformance-report/github_ref echo ${{github.event.pull_request.head.sha}} > ./conformance-report/sha - mv ${{ github.workspace}}/dummyResults.json conformance-report/${{ github.event.pull_request.head.sha }}.json + mv ${{ github.workspace}}/qlever-test-suite/results/${{ github.sha }}.json.bz2 conformance-report/${{ github.event.pull_request.head.sha }}.json.bz2 - name: Upload coverage artifact if: github.event_name == 'pull_request' uses: actions/upload-artifact@v3 diff --git a/.github/workflows/upload-sparql-conformance.yml b/.github/workflows/upload-sparql-conformance.yml index 3c249b7eee..fe9be1580c 100644 --- a/.github/workflows/upload-sparql-conformance.yml +++ b/.github/workflows/upload-sparql-conformance.yml @@ -58,4 +58,4 @@ jobs: SERVER_URL: ${{ secrets.SPARQL_CONFORMANCE_SERVER_URL }} API_KEY: ${{ secrets.SPARQL_CONFORMANCE_SERVER_KEY }} run: | - curl -H "x-api-key: $API_KEY" -H "event: ${{github.event.workflow_run.event}}" -H "sha: ${{env.commit_sha}}" -H "pr-number: ${{env.pr_number}}" -H "repo: ${{env.original_github_repository}}" -F "file=@${{env.commit_sha}}.json" $SERVER_URL/upload \ No newline at end of file + curl -H "x-api-key: $API_KEY" -H "event: ${{github.event.workflow_run.event}}" -H "sha: ${{env.commit_sha}}" -H "pr-number: ${{env.pr_number}}" -H "repo: ${{env.original_github_repository}}" -F "file=@${{env.commit_sha}}.json.bz2" $SERVER_URL/upload \ No newline at end of file From 5cb6a0e048e0575c2d6e10f702ed24cf56d3a272 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Tue, 7 Jan 2025 12:35:36 +0100 Subject: [PATCH 05/30] Backup in the middle. 
Signed-off-by: Johannes Kalmbach --- src/engine/CMakeLists.txt | 2 +- src/engine/ExistsScan.cpp | 118 ++++++++++++++++++ src/engine/ExistsScan.h | 55 ++++++++ src/engine/GroupBy.cpp | 22 +--- .../sparqlExpressions/ExistsExpression.cpp | 5 + .../sparqlExpressions/ExistsExpression.h | 39 ++++++ .../sparqlExpressions/SparqlExpression.cpp | 14 +++ .../sparqlExpressions/SparqlExpression.h | 10 ++ 8 files changed, 248 insertions(+), 17 deletions(-) create mode 100644 src/engine/ExistsScan.cpp create mode 100644 src/engine/ExistsScan.h create mode 100644 src/engine/sparqlExpressions/ExistsExpression.cpp create mode 100644 src/engine/sparqlExpressions/ExistsExpression.h diff --git a/src/engine/CMakeLists.txt b/src/engine/CMakeLists.txt index be22a64d5d..c724a8fb39 100644 --- a/src/engine/CMakeLists.txt +++ b/src/engine/CMakeLists.txt @@ -14,5 +14,5 @@ add_library(engine CartesianProductJoin.cpp TextIndexScanForWord.cpp TextIndexScanForEntity.cpp TextLimit.cpp LazyGroupBy.cpp GroupByHashMapOptimization.cpp SpatialJoin.cpp CountConnectedSubgraphs.cpp SpatialJoinAlgorithms.cpp PathSearch.cpp ExecuteUpdate.cpp - Describe.cpp) + Describe.cpp ExistsScan.cpp) qlever_target_link_libraries(engine util index parser sparqlExpressions http SortPerformanceEstimator Boost::iostreams s2) diff --git a/src/engine/ExistsScan.cpp b/src/engine/ExistsScan.cpp new file mode 100644 index 0000000000..f42da68f3d --- /dev/null +++ b/src/engine/ExistsScan.cpp @@ -0,0 +1,118 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Johannes Kalmbach + +#include "engine/ExistsScan.h" + +#include "util/JoinAlgorithms/JoinAlgorithms.h" + +// _____________________________________________________________________________ +ExistsScan::ExistsScan(QueryExecutionContext* qec, + std::shared_ptr left, + std::shared_ptr right, + Variable existsVariable) + : Operation{qec}, + left_{std::move(left)}, + right_{std::move(right)}, + existsVariable_{std::move(existsVariable)}, + joinColumns_{QueryExecutionTree::getJoinColumns(*left_, *right_)} {} + +// _____________________________________________________________________________ +string ExistsScan::getCacheKeyImpl() const { + return absl::StrCat("EXISTS SCAN left: ", left_->getCacheKey(), + " right: ", right_->getCacheKey()); +} + +// _____________________________________________________________________________ +string ExistsScan::getDescriptor() const { return "EXISTS scan"; } + +// ____________________________________________________________________________ +VariableToColumnMap ExistsScan::computeVariableToColumnMap() const { + auto res = left_->getVariableColumns(); + AD_CONTRACT_CHECK( + !res.contains(existsVariable_), + "The target variable of an exists scan must be a new variable"); + res[existsVariable_] = makeAlwaysDefinedColumn(getResultWidth() - 1); + return res; +} + +// ____________________________________________________________________________ +size_t ExistsScan::getResultWidth() const { + // We add one column to the input. 
+ return left_->getResultWidth() + 1; +} + +// ____________________________________________________________________________ +vector ExistsScan::resultSortedOn() const { + return left_->resultSortedOn(); +} + +// ____________________________________________________________________________ +float ExistsScan::getMultiplicity(size_t col) { + if (col < getResultWidth() - 1) { + return left_->getMultiplicity(col); + } + // The multiplicity of the boolean column can be a dummy value, as it should + // be never used for joins etc. + return 1; +} + +// ____________________________________________________________________________ +uint64_t ExistsScan::getSizeEstimateBeforeLimit() { + return left_->getSizeEstimate(); +} + +// ____________________________________________________________________________ +size_t ExistsScan::getCostEstimate() { + return left_->getCostEstimate() + right_->getCostEstimate() + + left_->getSizeEstimate() + right_->getSizeEstimate(); +} + +// ____________________________________________________________________________ +ProtoResult ExistsScan::computeResult([[maybe_unused]] bool requestLaziness) { + auto leftRes = left_->getResult(); + auto rightRes = right_->getResult(); + const auto& left = leftRes->idTable(); + const auto& right = rightRes->idTable(); + + ad_utility::JoinColumnMapping joinColumnData{joinColumns_, left.numColumns(), + right.numColumns()}; + + IdTableView<0> joinColumnsLeft = + left.asColumnSubsetView(joinColumnData.jcsLeft()); + IdTableView<0> joinColumnsRight = + right.asColumnSubsetView(joinColumnData.jcsRight()); + + checkCancellation(); + + auto noopRowAdder = [](auto&&...) {}; + + // TODO Memory limit. 
+ std::vector notExistsIndices; + auto actionForNotExisting = + [&notExistsIndices, begin = joinColumnsLeft.begin()](const auto& itLeft) { + notExistsIndices.push_back(itLeft - begin); + }; + + // TODO Handle UNDEF values correctly (and efficiently) + auto findUndefDispatch = [](const auto& row, It begin, auto end, + bool& outOfOrder) { + return std::array{}; + }; + + auto checkCancellationLambda = [this] { checkCancellation(); }; + [[maybe_unused]] auto numOutOfOrder = ad_utility::zipperJoinWithUndef( + joinColumnsLeft, joinColumnsRight, ql::ranges::lexicographical_compare, + noopRowAdder, findUndefDispatch, findUndefDispatch, actionForNotExisting, + checkCancellationLambda); + + // Set up the result; + IdTable result = left.clone(); + result.addEmptyColumn(); + decltype(auto) existsCol = result.getColumn(getResultWidth() - 1); + ql::ranges::fill(existsCol, Id::makeFromBool(true)); + for (size_t notExistsIndex : notExistsIndices) { + existsCol[notExistsIndex] = Id::makeFromBool(false); + } + return {std::move(result), resultSortedOn(), leftRes->getCopyOfLocalVocab()}; +} diff --git a/src/engine/ExistsScan.h b/src/engine/ExistsScan.h new file mode 100644 index 0000000000..b08e06c542 --- /dev/null +++ b/src/engine/ExistsScan.h @@ -0,0 +1,55 @@ +// Copyright 2025, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Johannes Kalmbach + +#pragma once + +#include "engine/Operation.h" +#include "engine/QueryExecutionTree.h" + +class ExistsScan : public Operation { + private: + std::shared_ptr left_; + std::shared_ptr right_; + std::vector> joinColumns_; + + Variable existsVariable_; + + vector _multiplicities; + std::vector> _matchedColumns; + + public: + ExistsScan(QueryExecutionContext* qec, + std::shared_ptr left, + std::shared_ptr right, + Variable existsVariable); + + protected: + string getCacheKeyImpl() const override; + + public: + string getDescriptor() const override; + + size_t getResultWidth() const override; + + vector resultSortedOn() const override; + + bool knownEmptyResult() override { return left_->knownEmptyResult(); } + + float getMultiplicity(size_t col) override; + + private: + uint64_t getSizeEstimateBeforeLimit() override; + + public: + size_t getCostEstimate() override; + + vector getChildren() override { + return {left_.get(), right_.get()}; + } + + private: + ProtoResult computeResult([[maybe_unused]] bool requestLaziness) override; + + VariableToColumnMap computeVariableToColumnMap() const override; +}; diff --git a/src/engine/GroupBy.cpp b/src/engine/GroupBy.cpp index 6fdeca1833..a6ff49bbe1 100644 --- a/src/engine/GroupBy.cpp +++ b/src/engine/GroupBy.cpp @@ -366,8 +366,6 @@ ProtoResult GroupBy::computeResult(bool requestLaziness) { } if (useHashMapOptimization) { - // Helper lambda that calls `computeGroupByForHashMapOptimization` for the - // given `subresults`. auto computeWithHashMap = [this, &metadataForUnsequentialData, &groupByCols](auto&& subresults) { auto doCompute = [&] { @@ -378,10 +376,9 @@ ProtoResult GroupBy::computeResult(bool requestLaziness) { return ad_utility::callFixedSize(groupByCols.size(), doCompute); }; - // Now call `computeWithHashMap` and return the result. It expects a range - // of results, so if the result is fully materialized, we create an array - // with a single element. 
if (subresult->isFullyMaterialized()) { + // `computeWithHashMap` takes a range, so we artificially create one with + // a single input. return computeWithHashMap( std::array{std::pair{std::cref(subresult->idTable()), std::cref(subresult->localVocab())}}); @@ -1509,36 +1506,29 @@ Result GroupBy::computeGroupByForHashMapOptimization( NUM_GROUP_COLUMNS == 0); LocalVocab localVocab; - // Initialize the data for the aggregates of the GROUP BY operation. + // Initialize aggregation data HashMapAggregationData aggregationData( getExecutionContext()->getAllocator(), aggregateAliases, columnIndices.size()); - // Process the input blocks (pairs of `IdTable` and `LocalVocab`) one after - // the other. ad_utility::Timer lookupTimer{ad_utility::Timer::Stopped}; ad_utility::Timer aggregationTimer{ad_utility::Timer::Stopped}; for (const auto& [inputTableRef, inputLocalVocabRef] : subresults) { + // Also support `std::reference_wrapper` as the input. const IdTable& inputTable = inputTableRef; const LocalVocab& inputLocalVocab = inputLocalVocabRef; - // Merge the local vocab of each input block. - // - // NOTE: If the input blocks have very similar or even identical non-empty - // local vocabs, no deduplication is performed. localVocab.mergeWith(std::span{&inputLocalVocab, 1}); - - // Setup the `EvaluationContext` for this input block. + // Initialize evaluation context sparqlExpression::EvaluationContext evaluationContext( *getExecutionContext(), _subtree->getVariableColumns(), inputTable, getExecutionContext()->getAllocator(), localVocab, cancellationHandle_, deadline_); + evaluationContext._groupedVariables = ad_utility::HashSet{ _groupByVariables.begin(), _groupByVariables.end()}; evaluationContext._isPartOfGroupBy = true; - // Iterate of the rows of this input block. Process (up to) - // `GROUP_BY_HASH_MAP_BLOCK_SIZE` rows at a time. 
for (size_t i = 0; i < inputTable.size(); i += GROUP_BY_HASH_MAP_BLOCK_SIZE) { checkCancellation(); diff --git a/src/engine/sparqlExpressions/ExistsExpression.cpp b/src/engine/sparqlExpressions/ExistsExpression.cpp new file mode 100644 index 0000000000..6737d3ed7b --- /dev/null +++ b/src/engine/sparqlExpressions/ExistsExpression.cpp @@ -0,0 +1,5 @@ +// +// Created by kalmbacj on 1/7/25. +// + +#include "ExistsExpression.h" diff --git a/src/engine/sparqlExpressions/ExistsExpression.h b/src/engine/sparqlExpressions/ExistsExpression.h new file mode 100644 index 0000000000..5ec68acd61 --- /dev/null +++ b/src/engine/sparqlExpressions/ExistsExpression.h @@ -0,0 +1,39 @@ +// +// Created by kalmbacj on 1/7/25. +// + +#pragma once + +#include + +#include "engine/sparqlExpressions/SparqlExpression.h" +#include "parser/ParsedQuery.h" + +namespace sparqlExpression { +class ExistsExpression : public SparqlExpression { + private: + std::variant argument_; + + public: + auto& argument() { return argument_; } + ExistsExpression(ParsedQuery query) : argument_{std::move(query)} {} + + ExpressionResult evaluate(EvaluationContext* context) const override { + AD_CONTRACT_CHECK(std::holds_alternative(argument_)); + return std::get(argument_); + } + + //_________________________________________________________________________ + [[nodiscard]] string getCacheKey( + const VariableToColumnMap& varColMap) const override { + // TODO get a proper cache key here + AD_CONTRACT_CHECK(std::holds_alternative(argument_)); + return absl::StrCat( + "EXISTS WITH COLUMN ", + varColMap.at(std::get(argument_)).columnIndex_); + } + + private: + std::span childrenImpl() override { return {}; } +}; +} // namespace sparqlExpression diff --git a/src/engine/sparqlExpressions/SparqlExpression.cpp b/src/engine/sparqlExpressions/SparqlExpression.cpp index b5ec3aa0f7..00864b998d 100644 --- a/src/engine/sparqlExpressions/SparqlExpression.cpp +++ b/src/engine/sparqlExpressions/SparqlExpression.cpp @@ -180,4 +180,18 @@ 
bool SparqlExpression::isInsideAggregate() const { } return isInsideAggregate_; } + +// ________________________________________________________________ +bool SparqlExpression::isExistsExpression() const { return false; } + +// ________________________________________________________________ +void SparqlExpression::getExistsExpressions( + std::vector& result) { + if (isExistsExpression()) { + result.push_back(this); + } + for (auto& child : children()) { + child->getExistsExpressions(result); + } +} } // namespace sparqlExpression diff --git a/src/engine/sparqlExpressions/SparqlExpression.h b/src/engine/sparqlExpressions/SparqlExpression.h index 1378f10520..d5f7248daf 100644 --- a/src/engine/sparqlExpressions/SparqlExpression.h +++ b/src/engine/sparqlExpressions/SparqlExpression.h @@ -123,6 +123,16 @@ class SparqlExpression { // implementation returns `false`. virtual bool isStrExpression() const; + // Returns true iff this expression is an EXISTS(...) expression. Default + // implementation returns `false`. + virtual bool isExistsExpression() const; + + // Return non-null pointers to all `EXISTS` expressions in the subtree. + // The result is passed in as a reference to simplify the recursive + // implementation. + virtual void getExistsExpressions( + std::vector& result) final; + // __________________________________________________________________________ virtual ~SparqlExpression() = default; From e356ee1c831d00aeb74ac17094cffcec1f4b55d1 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Tue, 7 Jan 2025 12:57:41 +0100 Subject: [PATCH 06/30] Add some parsing and add some thoughts. 
Signed-off-by: Johannes Kalmbach --- .../sparqlParser/SparqlQleverVisitor.cpp | 18 +++++++++++++++--- src/parser/sparqlParser/SparqlQleverVisitor.h | 4 ++-- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.cpp b/src/parser/sparqlParser/SparqlQleverVisitor.cpp index f23530f820..99a943d350 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.cpp +++ b/src/parser/sparqlParser/SparqlQleverVisitor.cpp @@ -15,6 +15,7 @@ #include "absl/time/time.h" #include "engine/sparqlExpressions/CountStarExpression.h" +#include "engine/sparqlExpressions/ExistsExpression.h" #include "engine/sparqlExpressions/GroupConcatExpression.h" #include "engine/sparqlExpressions/LiteralExpression.h" #include "engine/sparqlExpressions/NaryExpression.h" @@ -1366,6 +1367,7 @@ SparqlFilter Visitor::visit(Parser::FilterRContext* ctx) { // expression contains unbound variables, because the variables of the FILTER // might be bound after the filter appears in the query (which is perfectly // legal). + auto pimpl = visitExpressionPimpl(ctx->constraint()); return SparqlFilter{visitExpressionPimpl(ctx->constraint())}; } @@ -2229,6 +2231,10 @@ ExpressionPtr Visitor::visit([[maybe_unused]] Parser::BuiltInCallContext* ctx) { return visit(ctx->substringExpression()); } else if (ctx->strReplaceExpression()) { return visit(ctx->strReplaceExpression()); + } else if (ctx->existsFunc()) { + return visit(ctx->existsFunc()); + } else if (ctx->notExistsFunc()) { + return visit(ctx->notExistsFunc()); } // Get the function name and the arguments. 
Note that we do not have to check // the number of arguments like for `processIriFunctionCall`, since the number @@ -2418,12 +2424,18 @@ SparqlExpression::Ptr Visitor::visit(Parser::StrReplaceExpressionContext* ctx) { } // ____________________________________________________________________________________ -void Visitor::visit(const Parser::ExistsFuncContext* ctx) { - reportNotSupported(ctx, "The EXISTS function is"); +ExpressionPtr Visitor::visit(Parser::ExistsFuncContext* ctx) { + auto queryBackup = std::exchange(parsedQuery_, ParsedQuery{}); + auto group = visit(ctx->groupGraphPattern()); + ParsedQuery query = std::exchange(parsedQuery_, std::move(queryBackup)); + query.selectClause().setAsterisk(); + query._rootGraphPattern = std::move(group); + return std::make_unique(std::move(query)); } // ____________________________________________________________________________________ -void Visitor::visit(const Parser::NotExistsFuncContext* ctx) { +ExpressionPtr Visitor::visit(Parser::NotExistsFuncContext* ctx) { + // TODO Implement this without duplicating the code for EXISTS. 
reportNotSupported(ctx, "The NOT EXISTS function is"); } diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.h b/src/parser/sparqlParser/SparqlQleverVisitor.h index fb1cb9c05c..5fb4c95a08 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.h +++ b/src/parser/sparqlParser/SparqlQleverVisitor.h @@ -444,9 +444,9 @@ class SparqlQleverVisitor { ExpressionPtr visit(Parser::StrReplaceExpressionContext* ctx); - [[noreturn]] static void visit(const Parser::ExistsFuncContext* ctx); + ExpressionPtr visit(Parser::ExistsFuncContext* ctx); - [[noreturn]] static void visit(const Parser::NotExistsFuncContext* ctx); + ExpressionPtr visit(Parser::NotExistsFuncContext* ctx); ExpressionPtr visit(Parser::AggregateContext* ctx); From fc2017479677c4bc9a04a4f5d5259c3fe1d6d0de Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Tue, 7 Jan 2025 15:49:29 +0100 Subject: [PATCH 07/30] Also implement NOT EXISTS Signed-off-by: Johannes Kalmbach --- src/engine/ExistsScan.cpp | 6 +++-- src/engine/Filter.cpp | 18 +++++++++++++ src/engine/QueryExecutionTree.h | 3 +++ .../sparqlExpressions/ExistsExpression.h | 24 +++++++++++------- .../sparqlParser/SparqlQleverVisitor.cpp | 25 +++++++++++++++---- src/parser/sparqlParser/SparqlQleverVisitor.h | 5 ++++ 6 files changed, 65 insertions(+), 16 deletions(-) diff --git a/src/engine/ExistsScan.cpp b/src/engine/ExistsScan.cpp index f42da68f3d..1604e353ad 100644 --- a/src/engine/ExistsScan.cpp +++ b/src/engine/ExistsScan.cpp @@ -95,8 +95,10 @@ ProtoResult ExistsScan::computeResult([[maybe_unused]] bool requestLaziness) { }; // TODO Handle UNDEF values correctly (and efficiently) - auto findUndefDispatch = [](const auto& row, It begin, auto end, - bool& outOfOrder) { + auto findUndefDispatch = []([[maybe_unused]] const auto& row, + [[maybe_unused]] It begin, + [[maybe_unused]] auto end, + [[maybe_unused]] bool& outOfOrder) { return std::array{}; }; diff --git a/src/engine/Filter.cpp b/src/engine/Filter.cpp index 9ecdd85f7a..519c0d9da5 100644 --- 
a/src/engine/Filter.cpp +++ b/src/engine/Filter.cpp @@ -10,10 +10,13 @@ #include "backports/algorithm.h" #include "engine/CallFixedSize.h" +#include "engine/ExistsScan.h" #include "engine/QueryExecutionTree.h" +#include "engine/QueryPlanner.h" #include "engine/sparqlExpressions/SparqlExpression.h" #include "engine/sparqlExpressions/SparqlExpressionGenerators.h" #include "engine/sparqlExpressions/SparqlExpressionValueGetters.h" +#include "sparqlExpressions/ExistsExpression.h" using std::endl; using std::string; @@ -28,6 +31,21 @@ Filter::Filter(QueryExecutionContext* qec, : Operation(qec), _subtree(std::move(subtree)), _expression{std::move(expression)} { + std::vector existsExpressions; + _expression.getPimpl()->getExistsExpressions(existsExpressions); + for (auto* expr : existsExpressions) { + const auto& exists = + dynamic_cast(*expr); + QueryPlanner qp{getExecutionContext(), cancellationHandle_}; + // TODO This can be done by the expression itself, then it is + // automatically duplicated. + auto pq = exists.argument(); + auto tree = + std::make_shared(qp.createExecutionTree(pq)); + _subtree = ad_utility::makeExecutionTree( + getExecutionContext(), std::move(_subtree), std::move(tree), + exists.variable()); + } setPrefilterExpressionForChildren(); } diff --git a/src/engine/QueryExecutionTree.h b/src/engine/QueryExecutionTree.h index 0eac785f16..3c074d6c47 100644 --- a/src/engine/QueryExecutionTree.h +++ b/src/engine/QueryExecutionTree.h @@ -25,7 +25,10 @@ class QueryExecutionTree { std::shared_ptr operation) : QueryExecutionTree(qec) { rootOperation_ = std::move(operation); + // TODO This currently fails for EXISTS but it is also unneeded. 
+ /* readFromCache(); + */ } std::string getCacheKey() const; diff --git a/src/engine/sparqlExpressions/ExistsExpression.h b/src/engine/sparqlExpressions/ExistsExpression.h index 5ec68acd61..d5eff23ba8 100644 --- a/src/engine/sparqlExpressions/ExistsExpression.h +++ b/src/engine/sparqlExpressions/ExistsExpression.h @@ -12,27 +12,33 @@ namespace sparqlExpression { class ExistsExpression : public SparqlExpression { private: - std::variant argument_; + ParsedQuery argument_; + static inline std::atomic indexCounter_ = 0; + size_t index_ = ++indexCounter_; + Variable variable_{absl::StrCat("?ql_internal_exists_", index_)}; public: - auto& argument() { return argument_; } + const auto& argument() const { return argument_; } + const auto& variable() const { return variable_; } ExistsExpression(ParsedQuery query) : argument_{std::move(query)} {} ExpressionResult evaluate(EvaluationContext* context) const override { - AD_CONTRACT_CHECK(std::holds_alternative(argument_)); - return std::get(argument_); + AD_CONTRACT_CHECK(context->_variableToColumnMap.contains(variable_)); + return variable_; } - //_________________________________________________________________________ + //____________________________________________________________________________ [[nodiscard]] string getCacheKey( const VariableToColumnMap& varColMap) const override { // TODO get a proper cache key here - AD_CONTRACT_CHECK(std::holds_alternative(argument_)); - return absl::StrCat( - "EXISTS WITH COLUMN ", - varColMap.at(std::get(argument_)).columnIndex_); + AD_CONTRACT_CHECK(varColMap.contains(variable_)); + return absl::StrCat("EXISTS WITH COL ", + varColMap.at(variable_).columnIndex_); } + // ____________________________________________________________________________ + bool isExistsExpression() const override { return true; } + private: std::span childrenImpl() override { return {}; } }; diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.cpp b/src/parser/sparqlParser/SparqlQleverVisitor.cpp index 
99a943d350..903544c96a 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.cpp +++ b/src/parser/sparqlParser/SparqlQleverVisitor.cpp @@ -26,6 +26,7 @@ #include "engine/sparqlExpressions/SampleExpression.h" #include "engine/sparqlExpressions/StdevExpression.h" #include "engine/sparqlExpressions/UuidExpressions.h" +#include "generated/SparqlAutomaticParser.h" #include "global/Constants.h" #include "global/RuntimeParameters.h" #include "parser/GraphPatternOperation.h" @@ -2424,19 +2425,33 @@ SparqlExpression::Ptr Visitor::visit(Parser::StrReplaceExpressionContext* ctx) { } // ____________________________________________________________________________________ -ExpressionPtr Visitor::visit(Parser::ExistsFuncContext* ctx) { +ExpressionPtr Visitor::visitExists(Parser::GroupGraphPatternContext* pattern, + bool negate) { auto queryBackup = std::exchange(parsedQuery_, ParsedQuery{}); - auto group = visit(ctx->groupGraphPattern()); + auto visibleVariablesSoFar = std::move(visibleVariables_); + visibleVariables_.clear(); + auto group = visit(pattern); ParsedQuery query = std::exchange(parsedQuery_, std::move(queryBackup)); query.selectClause().setAsterisk(); query._rootGraphPattern = std::move(group); - return std::make_unique(std::move(query)); + visibleVariables_ = std::move(visibleVariablesSoFar); + auto exists = + std::make_unique(std::move(query)); + if (negate) { + return sparqlExpression::makeUnaryNegateExpression(std::move(exists)); + } else { + return exists; + } +} + +// ____________________________________________________________________________________ +ExpressionPtr Visitor::visit(Parser::ExistsFuncContext* ctx) { + return visitExists(ctx->groupGraphPattern(), false); } // ____________________________________________________________________________________ ExpressionPtr Visitor::visit(Parser::NotExistsFuncContext* ctx) { - // TODO Implement this without duplicating the code for EXISTS. 
- reportNotSupported(ctx, "The NOT EXISTS function is"); + return visitExists(ctx->groupGraphPattern(), true); } // ____________________________________________________________________________________ diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.h b/src/parser/sparqlParser/SparqlQleverVisitor.h index 5fb4c95a08..3e7b63c3ad 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.h +++ b/src/parser/sparqlParser/SparqlQleverVisitor.h @@ -444,6 +444,11 @@ class SparqlQleverVisitor { ExpressionPtr visit(Parser::StrReplaceExpressionContext* ctx); + // The common implementation of the parsing of `EXISTS` and `NOT EXISTS`. + // The second argument is `true` for `NOT EXISTS`. + ExpressionPtr visitExists(Parser::GroupGraphPatternContext* pattern, + bool negate); + ExpressionPtr visit(Parser::ExistsFuncContext* ctx); ExpressionPtr visit(Parser::NotExistsFuncContext* ctx); From dde296b052dee3c267acdeaec514a3e3b47e5cb9 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Tue, 7 Jan 2025 16:27:14 +0100 Subject: [PATCH 08/30] Fix a small warning, to feed this to the tool. 
Signed-off-by: Johannes Kalmbach --- src/engine/ExistsScan.cpp | 4 ++-- src/engine/sparqlExpressions/ExistsExpression.h | 13 +++++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/engine/ExistsScan.cpp b/src/engine/ExistsScan.cpp index 1604e353ad..651e8e61cb 100644 --- a/src/engine/ExistsScan.cpp +++ b/src/engine/ExistsScan.cpp @@ -14,8 +14,8 @@ ExistsScan::ExistsScan(QueryExecutionContext* qec, : Operation{qec}, left_{std::move(left)}, right_{std::move(right)}, - existsVariable_{std::move(existsVariable)}, - joinColumns_{QueryExecutionTree::getJoinColumns(*left_, *right_)} {} + joinColumns_{QueryExecutionTree::getJoinColumns(*left_, *right_)}, + existsVariable_{std::move(existsVariable)} {} // _____________________________________________________________________________ string ExistsScan::getCacheKeyImpl() const { diff --git a/src/engine/sparqlExpressions/ExistsExpression.h b/src/engine/sparqlExpressions/ExistsExpression.h index d5eff23ba8..343c195e82 100644 --- a/src/engine/sparqlExpressions/ExistsExpression.h +++ b/src/engine/sparqlExpressions/ExistsExpression.h @@ -30,10 +30,15 @@ class ExistsExpression : public SparqlExpression { //____________________________________________________________________________ [[nodiscard]] string getCacheKey( const VariableToColumnMap& varColMap) const override { - // TODO get a proper cache key here - AD_CONTRACT_CHECK(varColMap.contains(variable_)); - return absl::StrCat("EXISTS WITH COL ", - varColMap.at(variable_).columnIndex_); + if (varColMap.contains(variable_)) { + return absl::StrCat("EXISTS WITH COL ", + varColMap.at(variable_).columnIndex_); + } else { + // This means that the necessary `ExistsScan` hasn't been set up yet. + // It is not possible to cache such incomplete operations, so we return + // a random cache key. 
+ return std::to_string(ad_utility::FastRandomIntGenerator{}()); + } } // ____________________________________________________________________________ From 0d1c788e11f3a2d2b6bb2dfea6bbbc6fba7f1bc3 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 8 Jan 2025 09:07:04 +0100 Subject: [PATCH 09/30] Some cleanups and fixes. Signed-off-by: Johannes Kalmbach --- src/engine/Bind.cpp | 10 +++ src/engine/Bind.h | 6 +- src/engine/ExistsScan.cpp | 75 +++++++++++++++---- src/engine/ExistsScan.h | 5 ++ src/engine/Filter.cpp | 18 +---- src/engine/GroupBy.cpp | 7 ++ src/engine/MultiColumnJoin.cpp | 14 ++-- src/engine/QueryExecutionTree.h | 3 - .../sparqlExpressions/SparqlExpression.cpp | 2 +- .../sparqlExpressions/SparqlExpression.h | 2 +- src/util/JoinAlgorithms/FindUndefRanges.h | 33 ++++++++ 11 files changed, 128 insertions(+), 47 deletions(-) diff --git a/src/engine/Bind.cpp b/src/engine/Bind.cpp index 95de8a4dfe..230ca1cb68 100644 --- a/src/engine/Bind.cpp +++ b/src/engine/Bind.cpp @@ -5,12 +5,22 @@ #include "Bind.h" #include "engine/CallFixedSize.h" +#include "engine/ExistsScan.h" #include "engine/QueryExecutionTree.h" #include "engine/sparqlExpressions/SparqlExpression.h" #include "engine/sparqlExpressions/SparqlExpressionGenerators.h" #include "util/ChunkedForLoop.h" #include "util/Exception.h" +// _____________________________________________________________________________ +Bind::Bind(QueryExecutionContext* qec, + std::shared_ptr subtree, parsedQuery::Bind b) + : Operation(qec), _subtree(std::move(subtree)), _bind(std::move(b)) { + _subtree = ExistsScan::addExistsScansToSubtree( + _bind._expression, std::move(subtree), getExecutionContext(), + cancellationHandle_); +} + // BIND adds exactly one new column size_t Bind::getResultWidth() const { return _subtree->getResultWidth() + 1; } diff --git a/src/engine/Bind.h b/src/engine/Bind.h index 34c515fb54..3336e0ddbc 100644 --- a/src/engine/Bind.h +++ b/src/engine/Bind.h @@ -8,14 +8,14 @@ #include 
"engine/sparqlExpressions/SparqlExpressionPimpl.h" #include "parser/ParsedQuery.h" -/// BIND operation, currently only supports a very limited subset of expressions +// BIND operation. class Bind : public Operation { public: static constexpr size_t CHUNK_SIZE = 10'000; + // ____________________________________________________________________________ Bind(QueryExecutionContext* qec, std::shared_ptr subtree, - parsedQuery::Bind b) - : Operation(qec), _subtree(std::move(subtree)), _bind(std::move(b)) {} + parsedQuery::Bind b); private: std::shared_ptr _subtree; diff --git a/src/engine/ExistsScan.cpp b/src/engine/ExistsScan.cpp index 651e8e61cb..26fde12984 100644 --- a/src/engine/ExistsScan.cpp +++ b/src/engine/ExistsScan.cpp @@ -4,6 +4,9 @@ #include "engine/ExistsScan.h" +#include "engine/QueryPlanner.h" +#include "engine/sparqlExpressions/ExistsExpression.h" +#include "engine/sparqlExpressions/SparqlExpression.h" #include "util/JoinAlgorithms/JoinAlgorithms.h" // _____________________________________________________________________________ @@ -15,7 +18,10 @@ ExistsScan::ExistsScan(QueryExecutionContext* qec, left_{std::move(left)}, right_{std::move(right)}, joinColumns_{QueryExecutionTree::getJoinColumns(*left_, *right_)}, - existsVariable_{std::move(existsVariable)} {} + existsVariable_{std::move(existsVariable)} { + std::tie(left_, right_) = QueryExecutionTree::createSortedTrees( + std::move(left_), std::move(right_), joinColumns_); +} // _____________________________________________________________________________ string ExistsScan::getCacheKeyImpl() const { @@ -85,28 +91,41 @@ ProtoResult ExistsScan::computeResult([[maybe_unused]] bool requestLaziness) { checkCancellation(); + // `isCheap` is true iff there are no UNDEF values in the join columns. In + // this case we can use a much cheaper algorithm. + // TODO There are many other cases where a cheaper implementation can + // be chosen, but we leave those for another PR, this is the most common case. 
+ namespace stdr = ql::ranges; + size_t numJoinColumns = joinColumnsLeft.size(); + AD_CORRECTNESS_CHECK(numJoinColumns == joinColumnsRight.size()); + bool isCheap = stdr::none_of( + ad_utility::integerRange(numJoinColumns), [&](const auto& col) { + return (stdr::any_of(joinColumnsRight.getColumn(col), + &Id::isUndefined)) || + (stdr::any_of(joinColumnsLeft.getColumn(col), &Id::isUndefined)); + }); + auto noopRowAdder = [](auto&&...) {}; - // TODO Memory limit. - std::vector notExistsIndices; + std::vector> notExistsIndices{ + allocator()}; auto actionForNotExisting = [¬ExistsIndices, begin = joinColumnsLeft.begin()](const auto& itLeft) { notExistsIndices.push_back(itLeft - begin); }; - // TODO Handle UNDEF values correctly (and efficiently) - auto findUndefDispatch = []([[maybe_unused]] const auto& row, - [[maybe_unused]] It begin, - [[maybe_unused]] auto end, - [[maybe_unused]] bool& outOfOrder) { - return std::array{}; - }; - auto checkCancellationLambda = [this] { checkCancellation(); }; - [[maybe_unused]] auto numOutOfOrder = ad_utility::zipperJoinWithUndef( - joinColumnsLeft, joinColumnsRight, ql::ranges::lexicographical_compare, - noopRowAdder, findUndefDispatch, findUndefDispatch, actionForNotExisting, - checkCancellationLambda); + auto runZipperJoin = [&](auto findUndef) { + [[maybe_unused]] auto numOutOfOrder = ad_utility::zipperJoinWithUndef( + joinColumnsLeft, joinColumnsRight, ql::ranges::lexicographical_compare, + noopRowAdder, findUndef, findUndef, actionForNotExisting, + checkCancellationLambda); + }; + if (isCheap) { + runZipperJoin(ad_utility::noop); + } else { + runZipperJoin(ad_utility::findSmallerUndefRanges); + } // Set up the result; IdTable result = left.clone(); @@ -118,3 +137,29 @@ ProtoResult ExistsScan::computeResult([[maybe_unused]] bool requestLaziness) { } return {std::move(result), resultSortedOn(), leftRes->getCopyOfLocalVocab()}; } + +// _____________________________________________________________________________ +std::shared_ptr 
ExistsScan::addExistsScansToSubtree( + const sparqlExpression::SparqlExpressionPimpl& expression, + std::shared_ptr subtree, QueryExecutionContext* qec, + const ad_utility::SharedCancellationHandle& cancellationHandle) { + std::vector existsExpressions; + expression.getPimpl()->getExistsExpressions(existsExpressions); + for (auto* expr : existsExpressions) { + const auto& exists = + dynamic_cast(*expr); + // Currently some FILTERs are applied multiple times especially when there + // are OPTIONAL joins in the query. In these cases we have to make sure that + // the `ExistsScan` is added only once. + if (subtree->isVariableCovered(exists.variable())) { + continue; + } + QueryPlanner qp{qec, cancellationHandle}; + auto pq = exists.argument(); + auto tree = + std::make_shared(qp.createExecutionTree(pq)); + subtree = ad_utility::makeExecutionTree( + qec, std::move(subtree), std::move(tree), exists.variable()); + } + return subtree; +} diff --git a/src/engine/ExistsScan.h b/src/engine/ExistsScan.h index b08e06c542..dbd947d302 100644 --- a/src/engine/ExistsScan.h +++ b/src/engine/ExistsScan.h @@ -24,6 +24,11 @@ class ExistsScan : public Operation { std::shared_ptr right, Variable existsVariable); + static std::shared_ptr addExistsScansToSubtree( + const sparqlExpression::SparqlExpressionPimpl& expression, + std::shared_ptr subtree, QueryExecutionContext* qec, + const ad_utility::SharedCancellationHandle& cancellationHandle); + protected: string getCacheKeyImpl() const override; diff --git a/src/engine/Filter.cpp b/src/engine/Filter.cpp index 519c0d9da5..ff8edc1fc1 100644 --- a/src/engine/Filter.cpp +++ b/src/engine/Filter.cpp @@ -31,21 +31,9 @@ Filter::Filter(QueryExecutionContext* qec, : Operation(qec), _subtree(std::move(subtree)), _expression{std::move(expression)} { - std::vector existsExpressions; - _expression.getPimpl()->getExistsExpressions(existsExpressions); - for (auto* expr : existsExpressions) { - const auto& exists = - dynamic_cast(*expr); - QueryPlanner 
qp{getExecutionContext(), cancellationHandle_}; - // TODO This can be done by the expression itself, then it is - // automatically duplicated. - auto pq = exists.argument(); - auto tree = - std::make_shared(qp.createExecutionTree(pq)); - _subtree = ad_utility::makeExecutionTree( - getExecutionContext(), std::move(_subtree), std::move(tree), - exists.variable()); - } + _subtree = ExistsScan::addExistsScansToSubtree( + _expression, std::move(_subtree), getExecutionContext(), + cancellationHandle_); setPrefilterExpressionForChildren(); } diff --git a/src/engine/GroupBy.cpp b/src/engine/GroupBy.cpp index a6ff49bbe1..0fe65fd00e 100644 --- a/src/engine/GroupBy.cpp +++ b/src/engine/GroupBy.cpp @@ -9,6 +9,7 @@ #include #include "engine/CallFixedSize.h" +#include "engine/ExistsScan.h" #include "engine/IndexScan.h" #include "engine/Join.h" #include "engine/LazyGroupBy.h" @@ -52,6 +53,12 @@ GroupBy::GroupBy(QueryExecutionContext* qec, vector groupByVariables, ql::ranges::sort(_groupByVariables, std::less<>{}, &Variable::name); auto sortColumns = computeSortColumns(subtree.get()); + + for (const auto& alias : _aliases) { + _subtree = ExistsScan::addExistsScansToSubtree( + alias._expression, std::move(subtree), getExecutionContext(), + cancellationHandle_); + } _subtree = QueryExecutionTree::createSortedTree(std::move(subtree), sortColumns); } diff --git a/src/engine/MultiColumnJoin.cpp b/src/engine/MultiColumnJoin.cpp index bb3e4e5995..b605616ecb 100644 --- a/src/engine/MultiColumnJoin.cpp +++ b/src/engine/MultiColumnJoin.cpp @@ -237,17 +237,11 @@ void MultiColumnJoin::computeMultiColumnJoin( rowAdder.addRow(itLeft - beginLeft, itRight - beginRight); }; - auto findUndef = [](const auto& row, auto begin, auto end, - bool& resultMightBeUnsorted) { - return ad_utility::findSmallerUndefRanges(row, begin, end, - resultMightBeUnsorted); - }; - // `isCheap` is true iff there are no UNDEF values in the join columns. In // this case we can use a much cheaper algorithm. 
// TODO There are many other cases where a cheaper implementation can // be chosen, but we leave those for another PR, this is the most common case. - namespace stdr = std::ranges; + namespace stdr = ql::ranges; bool isCheap = stdr::none_of(joinColumns, [&](const auto& jcs) { auto [leftCol, rightCol] = jcs; return (stdr::any_of(right.getColumn(rightCol), &Id::isUndefined)) || @@ -265,8 +259,10 @@ void MultiColumnJoin::computeMultiColumnJoin( } else { return ad_utility::zipperJoinWithUndef( leftJoinColumns, rightJoinColumns, - ql::ranges::lexicographical_compare, addRow, findUndef, findUndef, - ad_utility::noop, checkCancellationLambda); + ql::ranges::lexicographical_compare, addRow, + ad_utility::findSmallerUndefRanges, + ad_utility::findSmallerUndefRanges, ad_utility::noop, + checkCancellationLambda); } }(); *result = std::move(rowAdder).resultTable(); diff --git a/src/engine/QueryExecutionTree.h b/src/engine/QueryExecutionTree.h index 3c074d6c47..0eac785f16 100644 --- a/src/engine/QueryExecutionTree.h +++ b/src/engine/QueryExecutionTree.h @@ -25,10 +25,7 @@ class QueryExecutionTree { std::shared_ptr operation) : QueryExecutionTree(qec) { rootOperation_ = std::move(operation); - // TODO This currently fails for EXISTS but it is also unneeded. 
- /* readFromCache(); - */ } std::string getCacheKey() const; diff --git a/src/engine/sparqlExpressions/SparqlExpression.cpp b/src/engine/sparqlExpressions/SparqlExpression.cpp index 00864b998d..099933020f 100644 --- a/src/engine/sparqlExpressions/SparqlExpression.cpp +++ b/src/engine/sparqlExpressions/SparqlExpression.cpp @@ -186,7 +186,7 @@ bool SparqlExpression::isExistsExpression() const { return false; } // ________________________________________________________________ void SparqlExpression::getExistsExpressions( - std::vector& result) { + std::vector& result) const { if (isExistsExpression()) { result.push_back(this); } diff --git a/src/engine/sparqlExpressions/SparqlExpression.h b/src/engine/sparqlExpressions/SparqlExpression.h index d5f7248daf..7f5c551127 100644 --- a/src/engine/sparqlExpressions/SparqlExpression.h +++ b/src/engine/sparqlExpressions/SparqlExpression.h @@ -131,7 +131,7 @@ class SparqlExpression { // The result is passed in as a reference to simplify the recursive // implementation. virtual void getExistsExpressions( - std::vector& result) final; + std::vector& result) const final; // __________________________________________________________________________ virtual ~SparqlExpression() = default; diff --git a/src/util/JoinAlgorithms/FindUndefRanges.h b/src/util/JoinAlgorithms/FindUndefRanges.h index 7b3f3296cb..cbdbc1b4fd 100644 --- a/src/util/JoinAlgorithms/FindUndefRanges.h +++ b/src/util/JoinAlgorithms/FindUndefRanges.h @@ -165,6 +165,38 @@ auto findSmallerUndefRangesArbitrary(const auto& row, It begin, It end, // have additional information about the input (most notably which of the join // columns contain no UNDEF at all) and therefore a more specialized routine // should be chosen. 
+struct FindSmallerUndefRanges { + template + auto operator()(const auto& row, It begin, It end, + bool& resultMightBeUnsorted) -> cppcoro::generator { + size_t numLastUndefined = 0; + assert(row.size() > 0); + auto it = ql::ranges::rbegin(row); + auto rend = ql::ranges::rend(row); + for (; it < rend; ++it) { + if (*it != Id::makeUndefined()) { + break; + } + ++numLastUndefined; + } + + for (; it < rend; ++it) { + if (*it == Id::makeUndefined()) { + return findSmallerUndefRangesArbitrary(row, begin, end, + resultMightBeUnsorted); + } + } + if (numLastUndefined == 0) { + return findSmallerUndefRangesForRowsWithoutUndef(row, begin, end, + resultMightBeUnsorted); + } else { + return findSmallerUndefRangesForRowsWithUndefInLastColumns( + row, numLastUndefined, begin, end, resultMightBeUnsorted); + } + } +}; +constexpr FindSmallerUndefRanges findSmallerUndefRanges; +/* template auto findSmallerUndefRanges(const auto& row, It begin, It end, bool& resultMightBeUnsorted) @@ -194,4 +226,5 @@ auto findSmallerUndefRanges(const auto& row, It begin, It end, row, numLastUndefined, begin, end, resultMightBeUnsorted); } } +*/ } // namespace ad_utility From 7ff49c97404cd9604bc16fe0e775a61b8b0ef6b3 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 8 Jan 2025 09:10:10 +0100 Subject: [PATCH 10/30] Fix compilation. 
Signed-off-by: Johannes Kalmbach --- src/util/JoinAlgorithms/FindUndefRanges.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util/JoinAlgorithms/FindUndefRanges.h b/src/util/JoinAlgorithms/FindUndefRanges.h index cbdbc1b4fd..bf15685f37 100644 --- a/src/util/JoinAlgorithms/FindUndefRanges.h +++ b/src/util/JoinAlgorithms/FindUndefRanges.h @@ -168,7 +168,7 @@ auto findSmallerUndefRangesArbitrary(const auto& row, It begin, It end, struct FindSmallerUndefRanges { template auto operator()(const auto& row, It begin, It end, - bool& resultMightBeUnsorted) -> cppcoro::generator { + bool& resultMightBeUnsorted) const -> cppcoro::generator { size_t numLastUndefined = 0; assert(row.size() > 0); auto it = ql::ranges::rbegin(row); From 7ec8947c759514efdbd0a533a5c00a545d5ecc4c Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 8 Jan 2025 09:24:02 +0100 Subject: [PATCH 11/30] Fix the many many segfaults. Signed-off-by: Johannes Kalmbach --- src/engine/Bind.cpp | 2 +- src/engine/GroupBy.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/engine/Bind.cpp b/src/engine/Bind.cpp index 230ca1cb68..bdccf14488 100644 --- a/src/engine/Bind.cpp +++ b/src/engine/Bind.cpp @@ -17,7 +17,7 @@ Bind::Bind(QueryExecutionContext* qec, std::shared_ptr subtree, parsedQuery::Bind b) : Operation(qec), _subtree(std::move(subtree)), _bind(std::move(b)) { _subtree = ExistsScan::addExistsScansToSubtree( - _bind._expression, std::move(subtree), getExecutionContext(), + _bind._expression, std::move(_subtree), getExecutionContext(), cancellationHandle_); } diff --git a/src/engine/GroupBy.cpp b/src/engine/GroupBy.cpp index 0fe65fd00e..cfa8621709 100644 --- a/src/engine/GroupBy.cpp +++ b/src/engine/GroupBy.cpp @@ -55,7 +55,7 @@ GroupBy::GroupBy(QueryExecutionContext* qec, vector groupByVariables, auto sortColumns = computeSortColumns(subtree.get()); for (const auto& alias : _aliases) { - _subtree = ExistsScan::addExistsScansToSubtree( + subtree = 
ExistsScan::addExistsScansToSubtree( alias._expression, std::move(subtree), getExecutionContext(), cancellationHandle_); } From c03f3e59f2097c3f14bb9cb214eb6ddfadec2992 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 8 Jan 2025 09:34:56 +0100 Subject: [PATCH 12/30] Fix another bug. Signed-off-by: Johannes Kalmbach --- src/engine/ExistsScan.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/engine/ExistsScan.cpp b/src/engine/ExistsScan.cpp index 26fde12984..c416d1dc41 100644 --- a/src/engine/ExistsScan.cpp +++ b/src/engine/ExistsScan.cpp @@ -96,8 +96,8 @@ ProtoResult ExistsScan::computeResult([[maybe_unused]] bool requestLaziness) { // TODO There are many other cases where a cheaper implementation can // be chosen, but we leave those for another PR, this is the most common case. namespace stdr = ql::ranges; - size_t numJoinColumns = joinColumnsLeft.size(); - AD_CORRECTNESS_CHECK(numJoinColumns == joinColumnsRight.size()); + size_t numJoinColumns = joinColumnsLeft.numColumns(); + AD_CORRECTNESS_CHECK(numJoinColumns == joinColumnsRight.numColumns()); bool isCheap = stdr::none_of( ad_utility::integerRange(numJoinColumns), [&](const auto& col) { return (stdr::any_of(joinColumnsRight.getColumn(col), From 2da52abc6aea83dac6eb55dc536ef1f4e184fb1b Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 8 Jan 2025 10:55:21 +0100 Subject: [PATCH 13/30] Fix another bug. 
Signed-off-by: Johannes Kalmbach --- src/engine/Bind.cpp | 4 +- src/engine/CMakeLists.txt | 2 +- src/engine/{ExistsScan.cpp => ExistsJoin.cpp} | 26 ++++++------- src/engine/{ExistsScan.h => ExistsJoin.h} | 4 +- src/engine/Filter.cpp | 4 +- src/engine/GroupBy.cpp | 4 +- src/util/JoinAlgorithms/FindUndefRanges.h | 31 --------------- test/QueryPlannerTest.cpp | 13 ++++++- test/QueryPlannerTestHelpers.h | 7 ++++ test/SparqlAntlrParserTest.cpp | 39 +++++++++++++++++++ 10 files changed, 80 insertions(+), 54 deletions(-) rename src/engine/{ExistsScan.cpp => ExistsJoin.cpp} (90%) rename src/engine/{ExistsScan.h => ExistsJoin.h} (95%) diff --git a/src/engine/Bind.cpp b/src/engine/Bind.cpp index bdccf14488..276f04e9fc 100644 --- a/src/engine/Bind.cpp +++ b/src/engine/Bind.cpp @@ -5,7 +5,7 @@ #include "Bind.h" #include "engine/CallFixedSize.h" -#include "engine/ExistsScan.h" +#include "engine/ExistsJoin.h" #include "engine/QueryExecutionTree.h" #include "engine/sparqlExpressions/SparqlExpression.h" #include "engine/sparqlExpressions/SparqlExpressionGenerators.h" @@ -16,7 +16,7 @@ Bind::Bind(QueryExecutionContext* qec, std::shared_ptr subtree, parsedQuery::Bind b) : Operation(qec), _subtree(std::move(subtree)), _bind(std::move(b)) { - _subtree = ExistsScan::addExistsScansToSubtree( + _subtree = ExistsJoin::addExistsScansToSubtree( _bind._expression, std::move(_subtree), getExecutionContext(), cancellationHandle_); } diff --git a/src/engine/CMakeLists.txt b/src/engine/CMakeLists.txt index c724a8fb39..a3750a07e5 100644 --- a/src/engine/CMakeLists.txt +++ b/src/engine/CMakeLists.txt @@ -14,5 +14,5 @@ add_library(engine CartesianProductJoin.cpp TextIndexScanForWord.cpp TextIndexScanForEntity.cpp TextLimit.cpp LazyGroupBy.cpp GroupByHashMapOptimization.cpp SpatialJoin.cpp CountConnectedSubgraphs.cpp SpatialJoinAlgorithms.cpp PathSearch.cpp ExecuteUpdate.cpp - Describe.cpp ExistsScan.cpp) + Describe.cpp ExistsJoin.cpp) qlever_target_link_libraries(engine util index parser 
sparqlExpressions http SortPerformanceEstimator Boost::iostreams s2) diff --git a/src/engine/ExistsScan.cpp b/src/engine/ExistsJoin.cpp similarity index 90% rename from src/engine/ExistsScan.cpp rename to src/engine/ExistsJoin.cpp index c416d1dc41..d8d3f564d1 100644 --- a/src/engine/ExistsScan.cpp +++ b/src/engine/ExistsJoin.cpp @@ -2,7 +2,7 @@ // Chair of Algorithms and Data Structures. // Author: Johannes Kalmbach -#include "engine/ExistsScan.h" +#include "engine/ExistsJoin.h" #include "engine/QueryPlanner.h" #include "engine/sparqlExpressions/ExistsExpression.h" @@ -10,7 +10,7 @@ #include "util/JoinAlgorithms/JoinAlgorithms.h" // _____________________________________________________________________________ -ExistsScan::ExistsScan(QueryExecutionContext* qec, +ExistsJoin::ExistsJoin(QueryExecutionContext* qec, std::shared_ptr left, std::shared_ptr right, Variable existsVariable) @@ -24,16 +24,16 @@ ExistsScan::ExistsScan(QueryExecutionContext* qec, } // _____________________________________________________________________________ -string ExistsScan::getCacheKeyImpl() const { +string ExistsJoin::getCacheKeyImpl() const { return absl::StrCat("EXISTS SCAN left: ", left_->getCacheKey(), " right: ", right_->getCacheKey()); } // _____________________________________________________________________________ -string ExistsScan::getDescriptor() const { return "EXISTS scan"; } +string ExistsJoin::getDescriptor() const { return "EXISTS scan"; } // ____________________________________________________________________________ -VariableToColumnMap ExistsScan::computeVariableToColumnMap() const { +VariableToColumnMap ExistsJoin::computeVariableToColumnMap() const { auto res = left_->getVariableColumns(); AD_CONTRACT_CHECK( !res.contains(existsVariable_), @@ -43,18 +43,18 @@ VariableToColumnMap ExistsScan::computeVariableToColumnMap() const { } // ____________________________________________________________________________ -size_t ExistsScan::getResultWidth() const { +size_t 
ExistsJoin::getResultWidth() const { // We add one column to the input. return left_->getResultWidth() + 1; } // ____________________________________________________________________________ -vector ExistsScan::resultSortedOn() const { +vector ExistsJoin::resultSortedOn() const { return left_->resultSortedOn(); } // ____________________________________________________________________________ -float ExistsScan::getMultiplicity(size_t col) { +float ExistsJoin::getMultiplicity(size_t col) { if (col < getResultWidth() - 1) { return left_->getMultiplicity(col); } @@ -64,18 +64,18 @@ float ExistsScan::getMultiplicity(size_t col) { } // ____________________________________________________________________________ -uint64_t ExistsScan::getSizeEstimateBeforeLimit() { +uint64_t ExistsJoin::getSizeEstimateBeforeLimit() { return left_->getSizeEstimate(); } // ____________________________________________________________________________ -size_t ExistsScan::getCostEstimate() { +size_t ExistsJoin::getCostEstimate() { return left_->getCostEstimate() + right_->getCostEstimate() + left_->getSizeEstimate() + right_->getSizeEstimate(); } // ____________________________________________________________________________ -ProtoResult ExistsScan::computeResult([[maybe_unused]] bool requestLaziness) { +ProtoResult ExistsJoin::computeResult([[maybe_unused]] bool requestLaziness) { auto leftRes = left_->getResult(); auto rightRes = right_->getResult(); const auto& left = leftRes->idTable(); @@ -139,7 +139,7 @@ ProtoResult ExistsScan::computeResult([[maybe_unused]] bool requestLaziness) { } // _____________________________________________________________________________ -std::shared_ptr ExistsScan::addExistsScansToSubtree( +std::shared_ptr ExistsJoin::addExistsScansToSubtree( const sparqlExpression::SparqlExpressionPimpl& expression, std::shared_ptr subtree, QueryExecutionContext* qec, const ad_utility::SharedCancellationHandle& cancellationHandle) { @@ -158,7 +158,7 @@ std::shared_ptr 
ExistsScan::addExistsScansToSubtree( auto pq = exists.argument(); auto tree = std::make_shared(qp.createExecutionTree(pq)); - subtree = ad_utility::makeExecutionTree( + subtree = ad_utility::makeExecutionTree( qec, std::move(subtree), std::move(tree), exists.variable()); } return subtree; diff --git a/src/engine/ExistsScan.h b/src/engine/ExistsJoin.h similarity index 95% rename from src/engine/ExistsScan.h rename to src/engine/ExistsJoin.h index dbd947d302..9b9c7483ce 100644 --- a/src/engine/ExistsScan.h +++ b/src/engine/ExistsJoin.h @@ -7,7 +7,7 @@ #include "engine/Operation.h" #include "engine/QueryExecutionTree.h" -class ExistsScan : public Operation { +class ExistsJoin : public Operation { private: std::shared_ptr left_; std::shared_ptr right_; @@ -19,7 +19,7 @@ class ExistsScan : public Operation { std::vector> _matchedColumns; public: - ExistsScan(QueryExecutionContext* qec, + ExistsJoin(QueryExecutionContext* qec, std::shared_ptr left, std::shared_ptr right, Variable existsVariable); diff --git a/src/engine/Filter.cpp b/src/engine/Filter.cpp index ff8edc1fc1..9da7c12724 100644 --- a/src/engine/Filter.cpp +++ b/src/engine/Filter.cpp @@ -10,7 +10,7 @@ #include "backports/algorithm.h" #include "engine/CallFixedSize.h" -#include "engine/ExistsScan.h" +#include "engine/ExistsJoin.h" #include "engine/QueryExecutionTree.h" #include "engine/QueryPlanner.h" #include "engine/sparqlExpressions/SparqlExpression.h" @@ -31,7 +31,7 @@ Filter::Filter(QueryExecutionContext* qec, : Operation(qec), _subtree(std::move(subtree)), _expression{std::move(expression)} { - _subtree = ExistsScan::addExistsScansToSubtree( + _subtree = ExistsJoin::addExistsScansToSubtree( _expression, std::move(_subtree), getExecutionContext(), cancellationHandle_); setPrefilterExpressionForChildren(); diff --git a/src/engine/GroupBy.cpp b/src/engine/GroupBy.cpp index cfa8621709..3e8af1cb29 100644 --- a/src/engine/GroupBy.cpp +++ b/src/engine/GroupBy.cpp @@ -9,7 +9,7 @@ #include #include 
"engine/CallFixedSize.h" -#include "engine/ExistsScan.h" +#include "engine/ExistsJoin.h" #include "engine/IndexScan.h" #include "engine/Join.h" #include "engine/LazyGroupBy.h" @@ -55,7 +55,7 @@ GroupBy::GroupBy(QueryExecutionContext* qec, vector groupByVariables, auto sortColumns = computeSortColumns(subtree.get()); for (const auto& alias : _aliases) { - subtree = ExistsScan::addExistsScansToSubtree( + subtree = ExistsJoin::addExistsScansToSubtree( alias._expression, std::move(subtree), getExecutionContext(), cancellationHandle_); } diff --git a/src/util/JoinAlgorithms/FindUndefRanges.h b/src/util/JoinAlgorithms/FindUndefRanges.h index bf15685f37..6313bea887 100644 --- a/src/util/JoinAlgorithms/FindUndefRanges.h +++ b/src/util/JoinAlgorithms/FindUndefRanges.h @@ -196,35 +196,4 @@ struct FindSmallerUndefRanges { } }; constexpr FindSmallerUndefRanges findSmallerUndefRanges; -/* -template -auto findSmallerUndefRanges(const auto& row, It begin, It end, - bool& resultMightBeUnsorted) - -> cppcoro::generator { - size_t numLastUndefined = 0; - assert(row.size() > 0); - auto it = ql::ranges::rbegin(row); - auto rend = ql::ranges::rend(row); - for (; it < rend; ++it) { - if (*it != Id::makeUndefined()) { - break; - } - ++numLastUndefined; - } - - for (; it < rend; ++it) { - if (*it == Id::makeUndefined()) { - return findSmallerUndefRangesArbitrary(row, begin, end, - resultMightBeUnsorted); - } - } - if (numLastUndefined == 0) { - return findSmallerUndefRangesForRowsWithoutUndef(row, begin, end, - resultMightBeUnsorted); - } else { - return findSmallerUndefRangesForRowsWithUndefInLastColumns( - row, numLastUndefined, begin, end, resultMightBeUnsorted); - } -} -*/ } // namespace ad_utility diff --git a/test/QueryPlannerTest.cpp b/test/QueryPlannerTest.cpp index 90462f3cc3..c7d806319e 100644 --- a/test/QueryPlannerTest.cpp +++ b/test/QueryPlannerTest.cpp @@ -2906,10 +2906,21 @@ TEST(QueryPlanner, Describe) { } // 
____________________________________________________________________________ -TEST(QueryPlanner, GroupByRedundanteParensAndVariables) { +TEST(QueryPlanner, GroupByRedundantParensAndVariables) { auto matcher = h::GroupBy({Variable{"?x"}}, {}, h::IndexScanFromStrings("?x", "?y", "?z")); h::expect("SELECT ?x { ?x ?y ?z} GROUP BY (?x)", matcher); h::expect("SELECT ?x { ?x ?y ?z} GROUP BY ?x ?x", matcher); h::expect("SELECT ?x { ?x ?y ?z} GROUP BY ?x ?x (?x)", matcher); } + +// ____________________________________________________________________________ +TEST(QueryPlanner, Exists) { + auto xyz = h::IndexScanFromStrings("?x", "?y", "?z"); + auto a = h::IndexScanFromStrings("?x", "?y", "?z"); + h::expect( + "SELECT * { ?x ?y ?z FILTER EXISTS {?a ?b ?c}}", + h::Filter("EXISTS {?a ?b ?c}", + h::ExistsJoin(h::IndexScanFromStrings("?x", "?y", "?z"), + h::IndexScanFromStrings("?a", "?b", "?c")))); +} diff --git a/test/QueryPlannerTestHelpers.h b/test/QueryPlannerTestHelpers.h index c300bf0d5f..f53f30c5bb 100644 --- a/test/QueryPlannerTestHelpers.h +++ b/test/QueryPlannerTestHelpers.h @@ -15,6 +15,7 @@ #include "engine/CartesianProductJoin.h" #include "engine/CountAvailablePredicates.h" #include "engine/Describe.h" +#include "engine/ExistsJoin.h" #include "engine/Filter.h" #include "engine/GroupBy.h" #include "engine/IndexScan.h" @@ -405,6 +406,12 @@ inline QetMatcher Describe( AD_PROPERTY(::Describe, getDescribe, describeMatcher))); } +// Match an `ExistsJoin` +inline QetMatcher ExistsJoin(const QetMatcher& leftChild, + const QetMatcher& rightChild) { + return RootOperation<::ExistsJoin>(AllOf(children(leftChild, rightChild))); +} + // inline QetMatcher QetWithWarnings( const std::vector& warningSubstrings, diff --git a/test/SparqlAntlrParserTest.cpp b/test/SparqlAntlrParserTest.cpp index 0803f96f03..f5a65169b2 100644 --- a/test/SparqlAntlrParserTest.cpp +++ b/test/SparqlAntlrParserTest.cpp @@ -4,6 +4,7 @@ // Julian Mundhahs // Hannah Bast +#include #include #include @@ -14,6 
+15,7 @@ #include "./SparqlExpressionTestHelpers.h" #include "./util/GTestHelpers.h" #include "./util/TripleComponentTestHelpers.h" +#include "QueryPlannerTestHelpers.h" #include "SparqlAntlrParserTestHelpers.h" #include "engine/sparqlExpressions/CountStarExpression.h" #include "engine/sparqlExpressions/GroupConcatExpression.h" @@ -1860,6 +1862,43 @@ TEST(SparqlParser, binaryStringExpressions) { expectBuiltInCall("STRBEFORE(?x, ?y)", makeMatcher(&makeStrBeforeExpression)); } +// Matchers for EXISTS and NOT EXISTS functions. +namespace existsTestHelpers { +using namespace sparqlExpression; +using namespace ::testing; + +// Match an EXISTS function +auto existsMatcher(Matcher pattern) { + return Pointee(WhenDynamicCastTo( + AD_PROPERTY(ExistsExpression, argument, pattern))); +} +// Match a NOT EXISTS function +auto notExistsMatcher(Matcher pattern) { + return builtInCallTestHelpers::matchNaryWithChildrenMatchers( + &makeUnaryNegateExpression, existsMatcher(pattern)); +} +} // namespace existsTestHelpers + +// _____________________________________________________________________________ +TEST(SparqlParser, Exists) { + using namespace existsTestHelpers; + auto expectBuiltInCall = ExpectCompleteParse<&Parser::builtInCall>{}; + // A matcher that matches the query `SELECT * { ?x ?foo}`, where the + // FROM and FROM NAMED clauses can still be specified via arguments. 
+ using Graphs = ScanSpecificationAsTripleComponent::Graphs; + auto selectABarFooMatcher = [](Graphs defaultGraphs = std::nullopt, + Graphs namedGraphs = std::nullopt) { + return testing::AllOf(m::SelectQuery( + m::AsteriskSelect(), + m::GraphPattern(m::Triples({{Var{"?a"}, "", Var{"?foo"}}})), + defaultGraphs, namedGraphs)); + }; + expectBuiltInCall("EXISTS {?a ?foo}", + existsMatcher(selectABarFooMatcher())); + expectBuiltInCall("NOT EXISTS {?a ?foo}", + notExistsMatcher(selectABarFooMatcher())); +} + namespace aggregateTestHelpers { using namespace sparqlExpression; From cbbc771c64251f3ec69b342bbcda02fc691a5c74 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 8 Jan 2025 11:00:59 +0100 Subject: [PATCH 14/30] Fix another bug. Signed-off-by: Johannes Kalmbach --- test/QueryPlannerTest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/QueryPlannerTest.cpp b/test/QueryPlannerTest.cpp index c7d806319e..8d68a4b1e5 100644 --- a/test/QueryPlannerTest.cpp +++ b/test/QueryPlannerTest.cpp @@ -2917,7 +2917,7 @@ TEST(QueryPlanner, GroupByRedundantParensAndVariables) { // ____________________________________________________________________________ TEST(QueryPlanner, Exists) { auto xyz = h::IndexScanFromStrings("?x", "?y", "?z"); - auto a = h::IndexScanFromStrings("?x", "?y", "?z"); + auto ab = h::IndexScanFromStrings("?x", "?y", "?z"); h::expect( "SELECT * { ?x ?y ?z FILTER EXISTS {?a ?b ?c}}", h::Filter("EXISTS {?a ?b ?c}", From 91e5802c33d798e1b9cb49326079a9ddba1b902a Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 8 Jan 2025 11:03:12 +0100 Subject: [PATCH 15/30] blub. 
Signed-off-by: Johannes Kalmbach --- test/QueryPlannerTest.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/test/QueryPlannerTest.cpp b/test/QueryPlannerTest.cpp index 8d68a4b1e5..6f8f40d47e 100644 --- a/test/QueryPlannerTest.cpp +++ b/test/QueryPlannerTest.cpp @@ -2917,10 +2917,7 @@ TEST(QueryPlanner, GroupByRedundantParensAndVariables) { // ____________________________________________________________________________ TEST(QueryPlanner, Exists) { auto xyz = h::IndexScanFromStrings("?x", "?y", "?z"); - auto ab = h::IndexScanFromStrings("?x", "?y", "?z"); - h::expect( - "SELECT * { ?x ?y ?z FILTER EXISTS {?a ?b ?c}}", - h::Filter("EXISTS {?a ?b ?c}", - h::ExistsJoin(h::IndexScanFromStrings("?x", "?y", "?z"), - h::IndexScanFromStrings("?a", "?b", "?c")))); + auto abc = h::IndexScanFromStrings("?a", "?b", "?c"); + h::expect("SELECT * { ?x ?y ?z FILTER EXISTS {?a ?b ?c}}", + h::Filter("EXISTS {?a ?b ?c}", h::ExistsJoin(xyz, abc))); } From c3a9a7df4b46ac5e0e720c2ca4a40e9d1f5a0b0e Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 8 Jan 2025 12:31:54 +0100 Subject: [PATCH 16/30] Added some more tests. 
Signed-off-by: Johannes Kalmbach --- .../sparqlParser/SparqlQleverVisitor.cpp | 14 +++++---- src/parser/sparqlParser/SparqlQleverVisitor.h | 1 + test/QueryPlannerTest.cpp | 29 +++++++++++++++++++ 3 files changed, 39 insertions(+), 5 deletions(-) diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.cpp b/src/parser/sparqlParser/SparqlQleverVisitor.cpp index 903544c96a..32b050db9b 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.cpp +++ b/src/parser/sparqlParser/SparqlQleverVisitor.cpp @@ -268,6 +268,7 @@ ParsedQuery Visitor::visit(Parser::ConstructQueryContext* ctx) { ParsedQuery query; query.datasetClauses_ = parsedQuery::DatasetClauses::fromClauses( visitVector(ctx->datasetClause())); + activeDatasetClauses_ = query.datasetClauses_; if (ctx->constructTemplate()) { query._clause = visit(ctx->constructTemplate()) .value_or(parsedQuery::ConstructClause{}); @@ -303,9 +304,9 @@ ParsedQuery Visitor::visit(Parser::DescribeQueryContext* ctx) { } // Parse the FROM and FROM NAMED clauses. - auto datasetClauses = parsedQuery::DatasetClauses::fromClauses( + activeDatasetClauses_ = parsedQuery::DatasetClauses::fromClauses( visitVector(ctx->datasetClause())); - describeClause.datasetClauses_ = datasetClauses; + describeClause.datasetClauses_ = activeDatasetClauses_; // Parse the WHERE clause and construct a SELECT query from it. For `DESCRIBE // *`, add each visible variable as a resource to describe. 
@@ -336,7 +337,7 @@ ParsedQuery Visitor::visit(Parser::DescribeQueryContext* ctx) { parsedQuery_.addSolutionModifiers(visit(ctx->solutionModifier())); parsedQuery_._rootGraphPattern._graphPatterns.emplace_back( std::move(describeClause)); - parsedQuery_.datasetClauses_ = datasetClauses; + parsedQuery_.datasetClauses_ = activeDatasetClauses_; auto constructClause = ParsedQuery::ConstructClause{}; using G = GraphTerm; using V = Variable; @@ -352,6 +353,7 @@ ParsedQuery Visitor::visit(Parser::AskQueryContext* ctx) { parsedQuery_._clause = ParsedQuery::AskClause{}; parsedQuery_.datasetClauses_ = parsedQuery::DatasetClauses::fromClauses( visitVector(ctx->datasetClause())); + activeDatasetClauses_ = parsedQuery_.datasetClauses_; visitWhereClause(ctx->whereClause(), parsedQuery_); // NOTE: It can make sense to have solution modifiers with an ASK query, for // example, a GROUP BY with a HAVING. @@ -595,6 +597,8 @@ ParsedQuery Visitor::visit(Parser::ModifyContext* ctx) { }; AD_CORRECTNESS_CHECK(visibleVariables_.empty()); auto graphPattern = visit(ctx->groupGraphPattern()); + parsedQuery_.datasetClauses_ = + parsedQuery::DatasetClauses::fromClauses(visitVector(ctx->usingClause())); parsedQuery_._rootGraphPattern = std::move(graphPattern); parsedQuery_.registerVariablesVisibleInQueryBody(visibleVariables_); visibleVariables_.clear(); @@ -605,8 +609,6 @@ ParsedQuery Visitor::visit(Parser::ModifyContext* ctx) { checkTriples(op.toDelete_); visitIf(&op.with_, ctx->iri()); parsedQuery_._clause = parsedQuery::UpdateClause{op}; - parsedQuery_.datasetClauses_ = - parsedQuery::DatasetClauses::fromClauses(visitVector(ctx->usingClause())); return parsedQuery_; } @@ -1174,6 +1176,7 @@ ParsedQuery Visitor::visit(Parser::SelectQueryContext* ctx) { parsedQuery_._clause = visit(ctx->selectClause()); parsedQuery_.datasetClauses_ = parsedQuery::DatasetClauses::fromClauses( visitVector(ctx->datasetClause())); + activeDatasetClauses_ = parsedQuery_.datasetClauses_; 
visitWhereClause(ctx->whereClause(), parsedQuery_); parsedQuery_.addSolutionModifiers(visit(ctx->solutionModifier())); return parsedQuery_; @@ -2434,6 +2437,7 @@ ExpressionPtr Visitor::visitExists(Parser::GroupGraphPatternContext* pattern, ParsedQuery query = std::exchange(parsedQuery_, std::move(queryBackup)); query.selectClause().setAsterisk(); query._rootGraphPattern = std::move(group); + query.datasetClauses_ = activeDatasetClauses_; visibleVariables_ = std::move(visibleVariablesSoFar); auto exists = std::make_unique(std::move(query)); diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.h b/src/parser/sparqlParser/SparqlQleverVisitor.h index 3e7b63c3ad..3d7aa0dd86 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.h +++ b/src/parser/sparqlParser/SparqlQleverVisitor.h @@ -78,6 +78,7 @@ class SparqlQleverVisitor { // query. This may contain duplicates. A variable is added via // `addVisibleVariable`. std::vector visibleVariables_{}; + ParsedQuery::DatasetClauses activeDatasetClauses_; PrefixMap prefixMap_{}; // We need to remember the prologue (prefix declarations) when we encounter it // because we need it when we encounter a SERVICE query. When there is no diff --git a/test/QueryPlannerTest.cpp b/test/QueryPlannerTest.cpp index 6f8f40d47e..518833bb02 100644 --- a/test/QueryPlannerTest.cpp +++ b/test/QueryPlannerTest.cpp @@ -2918,6 +2918,35 @@ TEST(QueryPlanner, GroupByRedundantParensAndVariables) { TEST(QueryPlanner, Exists) { auto xyz = h::IndexScanFromStrings("?x", "?y", "?z"); auto abc = h::IndexScanFromStrings("?a", "?b", "?c"); + using V = Variable; + // Simple tests for EXISTS with FILTER, BIND, and GROUP BY. 
h::expect("SELECT * { ?x ?y ?z FILTER EXISTS {?a ?b ?c}}", h::Filter("EXISTS {?a ?b ?c}", h::ExistsJoin(xyz, abc))); + h::expect("SELECT * { ?x ?y ?z BIND(EXISTS {?a ?b ?c} as ?bound)}", + h::Bind(h::ExistsJoin(xyz, abc), "EXISTS {?a ?b ?c}", + Variable("?bound"))); + h::expect( + "SELECT ?x (SAMPLE(EXISTS{?a ?b ?c}) as ?s) { ?x ?y ?z } GROUP BY ?x", + h::GroupBy({V{"?x"}}, {"(SAMPLE(EXISTS{?a ?b ?c}) as ?s)"}, + h::ExistsJoin(xyz, abc))); + + // Test the interaction of FROM [NAMED] with EXISTS. + + using H = ad_utility::HashSet; + auto xyzg = h::IndexScanFromStrings("?x", "?y", "?z", {}, H{""}); + auto abcg = h::IndexScanFromStrings("?a", "?b", "?c", {}, H{""}); + + auto existsJoin = h::ExistsJoin(xyzg, abcg); + auto filter = h::Filter("EXISTS {?a ?b ?c}", existsJoin); + + // Test all different kinds of queries. + // TODO There is a more elegant way to reduce the code duplication + // (use a lambda that only changes the beginning of the query). + h::expect("SELECT * FROM { ?x ?y ?z FILTER EXISTS {?a ?b ?c}}", filter); + h::expect("ASK FROM { ?x ?y ?z FILTER EXISTS {?a ?b ?c}}", filter); + h::expect( + "CONSTRUCT { } FROM { ?x ?y ?z FILTER EXISTS {?a ?b ?c}}", + filter); + h::expect("Describe ?x FROM { ?x ?y ?z FILTER EXISTS {?a ?b ?c}}", + h::Describe(::testing::_, filter)); } From 0adbfa609e5a22c799e7ec6c737a58637697c198 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 8 Jan 2025 17:25:35 +0100 Subject: [PATCH 17/30] Add some tests at least for the parser and query planner. 
Signed-off-by: Johannes Kalmbach --- .../sparqlParser/SparqlQleverVisitor.cpp | 1 - test/QueryPlannerTest.cpp | 18 +++++++++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.cpp b/src/parser/sparqlParser/SparqlQleverVisitor.cpp index 32b050db9b..41e297120c 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.cpp +++ b/src/parser/sparqlParser/SparqlQleverVisitor.cpp @@ -7,7 +7,6 @@ #include "parser/sparqlParser/SparqlQleverVisitor.h" -#include #include #include diff --git a/test/QueryPlannerTest.cpp b/test/QueryPlannerTest.cpp index 518833bb02..89601732e8 100644 --- a/test/QueryPlannerTest.cpp +++ b/test/QueryPlannerTest.cpp @@ -2930,17 +2930,13 @@ TEST(QueryPlanner, Exists) { h::GroupBy({V{"?x"}}, {"(SAMPLE(EXISTS{?a ?b ?c}) as ?s)"}, h::ExistsJoin(xyz, abc))); - // Test the interaction of FROM [NAMED] with EXISTS. - + // Test the interaction of FROM with EXISTS. using H = ad_utility::HashSet; auto xyzg = h::IndexScanFromStrings("?x", "?y", "?z", {}, H{""}); auto abcg = h::IndexScanFromStrings("?a", "?b", "?c", {}, H{""}); auto existsJoin = h::ExistsJoin(xyzg, abcg); auto filter = h::Filter("EXISTS {?a ?b ?c}", existsJoin); - - // Test all different kinds of queries. - // TODO There is a more elegant way to reduce the code duplication // (use a lambda that only changes the beginning of the query). 
h::expect("SELECT * FROM { ?x ?y ?z FILTER EXISTS {?a ?b ?c}}", filter); h::expect("ASK FROM { ?x ?y ?z FILTER EXISTS {?a ?b ?c}}", filter); @@ -2949,4 +2945,16 @@ TEST(QueryPlanner, Exists) { filter); h::expect("Describe ?x FROM { ?x ?y ?z FILTER EXISTS {?a ?b ?c}}", h::Describe(::testing::_, filter)); + + // Test the interaction of FROM NAMES with EXISTS + auto varG = std::vector{Variable{"?g"}}; + std::vector graphCol{ADDITIONAL_COLUMN_GRAPH_ID}; + auto uvcg = + h::IndexScanFromStrings("?u", "?v", "?c", {}, H{""}, varG, graphCol); + existsJoin = h::ExistsJoin(xyzg, h::UnorderedJoins(abcg, uvcg)); + filter = h::Filter("EXISTS {?a ?b ?c. GRAPH ?g { ?u ?v ?c}}", existsJoin); + h::expect( + "SELECT * FROM FROM NAMED { ?x ?y ?z FILTER EXISTS {?a ?b ?c. " + "GRAPH ?g { ?u ?v ?c}}}", + filter); } From babd2940a203258cd95fb9dc332c93e24476ebc1 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 9 Jan 2025 09:38:21 +0100 Subject: [PATCH 18/30] Some more tests. As a next step, I want to write some comments. 
Signed-off-by: Johannes Kalmbach --- test/engine/CMakeLists.txt | 1 + test/engine/ExistsJoinTest.cpp | 94 ++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+) create mode 100644 test/engine/ExistsJoinTest.cpp diff --git a/test/engine/CMakeLists.txt b/test/engine/CMakeLists.txt index fef9ffed39..41b2b463ad 100644 --- a/test/engine/CMakeLists.txt +++ b/test/engine/CMakeLists.txt @@ -12,3 +12,4 @@ addLinkAndDiscoverTest(BindTest engine) addLinkAndRunAsSingleTest(SpatialJoinAlgorithmsTest engine) addLinkAndDiscoverTestSerial(QueryExecutionTreeTest engine) addLinkAndDiscoverTestSerial(DescribeTest engine) +addLinkAndDiscoverTestSerial(ExistsJoinTest engine) diff --git a/test/engine/ExistsJoinTest.cpp b/test/engine/ExistsJoinTest.cpp new file mode 100644 index 0000000000..af72e5fbb6 --- /dev/null +++ b/test/engine/ExistsJoinTest.cpp @@ -0,0 +1,94 @@ +// Copyright 2024, University of Freiburg +// Chair of Algorithms and Data Structures +// Author: Johannes Kalmbach + +#include + +#include "../util/GTestHelpers.h" +#include "../util/IdTableHelpers.h" +#include "../util/IndexTestHelpers.h" +#include "engine/ExistsJoin.h" +#include "engine/IndexScan.h" +#include "engine/NeutralElementOperation.h" +#include "engine/QueryExecutionTree.h" + +using namespace ad_utility::testing; + +namespace { +void testExists(const VectorTable& leftInput, const VectorTable& rightInput, + std::vector expectedAsBool, size_t numJoinColumns) { + AD_CORRECTNESS_CHECK(leftInput.size() == expectedAsBool.size()); + auto left = makeIdTableFromVector(leftInput); + auto right = makeIdTableFromVector(rightInput); + AD_CORRECTNESS_CHECK(left.numColumns() >= numJoinColumns); + AD_CORRECTNESS_CHECK(right.numColumns() >= numJoinColumns); + + auto qec = getQec(); + using V = Variable; + using Vars = std::vector>; + + // TODO Support more than one join column. + // TODO also randomly permute the join columns. 
+ + auto joinCol = [](size_t i) { return V{absl::StrCat("?joinCol_", i)}; }; + auto nonJoinCol = [i = 0]() mutable { + return V{absl::StrCat("?nonJoinCol_", i++)}; + }; + + auto makeChild = [&](const IdTable& input) { + Vars vars; + for (size_t i : ad_utility::integerRange(numJoinColumns)) { + vars.push_back(joinCol(i)); + }; + for ([[maybe_unused]] size_t i : + ql::views::iota(numJoinColumns, input.numColumns())) { + vars.push_back(nonJoinCol()); + } + return ad_utility::makeExecutionTree(qec, input.clone(), + vars); + }; + + auto exists = + ExistsJoin{qec, makeChild(left), makeChild(right), V{"?exists"}}; + + EXPECT_EQ(exists.getResultWidth(), left.numColumns() + 1); + + auto res = exists.computeResultOnlyForTesting(); + const auto& table = res.idTable(); + ASSERT_EQ(table.numRows(), left.size()); + IdTable expected = left.clone(); + expected.addEmptyColumn(); + ql::ranges::transform(expectedAsBool, expected.getColumn(2).begin(), + &Id::makeFromBool); + EXPECT_THAT(table, matchesIdTable(expected)); +} +} // namespace + +TEST(Exists, computeResult) { + // Single join column. + testExists({{3, 6}, {4, 7}, {5, 8}}, {{3, 15}, {3, 19}, {5, 37}}, + {true, false, true}, 1); + + // UNDEF matches everything + auto U = Id::makeUndefined(); + testExists({{U, 13}, {3, 6}, {4, 7}, {5, 8}}, {{3, 15}, {3, 19}, {5, 37}}, + {true, true, false, true}, 1); + testExists({{3, 6}, {4, 7}, {5, 8}}, {{U, 15}}, {true, true, true}, 1); + + // Two join columns + testExists({{3, 6}, {4, 7}, {5, 8}}, {{3, 15}, {3, 19}, {5, 37}}, + {false, false, false}, 2); + testExists({{3, 6}, {4, 7}, {5, 8}}, + {{3, 6, 11}, {3, 19, 7}, {4, 8, 0}, {5, 8, 37}}, + {true, false, true}, 2); + + // Two join columns with UNDEF + testExists({{2, 2}, {3, U}, {4, 8}, {5, 8}}, + {{U, 8}, {3, 15}, {3, 19}, {5, U}, {5, 37}}, + {false, true, true, true}, 2); + testExists({{U, U}}, {{13, 17}}, {true}, 2); + testExists({{13, 17}, {25, 38}}, {{U, U}}, {true, true}, 2); + + // TODO Add tests with unsorted inputs. 
+ // TODO Test empty inputs on one side. +} From 6766af39ca5e073d1669807dfa3e832a29fe964c Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 9 Jan 2025 10:35:08 +0100 Subject: [PATCH 19/30] Added some comments. Signed-off-by: Johannes Kalmbach --- src/engine/Bind.cpp | 2 +- src/engine/ExistsJoin.cpp | 31 +++++++++++++++---- src/engine/ExistsJoin.h | 22 ++++++++++--- src/engine/Filter.cpp | 4 +-- src/engine/GroupBy.cpp | 2 +- .../sparqlExpressions/ExistsExpression.cpp | 5 --- .../sparqlExpressions/ExistsExpression.h | 29 +++++++++++------ 7 files changed, 66 insertions(+), 29 deletions(-) delete mode 100644 src/engine/sparqlExpressions/ExistsExpression.cpp diff --git a/src/engine/Bind.cpp b/src/engine/Bind.cpp index 276f04e9fc..ed98495d72 100644 --- a/src/engine/Bind.cpp +++ b/src/engine/Bind.cpp @@ -16,7 +16,7 @@ Bind::Bind(QueryExecutionContext* qec, std::shared_ptr subtree, parsedQuery::Bind b) : Operation(qec), _subtree(std::move(subtree)), _bind(std::move(b)) { - _subtree = ExistsJoin::addExistsScansToSubtree( + _subtree = ExistsJoin::addExistsJoinsToSubtree( _bind._expression, std::move(_subtree), getExecutionContext(), cancellationHandle_); } diff --git a/src/engine/ExistsJoin.cpp b/src/engine/ExistsJoin.cpp index d8d3f564d1..7ca230c799 100644 --- a/src/engine/ExistsJoin.cpp +++ b/src/engine/ExistsJoin.cpp @@ -1,4 +1,4 @@ -// Copyright 2023, University of Freiburg, +// Copyright 2025, University of Freiburg, // Chair of Algorithms and Data Structures. 
// Author: Johannes Kalmbach @@ -25,12 +25,12 @@ ExistsJoin::ExistsJoin(QueryExecutionContext* qec, // _____________________________________________________________________________ string ExistsJoin::getCacheKeyImpl() const { - return absl::StrCat("EXISTS SCAN left: ", left_->getCacheKey(), + return absl::StrCat("EXISTS JOIN left: ", left_->getCacheKey(), " right: ", right_->getCacheKey()); } // _____________________________________________________________________________ -string ExistsJoin::getDescriptor() const { return "EXISTS scan"; } +string ExistsJoin::getDescriptor() const { return "Exists Join"; } // ____________________________________________________________________________ VariableToColumnMap ExistsJoin::computeVariableToColumnMap() const { @@ -70,6 +70,7 @@ uint64_t ExistsJoin::getSizeEstimateBeforeLimit() { // ____________________________________________________________________________ size_t ExistsJoin::getCostEstimate() { + // The implementation is a linear zipper join. return left_->getCostEstimate() + right_->getCostEstimate() + left_->getSizeEstimate() + right_->getSizeEstimate(); } @@ -81,9 +82,16 @@ ProtoResult ExistsJoin::computeResult([[maybe_unused]] bool requestLaziness) { const auto& left = leftRes->idTable(); const auto& right = rightRes->idTable(); + // We reuse the generic `zipperJoinWithUndef` utility in the following way: + // It has (among others) two callbacks: One for each matching pair of rows + // from left and right, and one for rows in the left input that have no + // matching counterpart in the right input. The first callback can be a noop, + // and the second callback gives us exactly `NOT EXISTS`. + + // Only extract the join columns from both inputs to make the following code + // easier. 
ad_utility::JoinColumnMapping joinColumnData{joinColumns_, left.numColumns(), right.numColumns()}; - IdTableView<0> joinColumnsLeft = left.asColumnSubsetView(joinColumnData.jcsLeft()); IdTableView<0> joinColumnsRight = @@ -105,15 +113,20 @@ ProtoResult ExistsJoin::computeResult([[maybe_unused]] bool requestLaziness) { (stdr::any_of(joinColumnsLeft.getColumn(col), &Id::isUndefined)); }); - auto noopRowAdder = [](auto&&...) {}; + // Nothing to do for the actual matches. + auto noopRowAdder = ad_utility::noop; + // Store the indices of rows for which `exists` is `false`. std::vector> notExistsIndices{ allocator()}; + // The callback is called with iterators, so we convert them back to indices. auto actionForNotExisting = [&notExistsIndices, begin = joinColumnsLeft.begin()](const auto& itLeft) { notExistsIndices.push_back(itLeft - begin); }; + // Run the actual zipper join, with the possible optimization if we know, that + // there can be no UNDEF values. auto checkCancellationLambda = [this] { checkCancellation(); }; auto runZipperJoin = [&](auto findUndef) { [[maybe_unused]] auto numOutOfOrder = ad_utility::zipperJoinWithUndef( @@ -135,16 +148,22 @@ ProtoResult ExistsJoin::computeResult([[maybe_unused]] bool requestLaziness) { for (size_t notExistsIndex : notExistsIndices) { existsCol[notExistsIndex] = Id::makeFromBool(false); } + + // The result is a copy of the left input + and additional columns with only + // boolean values, so the local vocab of the left input is sufficient.
return {std::move(result), resultSortedOn(), leftRes->getCopyOfLocalVocab()}; } // _____________________________________________________________________________ -std::shared_ptr ExistsJoin::addExistsScansToSubtree( +std::shared_ptr ExistsJoin::addExistsJoinsToSubtree( const sparqlExpression::SparqlExpressionPimpl& expression, std::shared_ptr subtree, QueryExecutionContext* qec, const ad_utility::SharedCancellationHandle& cancellationHandle) { + // First extract all the `EXISTS` functions from the expression. std::vector existsExpressions; expression.getPimpl()->getExistsExpressions(existsExpressions); + + // For each of the EXISTS functions add one `ExistsJoin` for (auto* expr : existsExpressions) { const auto& exists = dynamic_cast(*expr); diff --git a/src/engine/ExistsJoin.h b/src/engine/ExistsJoin.h index 9b9c7483ce..4ff44fe94c 100644 --- a/src/engine/ExistsJoin.h +++ b/src/engine/ExistsJoin.h @@ -7,28 +7,42 @@ #include "engine/Operation.h" #include "engine/QueryExecutionTree.h" +// The implementation of the SPARQL `EXISTS` function. It takes two subtrees, +// and returns the left subtree with an additional boolean column that is `true` +// iff at least one matching row is contained in the right subtree. class ExistsJoin : public Operation { private: + // The left and right child. std::shared_ptr left_; std::shared_ptr right_; std::vector> joinColumns_; + // The variable of the added result column. Variable existsVariable_; - vector _multiplicities; - std::vector> _matchedColumns; - public: + // Constructor. The `existsVariable` (the variable for the added boolean + // column) must not yet be bound by `left`. ExistsJoin(QueryExecutionContext* qec, std::shared_ptr left, std::shared_ptr right, Variable existsVariable); - static std::shared_ptr addExistsScansToSubtree( + // For a given subtree and a given expression, extract all the + // `ExistsExpressions` from the expression and add one `ExistsJoin` per + // `ExistsExpression` to the subtree. 
The left side of the `ExistsJoin` is the + // input subtree, the right hand side of the `ExistsJoin` as well as the + // variable to which the result is bound are extracted from the + // `ExistsExpression`. The returned subtree can then be used to evaluate the + // `expression`. Note: `ExistsExpression` is a simple dummy that only reads + // the values of the column that is added by the `ExistsJoin`. + static std::shared_ptr addExistsJoinsToSubtree( const sparqlExpression::SparqlExpressionPimpl& expression, std::shared_ptr subtree, QueryExecutionContext* qec, const ad_utility::SharedCancellationHandle& cancellationHandle); + // All following functions are inherited from `Operation`, see there for + // comments. protected: string getCacheKeyImpl() const override; diff --git a/src/engine/Filter.cpp b/src/engine/Filter.cpp index 9da7c12724..08393d9fb5 100644 --- a/src/engine/Filter.cpp +++ b/src/engine/Filter.cpp @@ -12,11 +12,9 @@ #include "engine/CallFixedSize.h" #include "engine/ExistsJoin.h" #include "engine/QueryExecutionTree.h" -#include "engine/QueryPlanner.h" #include "engine/sparqlExpressions/SparqlExpression.h" #include "engine/sparqlExpressions/SparqlExpressionGenerators.h" #include "engine/sparqlExpressions/SparqlExpressionValueGetters.h" -#include "sparqlExpressions/ExistsExpression.h" using std::endl; using std::string; @@ -31,7 +29,7 @@ Filter::Filter(QueryExecutionContext* qec, : Operation(qec), _subtree(std::move(subtree)), _expression{std::move(expression)} { - _subtree = ExistsJoin::addExistsScansToSubtree( + _subtree = ExistsJoin::addExistsJoinsToSubtree( _expression, std::move(_subtree), getExecutionContext(), cancellationHandle_); setPrefilterExpressionForChildren(); diff --git a/src/engine/GroupBy.cpp b/src/engine/GroupBy.cpp index 3e8af1cb29..65c7b85d11 100644 --- a/src/engine/GroupBy.cpp +++ b/src/engine/GroupBy.cpp @@ -55,7 +55,7 @@ GroupBy::GroupBy(QueryExecutionContext* qec, vector groupByVariables, auto sortColumns = 
computeSortColumns(subtree.get()); for (const auto& alias : _aliases) { - subtree = ExistsJoin::addExistsScansToSubtree( + subtree = ExistsJoin::addExistsJoinsToSubtree( alias._expression, std::move(subtree), getExecutionContext(), cancellationHandle_); } diff --git a/src/engine/sparqlExpressions/ExistsExpression.cpp b/src/engine/sparqlExpressions/ExistsExpression.cpp deleted file mode 100644 index 6737d3ed7b..0000000000 --- a/src/engine/sparqlExpressions/ExistsExpression.cpp +++ /dev/null @@ -1,5 +0,0 @@ -// -// Created by kalmbacj on 1/7/25. -// - -#include "ExistsExpression.h" diff --git a/src/engine/sparqlExpressions/ExistsExpression.h b/src/engine/sparqlExpressions/ExistsExpression.h index 343c195e82..1313b342b0 100644 --- a/src/engine/sparqlExpressions/ExistsExpression.h +++ b/src/engine/sparqlExpressions/ExistsExpression.h @@ -1,6 +1,6 @@ -// -// Created by kalmbacj on 1/7/25. -// +// Copyright 2025, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Johannes Kalmbach #pragma once @@ -9,19 +9,28 @@ #include "engine/sparqlExpressions/SparqlExpression.h" #include "parser/ParsedQuery.h" +// The expression that corresponds to the `EXISTS` function. +// The implementation only reads the value of a precomputed variable. The actual +// computation of EXISTS is done by the `ExistsJoin` class. namespace sparqlExpression { class ExistsExpression : public SparqlExpression { private: + // The argument (a group graph pattern) of the EXISTS. This is set during the + // parsing and is required and read by the `ExistsJoin` class. ParsedQuery argument_; + + // Each `ExistsExpression` has a unique index and a unique variable name that + // is used to communicate between the `ExistsExpression` and the `ExistsJoin`. 
static inline std::atomic indexCounter_ = 0; size_t index_ = ++indexCounter_; Variable variable_{absl::StrCat("?ql_internal_exists_", index_)}; public: + explicit ExistsExpression(ParsedQuery query) : argument_{std::move(query)} {} const auto& argument() const { return argument_; } const auto& variable() const { return variable_; } - ExistsExpression(ParsedQuery query) : argument_{std::move(query)} {} + // Evaluate only reads the variable which is written by the `ExistsJoin`. ExpressionResult evaluate(EvaluationContext* context) const override { AD_CONTRACT_CHECK(context->_variableToColumnMap.contains(variable_)); return variable_; @@ -31,17 +40,19 @@ class ExistsExpression : public SparqlExpression { [[nodiscard]] string getCacheKey( const VariableToColumnMap& varColMap) const override { if (varColMap.contains(variable_)) { - return absl::StrCat("EXISTS WITH COL ", + return absl::StrCat("ExistsExpression col# ", varColMap.at(variable_).columnIndex_); } else { - // This means that the necessary `ExistsScan` hasn't been set up yet. - // It is not possible to cache such incomplete operations, so we return - // a random cache key. + // This means that the necessary `ExistsJoin` hasn't been set up yet. This + // can for example happen if the parsing (which sets up the + // `ExistsExpression`) is completed, but the query planning (which sets up + // the `ExistsJoin`) is still in progress. It is not possible to cache + // such incomplete operations, so we return a random cache key. return std::to_string(ad_utility::FastRandomIntGenerator{}()); } } - // ____________________________________________________________________________ + // This is in fact an `ExistsExpression`. bool isExistsExpression() const override { return true; } private: From 3a574eab1a8ad78482ff2f781bb6ecad108abc7d Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 9 Jan 2025 10:55:08 +0100 Subject: [PATCH 20/30] This is commented and very clean.
The only thing that is missing, is some corner case tests, and maybe cleaning up the parsing of the active dataset clauses. Signed-off-by: Johannes Kalmbach --- src/engine/GroupBy.cpp | 21 +++++++++++++----- .../sparqlParser/SparqlQleverVisitor.cpp | 22 ++++++++++++------- src/parser/sparqlParser/SparqlQleverVisitor.h | 3 +++ 3 files changed, 32 insertions(+), 14 deletions(-) diff --git a/src/engine/GroupBy.cpp b/src/engine/GroupBy.cpp index 65c7b85d11..46ff7a410a 100644 --- a/src/engine/GroupBy.cpp +++ b/src/engine/GroupBy.cpp @@ -373,6 +373,8 @@ ProtoResult GroupBy::computeResult(bool requestLaziness) { } if (useHashMapOptimization) { + // Helper lambda that calls `computeGroupByForHashMapOptimization` for the + // given `subresults`. auto computeWithHashMap = [this, &metadataForUnsequentialData, &groupByCols](auto&& subresults) { auto doCompute = [&] { @@ -383,9 +385,10 @@ ProtoResult GroupBy::computeResult(bool requestLaziness) { return ad_utility::callFixedSize(groupByCols.size(), doCompute); }; + // Now call `computeWithHashMap` and return the result. It expects a range + // of results, so if the result is fully materialized, we create an array + // with a single element. if (subresult->isFullyMaterialized()) { - // `computeWithHashMap` takes a range, so we artificially create one with - // a single input. return computeWithHashMap( std::array{std::pair{std::cref(subresult->idTable()), std::cref(subresult->localVocab())}}); @@ -1513,29 +1516,35 @@ Result GroupBy::computeGroupByForHashMapOptimization( NUM_GROUP_COLUMNS == 0); LocalVocab localVocab; - // Initialize aggregation data + // Initialize the data for the aggregates of the GROUP BY operation. HashMapAggregationData aggregationData( getExecutionContext()->getAllocator(), aggregateAliases, columnIndices.size()); + // Process the input blocks (pairs of `IdTable` and `LocalVocab`) one after + // the other. 
ad_utility::Timer lookupTimer{ad_utility::Timer::Stopped}; ad_utility::Timer aggregationTimer{ad_utility::Timer::Stopped}; for (const auto& [inputTableRef, inputLocalVocabRef] : subresults) { - // Also support `std::reference_wrapper` as the input. const IdTable& inputTable = inputTableRef; const LocalVocab& inputLocalVocab = inputLocalVocabRef; + // Merge the local vocab of each input block. + // + // NOTE: If the input blocks have very similar or even identical non-empty + // local vocabs, no deduplication is performed. localVocab.mergeWith(std::span{&inputLocalVocab, 1}); - // Initialize evaluation context + // Set up the `EvaluationContext` for this input block. sparqlExpression::EvaluationContext evaluationContext( *getExecutionContext(), _subtree->getVariableColumns(), inputTable, getExecutionContext()->getAllocator(), localVocab, cancellationHandle_, deadline_); - evaluationContext._groupedVariables = ad_utility::HashSet{ _groupByVariables.begin(), _groupByVariables.end()}; evaluationContext._isPartOfGroupBy = true; + // Iterate over the rows of this input block. Process (up to) + // `GROUP_BY_HASH_MAP_BLOCK_SIZE` rows at a time.
for (size_t i = 0; i < inputTable.size(); i += GROUP_BY_HASH_MAP_BLOCK_SIZE) { checkCancellation(); diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.cpp b/src/parser/sparqlParser/SparqlQleverVisitor.cpp index 41e297120c..6c1bf6d7eb 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.cpp +++ b/src/parser/sparqlParser/SparqlQleverVisitor.cpp @@ -25,7 +25,6 @@ #include "engine/sparqlExpressions/SampleExpression.h" #include "engine/sparqlExpressions/StdevExpression.h" #include "engine/sparqlExpressions/UuidExpressions.h" -#include "generated/SparqlAutomaticParser.h" #include "global/Constants.h" #include "global/RuntimeParameters.h" #include "parser/GraphPatternOperation.h" @@ -1370,7 +1369,6 @@ SparqlFilter Visitor::visit(Parser::FilterRContext* ctx) { // expression contains unbound variables, because the variables of the FILTER // might be bound after the filter appears in the query (which is perfectly // legal). - auto pimpl = visitExpressionPimpl(ctx->constraint()); return SparqlFilter{visitExpressionPimpl(ctx->constraint())}; } @@ -2429,17 +2427,25 @@ SparqlExpression::Ptr Visitor::visit(Parser::StrReplaceExpressionContext* ctx) { // ____________________________________________________________________________________ ExpressionPtr Visitor::visitExists(Parser::GroupGraphPatternContext* pattern, bool negate) { + // The argument of the EXISTS is a completely independent GroupGraphPattern + // (except for the FROM [NAMED] clauses), so we have to back up and restore + // all global state when parsing EXISTS. auto queryBackup = std::exchange(parsedQuery_, ParsedQuery{}); auto visibleVariablesSoFar = std::move(visibleVariables_); visibleVariables_.clear(); + + // Parse the argument of EXISTS. 
auto group = visit(pattern); - ParsedQuery query = std::exchange(parsedQuery_, std::move(queryBackup)); - query.selectClause().setAsterisk(); - query._rootGraphPattern = std::move(group); - query.datasetClauses_ = activeDatasetClauses_; + ParsedQuery argumentOfExists = + std::exchange(parsedQuery_, std::move(queryBackup)); + argumentOfExists.selectClause().setAsterisk(); + argumentOfExists._rootGraphPattern = std::move(group); + + // EXISTS inherits the FROM [NAMED] clauses from the outer query. + argumentOfExists.datasetClauses_ = activeDatasetClauses_; visibleVariables_ = std::move(visibleVariablesSoFar); - auto exists = - std::make_unique(std::move(query)); + auto exists = std::make_unique( + std::move(argumentOfExists)); if (negate) { return sparqlExpression::makeUnaryNegateExpression(std::move(exists)); } else { diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.h b/src/parser/sparqlParser/SparqlQleverVisitor.h index 3d7aa0dd86..2fd0d6bc9b 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.h +++ b/src/parser/sparqlParser/SparqlQleverVisitor.h @@ -78,6 +78,9 @@ class SparqlQleverVisitor { // query. This may contain duplicates. A variable is added via // `addVisibleVariable`. std::vector visibleVariables_{}; + + // The FROM [NAMED] clauses of the query that is currently being parsed. + // Those are currently needed when parsing an EXISTS clause inside the query. ParsedQuery::DatasetClauses activeDatasetClauses_; PrefixMap prefixMap_{}; // We need to remember the prologue (prefix declarations) when we encounter it From 5809be2bdf69cc8b701496edb38874380ebf2b97 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Thu, 9 Jan 2025 16:04:45 +0100 Subject: [PATCH 21/30] better tests.
Signed-off-by: Johannes Kalmbach --- test/QueryPlannerTest.cpp | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/test/QueryPlannerTest.cpp b/test/QueryPlannerTest.cpp index 89601732e8..c2927b29e9 100644 --- a/test/QueryPlannerTest.cpp +++ b/test/QueryPlannerTest.cpp @@ -2918,6 +2918,8 @@ TEST(QueryPlanner, GroupByRedundantParensAndVariables) { TEST(QueryPlanner, Exists) { auto xyz = h::IndexScanFromStrings("?x", "?y", "?z"); auto abc = h::IndexScanFromStrings("?a", "?b", "?c"); + auto def = h::IndexScanFromStrings("?d", "?e", "?f"); + auto ghi = h::IndexScanFromStrings("?g", "?h", "?i"); using V = Variable; // Simple tests for EXISTS with FILTER, BIND, and GROUP BY. h::expect("SELECT * { ?x ?y ?z FILTER EXISTS {?a ?b ?c}}", @@ -2930,6 +2932,26 @@ TEST(QueryPlanner, Exists) { h::GroupBy({V{"?x"}}, {"(SAMPLE(EXISTS{?a ?b ?c}) as ?s)"}, h::ExistsJoin(xyz, abc))); + // Similar tests, but with multiple EXISTS clauses + auto existsAbcDef = h::ExistsJoin(h::ExistsJoin(xyz, abc), def); + h::expect( + "SELECT * { ?x ?y ?z FILTER (EXISTS {?a ?b ?c} || EXISTS {?d ?e ?f})}", + h::Filter("EXISTS {?a ?b ?c} || EXISTS {?d ?e ?f}", existsAbcDef)); + ; + h::expect( + "SELECT * { ?x ?y ?z BIND(EXISTS {?a ?b ?c} || EXISTS {?d ?e ?f} as " + "?bound)}", + h::Bind(existsAbcDef, "EXISTS {?a ?b ?c} || EXISTS {?d ?e ?f}", + Variable("?bound"))); + + h::expect( + "SELECT ?x (SAMPLE(EXISTS {?a ?b ?c} || EXISTS {?d ?e ?f}) as ?s) " + "(SAMPLE(EXISTS{?g ?h ?i}) as ?t) { ?x ?y ?z } GROUP BY ?x", + h::GroupBy({V{"?x"}}, + {"(SAMPLE(EXISTS {?a ?b ?c} || EXISTS {?d ?e ?f}) as ?s)", + "(SAMPLE(EXISTS{?g ?h ?i}) as ?t)"}, + h::ExistsJoin(existsAbcDef, ghi))); + // Test the interaction of FROM with EXISTS. 
using H = ad_utility::HashSet; auto xyzg = h::IndexScanFromStrings("?x", "?y", "?z", {}, H{""}); From 52943570743cb6c9db292f6331e1b8304ab32379 Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Fri, 10 Jan 2025 03:59:43 +0100 Subject: [PATCH 22/30] Made a pass over `ExistsJoin.h` and `ExistsJoin.cpp` --- src/engine/ExistsJoin.cpp | 69 ++++++++++++++++++++++++--------------- src/engine/ExistsJoin.h | 28 +++++++++------- 2 files changed, 58 insertions(+), 39 deletions(-) diff --git a/src/engine/ExistsJoin.cpp b/src/engine/ExistsJoin.cpp index 7ca230c799..4e0b3b5bde 100644 --- a/src/engine/ExistsJoin.cpp +++ b/src/engine/ExistsJoin.cpp @@ -1,6 +1,6 @@ -// Copyright 2025, University of Freiburg, -// Chair of Algorithms and Data Structures. -// Author: Johannes Kalmbach +// Copyright 2025, University of Freiburg +// Chair of Algorithms and Data Structures +// Author: Johannes Kalmbach #include "engine/ExistsJoin.h" @@ -19,6 +19,7 @@ ExistsJoin::ExistsJoin(QueryExecutionContext* qec, right_{std::move(right)}, joinColumns_{QueryExecutionTree::getJoinColumns(*left_, *right_)}, existsVariable_{std::move(existsVariable)} { + // Make sure that the left and right input are sorted on the join columns. std::tie(left_, right_) = QueryExecutionTree::createSortedTrees( std::move(left_), std::move(right_), joinColumns_); } @@ -37,7 +38,7 @@ VariableToColumnMap ExistsJoin::computeVariableToColumnMap() const { auto res = left_->getVariableColumns(); AD_CONTRACT_CHECK( !res.contains(existsVariable_), - "The target variable of an exists scan must be a new variable"); + "The target variable of an EXISTS join must be a new variable"); res[existsVariable_] = makeAlwaysDefinedColumn(getResultWidth() - 1); return res; } @@ -50,16 +51,20 @@ size_t ExistsJoin::getResultWidth() const { // ____________________________________________________________________________ vector ExistsJoin::resultSortedOn() const { + // We add one column to `left_`, but do not change the order of the rows. 
return left_->resultSortedOn(); } // ____________________________________________________________________________ float ExistsJoin::getMultiplicity(size_t col) { + // The multiplicities of all columns except the last one are the same as in + // `left_`. if (col < getResultWidth() - 1) { return left_->getMultiplicity(col); } - // The multiplicity of the boolean column can be a dummy value, as it should - // be never used for joins etc. + // For the added (Boolean) column we take a dummy value, assuming that it + // will not be used for subsequent joins or other operations that make use of + // the multiplicities. return 1; } @@ -82,13 +87,17 @@ ProtoResult ExistsJoin::computeResult([[maybe_unused]] bool requestLaziness) { const auto& left = leftRes->idTable(); const auto& right = rightRes->idTable(); - // We reuse the generic `zipperJoinWithUndef` utility in the following way: - // It has (among others) two callbacks: One for each matching pair of rows - // from left and right, and one for rows in the left input that have no - // matching counterpart in the right input. The first callback can be a noop, - // and the second callback gives us exactly `NOT EXISTS`. - - // Only extract the join columns from both inputs to make the following code + // We reuse the generic `zipperJoinWithUndef` function, which has two + // callbacks: one for each matching pair of rows from `left` and `right`, and + // one for rows in the left input that have no matching counterpart in the + // right input. The first callback can be a noop, and the second callback + // gives us exactly those rows, where the value in the to-be-added result + // column should be `false`. + // + // the inverse of the value needed for the added Boolean + // column. + + // Extract the join columns from both inputs to make the following code // easier.
ad_utility::JoinColumnMapping joinColumnData{joinColumns_, left.numColumns(), right.numColumns()}; @@ -96,11 +105,11 @@ ProtoResult ExistsJoin::computeResult([[maybe_unused]] bool requestLaziness) { left.asColumnSubsetView(joinColumnData.jcsLeft()); IdTableView<0> joinColumnsRight = right.asColumnSubsetView(joinColumnData.jcsRight()); - checkCancellation(); - // `isCheap` is true iff there are no UNDEF values in the join columns. In - // this case we can use a much cheaper algorithm. + // Compute `isCheap`, which is true iff there are no UNDEF values in the join + // columns (in which case we can use a simpler and cheaper join algorithm). + // // TODO There are many other cases where a cheaper implementation can // be chosen, but we leave those for another PR, this is the most common case. namespace stdr = ql::ranges; @@ -116,7 +125,8 @@ ProtoResult ExistsJoin::computeResult([[maybe_unused]] bool requestLaziness) { // Nothing to do for the actual matches. auto noopRowAdder = ad_utility::noop; - // Store the indices of rows for which `exists` is `false`. + // Store the indices of rows for which the value of the `EXISTS` (in the added + // Boolean column) should be `false`. std::vector> notExistsIndices{ allocator()}; // The callback is called with iterators, so we convert them back to indices. @@ -125,8 +135,9 @@ ProtoResult ExistsJoin::computeResult([[maybe_unused]] bool requestLaziness) { notExistsIndices.push_back(itLeft - begin); }; - // Run the actual zipper join, with the possible optimization if we know, that - // there can be no UNDEF values. + // Run `zipperJoinWithUndef` with the described callbacks and the mentioned + // optimization in case we know that there are no UNDEF values in the join + // columns. 
auto checkCancellationLambda = [this] { checkCancellation(); }; auto runZipperJoin = [&](auto findUndef) { [[maybe_unused]] auto numOutOfOrder = ad_utility::zipperJoinWithUndef( @@ -140,7 +151,8 @@ ProtoResult ExistsJoin::computeResult([[maybe_unused]] bool requestLaziness) { runZipperJoin(ad_utility::findSmallerUndefRanges); } - // Set up the result; + // Add the result column from the computed `notExistsIndices` (which tell us + // where the value should be `false`). IdTable result = left.clone(); result.addEmptyColumn(); decltype(auto) existsCol = result.getColumn(getResultWidth() - 1); @@ -149,8 +161,8 @@ ProtoResult ExistsJoin::computeResult([[maybe_unused]] bool requestLaziness) { existsCol[notExistsIndex] = Id::makeFromBool(false); } - // The result is a copy of the left input + and additional columns with only - // boolean values, so the local vocab of the left input is sufficient. + // The added column only contains Boolean values, and adds no new words to the + // local vocabulary, so we can simply copy the local vocab from `leftRes`. return {std::move(result), resultSortedOn(), leftRes->getCopyOfLocalVocab()}; } @@ -159,17 +171,20 @@ std::shared_ptr ExistsJoin::addExistsJoinsToSubtree( const sparqlExpression::SparqlExpressionPimpl& expression, std::shared_ptr subtree, QueryExecutionContext* qec, const ad_utility::SharedCancellationHandle& cancellationHandle) { - // First extract all the `EXISTS` functions from the expression. + // Extract all `EXISTS` functions from the given `expression`. std::vector existsExpressions; expression.getPimpl()->getExistsExpressions(existsExpressions); - // For each of the EXISTS functions add one `ExistsJoin` + // For each `EXISTS` function, add the corresponding `ExistsJoin`. for (auto* expr : existsExpressions) { const auto& exists = dynamic_cast(*expr); - // Currently some FILTERs are applied multiple times especially when there - // are OPTIONAL joins in the query. 
In these cases we have to make sure that - // the `ExistsScan` is added only once. + // Currently some FILTERs are applied multiple times (in particular, this + // happens when there are OPTIONAL joins in the query). In these cases we + // have to make sure that the `ExistsJoin` is added only once. + // + // TODO(question from Hannah's review): Why does the following implement + // what the preceding comment says? if (subtree->isVariableCovered(exists.variable())) { continue; } diff --git a/src/engine/ExistsJoin.h b/src/engine/ExistsJoin.h index 4ff44fe94c..b319c304c9 100644 --- a/src/engine/ExistsJoin.h +++ b/src/engine/ExistsJoin.h @@ -1,13 +1,14 @@ -// Copyright 2025, University of Freiburg, -// Chair of Algorithms and Data Structures. -// Author: Johannes Kalmbach +// Copyright 2025, University of Freiburg +// Chair of Algorithms and Data Structures +// Author: Johannes Kalmbach #pragma once #include "engine/Operation.h" #include "engine/QueryExecutionTree.h" -// The implementation of the SPARQL `EXISTS` function. It takes two subtrees, +// The implementation of an "EXISTS join", which we use to realize the semantics +// of the SPARQL `EXISTS` function. The join takes two subtrees as input, // and returns the left subtree with an additional boolean column that is `true` // iff at least one matching row is contained in the right subtree. class ExistsJoin : public Operation { @@ -17,25 +18,28 @@ class ExistsJoin : public Operation { std::shared_ptr right_; std::vector> joinColumns_; - // The variable of the added result column. + // The variable of the added (Boolean) result column. Variable existsVariable_; public: - // Constructor. The `existsVariable` (the variable for the added boolean - // column) must not yet be bound by `left`. + // Constructor. The `existsVariable` (the variable for the added column) must + // not yet be bound by `left`.
ExistsJoin(QueryExecutionContext* qec, std::shared_ptr left, std::shared_ptr right, Variable existsVariable); // For a given subtree and a given expression, extract all the - // `ExistsExpressions` from the expression and add one `ExistsJoin` per - // `ExistsExpression` to the subtree. The left side of the `ExistsJoin` is the - // input subtree, the right hand side of the `ExistsJoin` as well as the + // `ExistsExpression`s from the expression and add one `ExistsJoin` per + // `ExistsExpression` to the subtree. The left side of the `ExistsJoin` is + // the input subtree, the right hand side of the `ExistsJoin` as well as the // variable to which the result is bound are extracted from the // `ExistsExpression`. The returned subtree can then be used to evaluate the - // `expression`. Note: `ExistsExpression` is a simple dummy that only reads - // the values of the column that is added by the `ExistsJoin`. + // `expression`. + // + // NOTE: `ExistsExpression` is a dummy that only reads the values of the + // column that is added by the `ExistsJoin`. The main work is done by the + // latter and not by the former. 
static std::shared_ptr addExistsJoinsToSubtree( const sparqlExpression::SparqlExpressionPimpl& expression, std::shared_ptr subtree, QueryExecutionContext* qec, From 2bc5bdff745edc9320f4e2012fd9451947b9f2de Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Wed, 5 Feb 2025 03:26:17 +0100 Subject: [PATCH 23/30] Changes by Hannah improving documentation and comments --- src/engine/ExistsJoin.cpp | 29 +++++++------- src/engine/ExistsJoin.h | 30 ++++++++------- src/engine/GroupBy.cpp | 11 +++--- src/engine/MultiColumnJoin.cpp | 24 ++++++------ .../sparqlExpressions/ExistsExpression.h | 38 ++++++++++--------- .../sparqlExpressions/SparqlExpression.h | 2 +- .../sparqlParser/SparqlQleverVisitor.cpp | 30 ++++++++------- src/parser/sparqlParser/SparqlQleverVisitor.h | 10 +++-- test/ExceptionTest.cpp | 6 +-- test/QueryPlannerTest.cpp | 11 +++--- test/SparqlAntlrParserTest.cpp | 8 ++-- test/engine/ExistsJoinTest.cpp | 24 +++++++----- 12 files changed, 122 insertions(+), 101 deletions(-) diff --git a/src/engine/ExistsJoin.cpp b/src/engine/ExistsJoin.cpp index 4e0b3b5bde..a58a22a47c 100644 --- a/src/engine/ExistsJoin.cpp +++ b/src/engine/ExistsJoin.cpp @@ -93,9 +93,6 @@ ProtoResult ExistsJoin::computeResult([[maybe_unused]] bool requestLaziness) { // right input. The first callback can be a noop, and the second callback // gives us exactly those rows, where the value in the to-be-added result // column should be `false`. - // - // the inverse of the value needed for the added Boolean - // column. // Extract the join columns from both inputs to make the following code // easier. @@ -110,16 +107,17 @@ ProtoResult ExistsJoin::computeResult([[maybe_unused]] bool requestLaziness) { // Compute `isCheap`, which is true iff there are no UNDEF values in the join // columns (in which case we can use a simpler and cheaper join algorithm). // - // TODO There are many other cases where a cheaper implementation can - // be chosen, but we leave those for another PR, this is the most common case. 
- namespace stdr = ql::ranges; + // TODO This is the most common case. There are many other cases + // where the generic `zipperJoinWithUndef` can be optimized. We will leave those + // for a later PR. size_t numJoinColumns = joinColumnsLeft.numColumns(); AD_CORRECTNESS_CHECK(numJoinColumns == joinColumnsRight.numColumns()); - bool isCheap = stdr::none_of( + bool isCheap = ql::ranges::none_of( ad_utility::integerRange(numJoinColumns), [&](const auto& col) { - return (stdr::any_of(joinColumnsRight.getColumn(col), - &Id::isUndefined)) || - (stdr::any_of(joinColumnsLeft.getColumn(col), &Id::isUndefined)); + return (ql::ranges::any_of(joinColumnsRight.getColumn(col), + &Id::isUndefined)) || + (ql::ranges::any_of(joinColumnsLeft.getColumn(col), + &Id::isUndefined)); }); // Nothing to do for the actual matches. @@ -179,15 +177,14 @@ std::shared_ptr ExistsJoin::addExistsJoinsToSubtree( for (auto* expr : existsExpressions) { const auto& exists = dynamic_cast(*expr); - // Currently some FILTERs are applied multiple times (in particular, this - // happens when there are OPTIONAL joins in the query). In these cases we - // have to make sure that the `ExistsJoin` is added only once. - // - // TODO(question from Hannah's review): Why does the following implement - // what the preceding comment says? + // If we have already considered this `EXISTS` (which we can detect by its + // variable), skip it. This can happen because some `FILTER`s (which may + // contain `EXISTS` functions) are applied multiple times (for example, + // when there are OPTIONAL joins in the query). 
if (subtree->isVariableCovered(exists.variable())) { continue; } + QueryPlanner qp{qec, cancellationHandle}; auto pq = exists.argument(); auto tree = diff --git a/src/engine/ExistsJoin.h b/src/engine/ExistsJoin.h index b319c304c9..43dbbe074f 100644 --- a/src/engine/ExistsJoin.h +++ b/src/engine/ExistsJoin.h @@ -9,8 +9,8 @@ // The implementation of an "EXISTS join", which we use to realize the semantics // of the SPARQL `EXISTS` function. The join takes two subtrees as input, and -// and returns the left subtree with an additional boolean column that is `true` -// iff at least one matching row is contained in the right subtree. +// returns the left subtree with an additional boolean column that is `true` iff +// at least one matching row is contained in the right subtree. class ExistsJoin : public Operation { private: // The left and right child. @@ -23,23 +23,27 @@ class ExistsJoin : public Operation { public: // Constructor. The `existsVariable` (the variable for the added column) must - // not yet be bound by `left`. + // not yet be bound in `left`. ExistsJoin(QueryExecutionContext* qec, std::shared_ptr left, std::shared_ptr right, Variable existsVariable); - // For a given subtree and a given expression, extract all the - // `ExistsExpression`s from the expression and add one `ExistsJoin` per - // `ExistsExpression` to the subtree. The left side of the `ExistsJoin` is - // the input subtree, the right hand side of the `ExistsJoin` as well as the - // variable to which the result is bound are extracted from the - // `ExistsExpression`. The returned subtree can then be used to evaluate the - // `expression`. + // Extract all `ExistsExpression`s from the given `expression`. For each + // `ExistsExpression`, add an `ExistsJoin`. The left side of the first + // `ExistsJoin` is the input `subtree`. The left side of subsequent + // `ExistsJoin`s is the previous `ExistsJoin`. The right side of each + // `ExistsJoin` is the argument of the respective `ExistsExpression`. 
When + // there are no `ExistsExpression`s, return the input `subtree` unchanged. // - // NOTE: `ExistsExpression` is a dummy that only reads the values of the - // column that is added by the `ExistsJoin`. The main work is done by the - // latter and not by the former. + // The returned subtree will contain one additional column for each + // `ExistsExpression`, which contains the result of the respective + // `ExistsJoin`. The `ExistsExpression` just reads the values of this column. + // The main work is done by the `ExistsJoin`. + // + // This function should be called in the constructor of each `Operation`, + // where an `EXISTS` expression can occur. For example, in the constructor of + // `BIND` and `FILTER`. static std::shared_ptr addExistsJoinsToSubtree( const sparqlExpression::SparqlExpressionPimpl& expression, std::shared_ptr subtree, QueryExecutionContext* qec, diff --git a/src/engine/GroupBy.cpp b/src/engine/GroupBy.cpp index 46ff7a410a..95ad6a6e51 100644 --- a/src/engine/GroupBy.cpp +++ b/src/engine/GroupBy.cpp @@ -1,8 +1,7 @@ -// Copyright 2018, University of Freiburg, -// Chair of Algorithms and Data Structures. -// Author: -// 2018 Florian Kramer (florian.kramer@mail.uni-freiburg.de) -// 2020- Johannes Kalmbach (kalmbach@informatik.uni-freiburg.de) +// Copyright 2018 - 2025, University of Freiburg +// Chair of Algorithms and Data Structures +// Authors: Florian Kramer [2018 - 2020] +// Johannes Kalmbach #include "engine/GroupBy.h" @@ -54,11 +53,13 @@ GroupBy::GroupBy(QueryExecutionContext* qec, vector groupByVariables, auto sortColumns = computeSortColumns(subtree.get()); + // Aliases are like `BIND`s, which may contain `EXISTS` expressions. 
for (const auto& alias : _aliases) { subtree = ExistsJoin::addExistsJoinsToSubtree( alias._expression, std::move(subtree), getExecutionContext(), cancellationHandle_); } + _subtree = QueryExecutionTree::createSortedTree(std::move(subtree), sortColumns); } diff --git a/src/engine/MultiColumnJoin.cpp b/src/engine/MultiColumnJoin.cpp index b605616ecb..a831c4cd55 100644 --- a/src/engine/MultiColumnJoin.cpp +++ b/src/engine/MultiColumnJoin.cpp @@ -1,6 +1,7 @@ -// Copyright 2018, University of Freiburg, -// Chair of Algorithms and Data Structures. -// Author: Florian Kramer (florian.kramer@netpun.uni-freiburg.de) +// Copyright 2018 - 2025, University of Freiburg +// Chair of Algorithms and Data Structures +// Authors: Florian Kramer [2018 - 2020] +// Johannes Kalmbach #include "MultiColumnJoin.h" @@ -237,15 +238,16 @@ void MultiColumnJoin::computeMultiColumnJoin( rowAdder.addRow(itLeft - beginLeft, itRight - beginRight); }; - // `isCheap` is true iff there are no UNDEF values in the join columns. In - // this case we can use a much cheaper algorithm. - // TODO There are many other cases where a cheaper implementation can - // be chosen, but we leave those for another PR, this is the most common case. - namespace stdr = ql::ranges; - bool isCheap = stdr::none_of(joinColumns, [&](const auto& jcs) { + // Compute `isCheap`, which is true iff there are no UNDEF values in the join + // columns (in which case we can use a simpler and cheaper join algorithm). + // + // TODO This is the most common case. There are many other cases + // where the generic `zipperJoinWithUndef` can be optimized. We will leave those + // for a later PR. 
+ bool isCheap = ql::ranges::none_of(joinColumns, [&](const auto& jcs) { auto [leftCol, rightCol] = jcs; - return (stdr::any_of(right.getColumn(rightCol), &Id::isUndefined)) || - (stdr::any_of(left.getColumn(leftCol), &Id::isUndefined)); + return (ql::ranges::any_of(right.getColumn(rightCol), &Id::isUndefined)) || + (ql::ranges::any_of(left.getColumn(leftCol), &Id::isUndefined)); }); auto checkCancellationLambda = [this] { checkCancellation(); }; diff --git a/src/engine/sparqlExpressions/ExistsExpression.h b/src/engine/sparqlExpressions/ExistsExpression.h index 1313b342b0..b13071b657 100644 --- a/src/engine/sparqlExpressions/ExistsExpression.h +++ b/src/engine/sparqlExpressions/ExistsExpression.h @@ -1,6 +1,6 @@ -// Copyright 2025, University of Freiburg, -// Chair of Algorithms and Data Structures. -// Author: Johannes Kalmbach +// Copyright 2025, University of Freiburg +// Chair of Algorithms and Data Structures +// Author: Johannes Kalmbach #pragma once @@ -9,18 +9,19 @@ #include "engine/sparqlExpressions/SparqlExpression.h" #include "parser/ParsedQuery.h" -// The expression that corresponds to the `EXISTS` function. -// The implementation only reads the value of a precomputed variable. The actual -// computation of EXISTS is done by the `ExistsJoin` class. +// The `SparqlExpression` for `EXISTS`. The implementation is straightforward +// because it only reads the value computed by the special `ExistsJoin` +// operation, where the actual work is done (see the comments there). namespace sparqlExpression { class ExistsExpression : public SparqlExpression { private: - // The argument (a group graph pattern) of the EXISTS. This is set during the - // parsing and is required and read by the `ExistsJoin` class. + // The argument of the `EXISTS`, which is a group graph pattern. This is set + // during parsing and is used by the `ExistsJoin` operation. 
ParsedQuery argument_; // Each `ExistsExpression` has a unique index and a unique variable name that - // is used to communicate between the `ExistsExpression` and the `ExistsJoin`. + // is used to communicate the result computed by the `ExistsJoin` to this + // `ExistsExpression`. static inline std::atomic indexCounter_ = 0; size_t index_ = ++indexCounter_; Variable variable_{absl::StrCat("?ql_internal_exists_", index_)}; @@ -30,29 +31,32 @@ class ExistsExpression : public SparqlExpression { const auto& argument() const { return argument_; } const auto& variable() const { return variable_; } - // Evaluate only reads the variable which is written by the `ExistsJoin`. + // To evaluate, just return the variable of the column computed by the + // `ExistsJoin`. ExpressionResult evaluate(EvaluationContext* context) const override { AD_CONTRACT_CHECK(context->_variableToColumnMap.contains(variable_)); return variable_; } - //____________________________________________________________________________ + // Return the cache key, which in the normal case depends on the column index + // of the variable computed by the `ExistsJoin`. + // + // There is a special case, where the corresponding `ExistsJoin` has not + // been set up yet (because the query planning is not yet complete). Since we + // cannot cache incomplete operations, we return a random cache key in this + // case. [[nodiscard]] string getCacheKey( const VariableToColumnMap& varColMap) const override { if (varColMap.contains(variable_)) { return absl::StrCat("ExistsExpression col# ", varColMap.at(variable_).columnIndex_); } else { - // This means that the necessary `ExistsJoin` hasn't been set up yet. This - // can for example happen if the parsing (which sets up the - // `ExistsExpression`) is completed, but the query planning (which sets up - // the `ExistsJoin` is still in progress). It is not possible to cache - // such incomplete operations, so we return a random cache key. 
return std::to_string(ad_utility::FastRandomIntGenerator{}()); } } - // This is in fact an `ExistsExpression`. + // This is the one expresssion, where this function should return `true`. + // Used to extract `EXISTS` expressions from a general expression tree. bool isExistsExpression() const override { return true; } private: diff --git a/src/engine/sparqlExpressions/SparqlExpression.h b/src/engine/sparqlExpressions/SparqlExpression.h index 7f5c551127..f033f27edc 100644 --- a/src/engine/sparqlExpressions/SparqlExpression.h +++ b/src/engine/sparqlExpressions/SparqlExpression.h @@ -127,7 +127,7 @@ class SparqlExpression { // implementation returns `false`. virtual bool isExistsExpression() const; - // Return non-null pointers to all `EXISTS` expressions in the subtree. + // Return non-null pointers to all `EXISTS` expressions in expression tree. // The result is passed in as a reference to simplify the recursive // implementation. virtual void getExistsExpressions( diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.cpp b/src/parser/sparqlParser/SparqlQleverVisitor.cpp index da375c9f40..1fc6729e0e 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.cpp +++ b/src/parser/sparqlParser/SparqlQleverVisitor.cpp @@ -1,9 +1,8 @@ -// Copyright 2021 - 2024, University of Freiburg +// Copyright 2021 - 2025, University of Freiburg // Chair of Algorithms and Data Structures -// Authors: -// 2021 - Hannah Bast -// 2022 Julian Mundhahs -// 2022 - Johannes Kalmbach +// Authors: Julian Mundhahs +// Hannah Bast +// Johannes Kalmbach #include "parser/sparqlParser/SparqlQleverVisitor.h" @@ -2443,28 +2442,33 @@ SparqlExpression::Ptr Visitor::visit(Parser::StrReplaceExpressionContext* ctx) { std::move(children.at(2))); } -// ____________________________________________________________________________________ +// ____________________________________________________________________________ ExpressionPtr Visitor::visitExists(Parser::GroupGraphPatternContext* pattern, bool negate) { 
- // The argument of the EXISTS is a completely independent GroupGraphPattern - // (except for the FROM [NAMED] clauses), so we have to back up and restore - // all global state when parsing EXISTS. + // The argument of `EXISTS` is a `GroupGraphPattern` that is independent from + // the rest of the query (except for the `FROM` and `FROM NAMED` clauses, + // which also apply to the argument of `EXISTS`). We therefore have to back up + // and restore all global state when parsing `EXISTS`. auto queryBackup = std::exchange(parsedQuery_, ParsedQuery{}); - auto visibleVariablesSoFar = std::move(visibleVariables_); + auto visibleVariablesBackup = std::move(visibleVariables_); visibleVariables_.clear(); - // Parse the argument of EXISTS. + // Parse the argument of `EXISTS`. auto group = visit(pattern); ParsedQuery argumentOfExists = std::exchange(parsedQuery_, std::move(queryBackup)); argumentOfExists.selectClause().setAsterisk(); argumentOfExists._rootGraphPattern = std::move(group); - // EXISTS inherits the FROM [NAMED] clauses from the outer argumentOfExists. + // The argument of `EXISTS` inherits the `FROM` and `FROM NAMED` clauses from + // the outer query. argumentOfExists.datasetClauses_ = activeDatasetClauses_; - visibleVariables_ = std::move(visibleVariablesSoFar); + visibleVariables_ = std::move(visibleVariablesBackup); auto exists = std::make_unique( std::move(argumentOfExists)); + + // Handle `NOT EXISTS` (which is syntactically distinct from `! EXISTS`) by + // simply negating the `ExistsExpression`. 
if (negate) { return sparqlExpression::makeUnaryNegateExpression(std::move(exists)); } else { diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.h b/src/parser/sparqlParser/SparqlQleverVisitor.h index 2fd0d6bc9b..205309a5fa 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.h +++ b/src/parser/sparqlParser/SparqlQleverVisitor.h @@ -79,10 +79,14 @@ class SparqlQleverVisitor { // `addVisibleVariable`. 
std::vector visibleVariables_{}; - // The FROM [NAMED] clauses of the query that is currently being parsed. - // Those are currently needed when parsing an EXISTS clause inside the query. + // The `FROM` and `FROM NAMED` clauses of the query that is currently + // being parsed. Those are inherited by certain constructs, which are + // otherwise independent (in particular, `EXISTS` and `DESCRIBE`). ParsedQuery::DatasetClauses activeDatasetClauses_; + + // The map from prefixes to their full IRIs. PrefixMap prefixMap_{}; + // We need to remember the prologue (prefix declarations) when we encounter it // because we need it when we encounter a SERVICE query. When there is no // prologue, this string simply remains empty. @@ -448,8 +452,6 @@ class SparqlQleverVisitor { ExpressionPtr visit(Parser::StrReplaceExpressionContext* ctx); - // The common implementation of the parsing of `EXISTS` and `NOT EXISTS`. - // The second argument is `true` for `NOT EXISTS`. ExpressionPtr visitExists(Parser::GroupGraphPatternContext* pattern, bool negate); diff --git a/test/ExceptionTest.cpp b/test/ExceptionTest.cpp index eaf0d0504d..4cc649ebc0 100644 --- a/test/ExceptionTest.cpp +++ b/test/ExceptionTest.cpp @@ -1,6 +1,6 @@ -// Copyright 2023, University of Freiburg, -// Chair of Algorithms and Data Structures. -// Author: Johannes Kalmbach +// Copyright 2023 - 2025, University of Freiburg +// Chair of Algorithms and Data Structures. 
+// Author: Johannes Kalmbach #include #include diff --git a/test/QueryPlannerTest.cpp b/test/QueryPlannerTest.cpp index 89601732e8..688b4abaf3 100644 --- a/test/QueryPlannerTest.cpp +++ b/test/QueryPlannerTest.cpp @@ -1,7 +1,7 @@ -// Copyright 2015 - 2024, University of Freiburg +// Copyright 2015 - 2025, University of Freiburg // Chair of Algorithms and Data Structures // Authors: Björn Buchhold [2015 - 2017] -// Johannes Kalmbach [2018 - 2024] +// Johannes Kalmbach #include @@ -2919,10 +2919,11 @@ TEST(QueryPlanner, Exists) { auto xyz = h::IndexScanFromStrings("?x", "?y", "?z"); auto abc = h::IndexScanFromStrings("?a", "?b", "?c"); using V = Variable; + // Simple tests for EXISTS with FILTER, BIND, and GROUP BY. - h::expect("SELECT * { ?x ?y ?z FILTER EXISTS {?a ?b ?c}}", + h::expect("SELECT * { ?x ?y ?z FILTER EXISTS {?a ?b ?c} }", h::Filter("EXISTS {?a ?b ?c}", h::ExistsJoin(xyz, abc))); - h::expect("SELECT * { ?x ?y ?z BIND(EXISTS {?a ?b ?c} as ?bound)}", + h::expect("SELECT * { ?x ?y ?z BIND(EXISTS {?a ?b ?c} as ?bound) }", h::Bind(h::ExistsJoin(xyz, abc), "EXISTS {?a ?b ?c}", Variable("?bound"))); h::expect( @@ -2935,9 +2936,9 @@ TEST(QueryPlanner, Exists) { auto xyzg = h::IndexScanFromStrings("?x", "?y", "?z", {}, H{""}); auto abcg = h::IndexScanFromStrings("?a", "?b", "?c", {}, H{""}); + // Various uses of FILTER EXISTS. auto existsJoin = h::ExistsJoin(xyzg, abcg); auto filter = h::Filter("EXISTS {?a ?b ?c}", existsJoin); - // (use a lambda that only changes the beginning of the query). 
h::expect("SELECT * FROM { ?x ?y ?z FILTER EXISTS {?a ?b ?c}}", filter); h::expect("ASK FROM { ?x ?y ?z FILTER EXISTS {?a ?b ?c}}", filter); h::expect( diff --git a/test/SparqlAntlrParserTest.cpp b/test/SparqlAntlrParserTest.cpp index a5f00ba723..1515d3605a 100644 --- a/test/SparqlAntlrParserTest.cpp +++ b/test/SparqlAntlrParserTest.cpp @@ -1,4 +1,4 @@ -// Copyright 2021 - 2024, University of Freiburg +// Copyright 2021 - 2025, University of Freiburg // Chair of Algorithms and Data Structures // Authors: Johannes Kalmbach // Julian Mundhahs @@ -1890,8 +1890,9 @@ auto notExistsMatcher(Matcher pattern) { TEST(SparqlParser, Exists) { using namespace existsTestHelpers; auto expectBuiltInCall = ExpectCompleteParse<&Parser::builtInCall>{}; - // A matcher that matches the query `SELECT * { ?x ?foo}`, where the - // FROM and FROM NAMED clauses can still be specified via arguments. + + // A matcher that matches the query `SELECT * { ?x ?foo }`, where the + // FROM and FROM NAMED clauses can be specified as arguments. 
using Graphs = ScanSpecificationAsTripleComponent::Graphs; auto selectABarFooMatcher = [](Graphs defaultGraphs = std::nullopt, Graphs namedGraphs = std::nullopt) { @@ -1900,6 +1901,7 @@ TEST(SparqlParser, Exists) { m::GraphPattern(m::Triples({{Var{"?a"}, "", Var{"?foo"}}})), defaultGraphs, namedGraphs)); }; + expectBuiltInCall("EXISTS {?a ?foo}", existsMatcher(selectABarFooMatcher())); expectBuiltInCall("NOT EXISTS {?a ?foo}", diff --git a/test/engine/ExistsJoinTest.cpp b/test/engine/ExistsJoinTest.cpp index af72e5fbb6..197fdeeba6 100644 --- a/test/engine/ExistsJoinTest.cpp +++ b/test/engine/ExistsJoinTest.cpp @@ -1,4 +1,4 @@ -// Copyright 2024, University of Freiburg +// Copyright 2024 - 2025, University of Freiburg // Chair of Algorithms and Data Structures // Author: Johannes Kalmbach @@ -15,6 +15,13 @@ using namespace ad_utility::testing; namespace { + +// Helper function that computes an `ExistsJoin` of the given `leftInput` and +// `rightInput` and checks that the result columns is equal to `expectedAsBool`. +// The first `numJoinColumns` columns of both `leftInput` and `rightInput` are +// used as join columns. +// +// TODO Also test permutations of the join columns. void testExists(const VectorTable& leftInput, const VectorTable& rightInput, std::vector expectedAsBool, size_t numJoinColumns) { AD_CORRECTNESS_CHECK(leftInput.size() == expectedAsBool.size()); @@ -27,14 +34,12 @@ void testExists(const VectorTable& leftInput, const VectorTable& rightInput, using V = Variable; using Vars = std::vector>; - // TODO Support more than one join column. - // TODO also randomly permute the join columns. - + // Helper lambda `makeChild` that turns a `VectorTable` input into a + // `QueryExecutionTree` with a `ValuesForTesting` operation. 
auto joinCol = [](size_t i) { return V{absl::StrCat("?joinCol_", i)}; }; auto nonJoinCol = [i = 0]() mutable { return V{absl::StrCat("?nonJoinCol_", i++)}; }; - auto makeChild = [&](const IdTable& input) { Vars vars; for (size_t i : ad_utility::integerRange(numJoinColumns)) { @@ -48,11 +53,10 @@ void testExists(const VectorTable& leftInput, const VectorTable& rightInput, vars); }; + // Compute the `ExistsJoin` and check the result. auto exists = ExistsJoin{qec, makeChild(left), makeChild(right), V{"?exists"}}; - EXPECT_EQ(exists.getResultWidth(), left.numColumns() + 1); - auto res = exists.computeResultOnlyForTesting(); const auto& table = res.idTable(); ASSERT_EQ(table.numRows(), left.size()); @@ -69,20 +73,20 @@ TEST(Exists, computeResult) { testExists({{3, 6}, {4, 7}, {5, 8}}, {{3, 15}, {3, 19}, {5, 37}}, {true, false, true}, 1); - // UNDEF matches everything + // Single join column with one UNDEF (which always matches). auto U = Id::makeUndefined(); testExists({{U, 13}, {3, 6}, {4, 7}, {5, 8}}, {{3, 15}, {3, 19}, {5, 37}}, {true, true, false, true}, 1); testExists({{3, 6}, {4, 7}, {5, 8}}, {{U, 15}}, {true, true, true}, 1); - // Two join columns + // Two join columns. testExists({{3, 6}, {4, 7}, {5, 8}}, {{3, 15}, {3, 19}, {5, 37}}, {false, false, false}, 2); testExists({{3, 6}, {4, 7}, {5, 8}}, {{3, 6, 11}, {3, 19, 7}, {4, 8, 0}, {5, 8, 37}}, {true, false, true}, 2); - // Two join columns with UNDEF + // Two join columns with UNDEFs in each column. 
testExists({{2, 2}, {3, U}, {4, 8}, {5, 8}}, {{U, 8}, {3, 15}, {3, 19}, {5, U}, {5, 37}}, {false, true, true, true}, 2); From c2abaddabd663f0d53339a880c491ca9e73452e5 Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Wed, 5 Feb 2025 03:39:49 +0100 Subject: [PATCH 24/30] Fix typo --- src/engine/sparqlExpressions/ExistsExpression.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/engine/sparqlExpressions/ExistsExpression.h b/src/engine/sparqlExpressions/ExistsExpression.h index b13071b657..dd880bdb09 100644 --- a/src/engine/sparqlExpressions/ExistsExpression.h +++ b/src/engine/sparqlExpressions/ExistsExpression.h @@ -55,7 +55,7 @@ class ExistsExpression : public SparqlExpression { } } - // This is the one expresssion, where this function should return `true`. + // This is the one expression, where this function should return `true`. // Used to extract `EXISTS` expressions from a general expression tree. bool isExistsExpression() const override { return true; } From ee495f41a29e153f0d972a79c47e9e468d27120d Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Fri, 14 Feb 2025 18:55:08 +0100 Subject: [PATCH 25/30] The test is currently not compiling, as we still have to apply several changes. 
Signed-off-by: Johannes Kalmbach --- src/engine/ExistsJoin.cpp | 49 +++++++++++++++++++++------------- test/SparqlAntlrParserTest.cpp | 16 +++++++++++ test/engine/ExistsJoinTest.cpp | 48 ++++++++++++++++++++------------- 3 files changed, 75 insertions(+), 38 deletions(-) diff --git a/src/engine/ExistsJoin.cpp b/src/engine/ExistsJoin.cpp index a58a22a47c..94ca355b4b 100644 --- a/src/engine/ExistsJoin.cpp +++ b/src/engine/ExistsJoin.cpp @@ -4,6 +4,7 @@ #include "engine/ExistsJoin.h" +#include "CallFixedSize.h" #include "engine/QueryPlanner.h" #include "engine/sparqlExpressions/ExistsExpression.h" #include "engine/sparqlExpressions/SparqlExpression.h" @@ -127,27 +128,37 @@ ProtoResult ExistsJoin::computeResult([[maybe_unused]] bool requestLaziness) { // Boolean column) should be `false`. std::vector> notExistsIndices{ allocator()}; - // The callback is called with iterators, so we convert them back to indices. - auto actionForNotExisting = - [&notExistsIndices, begin = joinColumnsLeft.begin()](const auto& itLeft) { - notExistsIndices.push_back(itLeft - begin); - }; - - // Run `zipperJoinWithUndef` with the described callbacks and the mentioned - // optimization in case we know that there are no UNDEF values in the join + // Run the actual exists join, but use `callFixedSize` for the number of join // columns. - auto checkCancellationLambda = [this] { checkCancellation(); }; - auto runZipperJoin = [&](auto findUndef) { - [[maybe_unused]] auto numOutOfOrder = ad_utility::zipperJoinWithUndef( - joinColumnsLeft, joinColumnsRight, ql::ranges::lexicographical_compare, - noopRowAdder, findUndef, findUndef, actionForNotExisting, - checkCancellationLambda); + auto runForNumJoinCols = [&notExistsIndices, isCheap, &noopRowAdder, + &colsLeftDynamic = joinColumnsLeft, + &colsRightDynamic = joinColumnsRight, + this]() { + // The callback is called with iterators, so we convert them back to + // indices. 
+ auto joinColumnsLeft = colsLeftDynamic.asStaticView(); + auto joinColumnsRight = colsRightDynamic.asStaticView(); + auto actionForNotExisting = + [&notExistsIndices, begin = joinColumnsLeft.begin()]( + const auto& itLeft) { notExistsIndices.push_back(itLeft - begin); }; + + // Run `zipperJoinWithUndef` with the described callbacks and the mentioned + // optimization in case we know that there are no UNDEF values in the join + // columns. + auto checkCancellationLambda = [this] { checkCancellation(); }; + auto runZipperJoin = [&](auto findUndef) { + [[maybe_unused]] auto numOutOfOrder = ad_utility::zipperJoinWithUndef( + joinColumnsLeft, joinColumnsRight, + ql::ranges::lexicographical_compare, noopRowAdder, findUndef, + findUndef, actionForNotExisting, checkCancellationLambda); + }; + if (isCheap) { + runZipperJoin(ad_utility::noop); + } else { + runZipperJoin(ad_utility::findSmallerUndefRanges); + } }; - if (isCheap) { - runZipperJoin(ad_utility::noop); - } else { - runZipperJoin(ad_utility::findSmallerUndefRanges); - } + ad_utility::callFixedSize(numJoinColumns, runForNumJoinCols); // Add the result column from the computed `notExistsIndices` (which tell us // where the value should be `false`). diff --git a/test/SparqlAntlrParserTest.cpp b/test/SparqlAntlrParserTest.cpp index 181d395446..49df3bfe98 100644 --- a/test/SparqlAntlrParserTest.cpp +++ b/test/SparqlAntlrParserTest.cpp @@ -52,9 +52,11 @@ const ad_utility::HashMap defaultPrefixMap{ template auto parse = [](const string& input, SparqlQleverVisitor::PrefixMap prefixes = {}, + ParsedQuery::DatasetClauses clauses = {}, SparqlQleverVisitor::DisableSomeChecksOnlyForTesting disableSomeChecks = SparqlQleverVisitor::DisableSomeChecksOnlyForTesting::False) { ParserAndVisitor p{input, std::move(prefixes), disableSomeChecks}; + // TODO also propagate the active dataset clauses. 
if (testInsideConstructTemplate) { p.visitor_.setParseModeToInsideConstructTemplateForTesting(); } @@ -115,6 +117,20 @@ struct ExpectCompleteParse { matcher, l); }); }; + + auto operator()(const string& input, + const testing::Matcher& matcher, + ParsedQuery::DatasetClauses activeDatasetClauses, + ad_utility::source_location l = + ad_utility::source_location::current()) const { + auto tr = generateLocationTrace(l, "successful parsing was expected here"); + EXPECT_NO_THROW({ + return expectCompleteParse( + parse( + input, std::move(prefixMap), disableSomeChecks), + matcher, l); + }); + }; }; template diff --git a/test/engine/ExistsJoinTest.cpp b/test/engine/ExistsJoinTest.cpp index 40233e9097..e16d9b3ba7 100644 --- a/test/engine/ExistsJoinTest.cpp +++ b/test/engine/ExistsJoinTest.cpp @@ -15,6 +15,12 @@ using namespace ad_utility::testing; namespace { + +// Helper function that computes an `ExistsJoin` of the given `left` and +// `right` and checks that the result columns is equal to `expectedAsBool`. +// The first `numJoinColumns` columns of both `leftInput` and `rightInput` are +// used as join columns. +// void testExistsFromIdTable(IdTable left, IdTable right, std::vector expectedAsBool, size_t numJoinColumns) { @@ -22,27 +28,29 @@ void testExistsFromIdTable(IdTable left, IdTable right, AD_CORRECTNESS_CHECK(left.numColumns() >= numJoinColumns); AD_CORRECTNESS_CHECK(right.numColumns() >= numJoinColumns); - // Permute the join columns. - auto colsLeft = ad_utility::integerRange(left.numColumns()); - std::vector leftPermutation; - ql::ranges::copy(colsLeft, std::back_inserter(leftPermutation)); - left.setColumnSubset(leftPermutation); - - auto colsRight = ad_utility::integerRange(right.numColumns()); - std::vector rightPermutation; - ql::ranges::copy(colsRight, std::back_inserter(rightPermutation)); - right.setColumnSubset(rightPermutation); - - // The expected output depends on the (sorted) input, even if we shuffle it - // afterward. 
+ // Randomly permute the columns of the `input` and return the permutation that + // was applied + auto permuteColumns = [](auto& table) { + auto colsView = ad_utility::integerRange(table.numColumns()); + std::vector permutation; + ql::ranges::copy(colsView, std::back_inserter(permutation)); + table.setColumnSubset(permutation); + return permutation; + }; + // Permute the columns. + auto leftPermutation = permuteColumns(left); + auto rightPermutation = permuteColumns(right); + + // We have to make the deep copy of `left` for the expected result at exactly + // this point: The permutation of the columns (above) also affects the + // expected result, while the permutation of the rows (which will be applied + // below) doesn't affect it, as the `ExistsJoin` internally sorts its inputs. IdTable expected = left.clone(); // Randomly shuffle the inputs, to ensure that the `existsJoin` correctly // pre-sorts its inputs. - std::random_device rd; - std::mt19937 g(rd()); - std::shuffle(left.begin(), left.end(), g); - std::shuffle(right.begin(), right.end(), g); + ad_utility::randomShuffle(left.begin(), left.end()); + ad_utility::randomShuffle(right.begin(), right.end()); auto qec = getQec(); using V = Variable; @@ -69,8 +77,8 @@ void testExistsFromIdTable(IdTable left, IdTable right, }; // Compute the `ExistsJoin` and check the result. - auto exists = - ExistsJoin{qec, makeChild(left), makeChild(right), V{"?exists"}}; + auto exists = ExistsJoin{qec, makeChild(left, leftPermutation), + makeChild(right, rightPermutation), V{"?exists"}}; EXPECT_EQ(exists.getResultWidth(), left.numColumns() + 1); auto res = exists.computeResultOnlyForTesting(); const auto& table = res.idTable(); @@ -81,6 +89,8 @@ void testExistsFromIdTable(IdTable left, IdTable right, EXPECT_THAT(table, matchesIdTable(expected)); } +// Same as the function above, but conveniently takes `VectorTable`s instead of +// `IdTable`s. 
void testExists(const VectorTable& leftInput, const VectorTable& rightInput, std::vector expectedAsBool, size_t numJoinColumns) { auto left = makeIdTableFromVector(leftInput); From ca30b5a217f384955213235b30e7d680d6a7ba30 Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Fri, 14 Feb 2025 20:01:02 +0100 Subject: [PATCH 26/30] Also test different datasets. Signed-off-by: Johannes Kalmbach --- src/parser/sparqlParser/SparqlQleverVisitor.h | 5 +++ test/SparqlAntlrParserTest.cpp | 31 ++++++++++++++++--- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.h b/src/parser/sparqlParser/SparqlQleverVisitor.h index aa5ab52197..4e761934d1 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.h +++ b/src/parser/sparqlParser/SparqlQleverVisitor.h @@ -120,6 +120,11 @@ class SparqlQleverVisitor { isInsideConstructTriples_ = true; } + void setActiveDatasetClausesForTesting( + ParsedQuery::DatasetClauses datasetClauses) { + activeDatasetClauses_ = std::move(datasetClauses); + } + // ___________________________________________________________________________ ParsedQuery visit(Parser::QueryOrUpdateContext* ctx); diff --git a/test/SparqlAntlrParserTest.cpp b/test/SparqlAntlrParserTest.cpp index 49df3bfe98..9deb2fc0ef 100644 --- a/test/SparqlAntlrParserTest.cpp +++ b/test/SparqlAntlrParserTest.cpp @@ -56,7 +56,7 @@ auto parse = SparqlQleverVisitor::DisableSomeChecksOnlyForTesting disableSomeChecks = SparqlQleverVisitor::DisableSomeChecksOnlyForTesting::False) { ParserAndVisitor p{input, std::move(prefixes), disableSomeChecks}; - // TODO also propagate the active dataset clauses. 
+ p.visitor_.setActiveDatasetClausesForTesting(std::move(clauses)); if (testInsideConstructTemplate) { p.visitor_.setParseModeToInsideConstructTemplateForTesting(); } @@ -113,7 +113,7 @@ struct ExpectCompleteParse { EXPECT_NO_THROW({ return expectCompleteParse( parse( - input, std::move(prefixMap), disableSomeChecks), + input, std::move(prefixMap), {}, disableSomeChecks), matcher, l); }); }; @@ -127,7 +127,7 @@ struct ExpectCompleteParse { EXPECT_NO_THROW({ return expectCompleteParse( parse( - input, std::move(prefixMap), disableSomeChecks), + input, {}, std::move(activeDatasetClauses), disableSomeChecks), matcher, l); }); }; @@ -152,7 +152,7 @@ struct ExpectParseFails { ad_utility::source_location l = ad_utility::source_location::current()) { auto trace = generateLocationTrace(l); AD_EXPECT_THROW_WITH_MESSAGE( - parse(input, std::move(prefixMap), disableSomeChecks), + parse(input, std::move(prefixMap), {}, disableSomeChecks), messageMatcher); } }; @@ -1954,6 +1954,29 @@ TEST(SparqlParser, Exists) { existsMatcher(selectABarFooMatcher())); expectBuiltInCall("NOT EXISTS {?a ?foo}", notExistsMatcher(selectABarFooMatcher())); + + Graphs defaultGraphs{ad_utility::HashSet{iri("")}}; + Graphs namedGraphs{ad_utility::HashSet{iri("")}}; + + // Now run the same tests, but with non-empty dataset clauses, that have to be + // propagated to the `ParsedQuery` stored inside the `ExistsExpression`. 
+ ParsedQuery::DatasetClauses datasetClauses; + datasetClauses.defaultGraphs_ = defaultGraphs; + datasetClauses.namedGraphs_ = namedGraphs; + datasetClauses.defaultGraphs_.value().insert(iri("")); + expectBuiltInCall("EXISTS {?a ?foo}", + existsMatcher(selectABarFooMatcher())); + expectBuiltInCall("NOT EXISTS {?a ?foo}", + notExistsMatcher(selectABarFooMatcher())); + + expectBuiltInCall( + "EXISTS {?a ?foo}", + existsMatcher(selectABarFooMatcher(defaultGraphs, namedGraphs)), + datasetClauses); + expectBuiltInCall( + "NOT EXISTS {?a ?foo}", + notExistsMatcher(selectABarFooMatcher(defaultGraphs, namedGraphs)), + datasetClauses); } namespace aggregateTestHelpers { From d48d76b0a12bbd49c222130d0e7a3681293c34bf Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Fri, 14 Feb 2025 20:06:54 +0100 Subject: [PATCH 27/30] Fix the name of the conformance test-suite Signed-off-by: Johannes Kalmbach --- .github/workflows/sparql-conformance.yml | 2 ++ .github/workflows/upload-sparql-conformance.yml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/sparql-conformance.yml b/.github/workflows/sparql-conformance.yml index 0b496ccefc..8ecdbfebb6 100644 --- a/.github/workflows/sparql-conformance.yml +++ b/.github/workflows/sparql-conformance.yml @@ -1,3 +1,5 @@ +name: sparql-test-suite + on: push: branches: [ master ] diff --git a/.github/workflows/upload-sparql-conformance.yml b/.github/workflows/upload-sparql-conformance.yml index fe9be1580c..390c4e446a 100644 --- a/.github/workflows/upload-sparql-conformance.yml +++ b/.github/workflows/upload-sparql-conformance.yml @@ -2,7 +2,7 @@ name: Upload conformance tests result on: workflow_run: - # This has to be the `name:` of the workflow in `code_coverage.yml`. + # This has to be the `name:` of the workflow in `sparql-conformance.yml`. # Start when this workflow has finished successfully. 
workflows: [sparql-test-suite] types: From cfe3c17ef53c1b67dee05d08ec63a49a54461ce8 Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Fri, 14 Feb 2025 21:26:44 +0100 Subject: [PATCH 28/30] Minor improvements from Hannah's review --- src/engine/ExistsJoin.cpp | 12 ++++++------ src/engine/sparqlExpressions/ExistsExpression.h | 7 ++++--- test/SparqlExpressionTest.cpp | 8 +++++--- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/src/engine/ExistsJoin.cpp b/src/engine/ExistsJoin.cpp index 94ca355b4b..902f551ddb 100644 --- a/src/engine/ExistsJoin.cpp +++ b/src/engine/ExistsJoin.cpp @@ -109,8 +109,8 @@ ProtoResult ExistsJoin::computeResult([[maybe_unused]] bool requestLaziness) { // columns (in which case we can use a simpler and cheaper join algorithm). // // TODO This is the most common case. There are many other cases - // where the generic `zipperJoinWithUndef` can be optimized. We will those - // for a later PR. + // where the generic `zipperJoinWithUndef` can be optimized. This is work for + // a future PR. size_t numJoinColumns = joinColumnsLeft.numColumns(); AD_CORRECTNESS_CHECK(numJoinColumns == joinColumnsRight.numColumns()); bool isCheap = ql::ranges::none_of( @@ -128,14 +128,14 @@ ProtoResult ExistsJoin::computeResult([[maybe_unused]] bool requestLaziness) { // Boolean column) should be `false`. std::vector> notExistsIndices{ allocator()}; - // Run the actual exists join, but use `callFixedSize` for the number of join - // columns. + // Helper lambda for computing the exists join with `callFixedSize`, which + // makes the number of join columns a template parameter. auto runForNumJoinCols = [&notExistsIndices, isCheap, &noopRowAdder, &colsLeftDynamic = joinColumnsLeft, &colsRightDynamic = joinColumnsRight, this]() { - // The callback is called with iterators, so we convert them back to - // indices. + // The `actionForNotExisting` callback gets iterators as input, but should + // output indices, hence the pointer arithmetic. 
auto joinColumnsLeft = colsLeftDynamic.asStaticView(); auto joinColumnsRight = colsRightDynamic.asStaticView(); auto actionForNotExisting = diff --git a/src/engine/sparqlExpressions/ExistsExpression.h b/src/engine/sparqlExpressions/ExistsExpression.h index 675209dbcc..afaa026344 100644 --- a/src/engine/sparqlExpressions/ExistsExpression.h +++ b/src/engine/sparqlExpressions/ExistsExpression.h @@ -51,9 +51,10 @@ class ExistsExpression : public SparqlExpression { return absl::StrCat("ExistsExpression col# ", varColMap.at(variable_).columnIndex_); } else { - // This means that the necessary `ExistsJoin` hasn't been set up yet. This - // can for example happen if `getCacheKey` is called during the query - // planning. + // This means that the necessary `ExistsJoin` hasn't been set up yet. For + // example, this can happen if `getCacheKey` is called during query + // planning (which is done to avoid redundant evaluation in the case of + // identical subtrees in the query plan). return absl::StrCat("Uninitialized Exists: ", ad_utility::FastRandomIntGenerator{}()); } diff --git a/test/SparqlExpressionTest.cpp b/test/SparqlExpressionTest.cpp index aa6e1fc909..d53f3ebc4f 100644 --- a/test/SparqlExpressionTest.cpp +++ b/test/SparqlExpressionTest.cpp @@ -1468,9 +1468,11 @@ TEST(SingleUseExpression, simpleMembersForTestCoverage) { EXPECT_ANY_THROW(expression.getCacheKey({})); } -// The actual implementation of EXISTS is done in the ExistsJoin, which is also -// properly tested. -TEST(ExistsExpression, dummyTests) { +// This just tests basic functionality of the `ExistsExpression` class. Since +// the actual implementation of the `EXISTS` operator is done in the +// `ExistsJoin` class, most of the testing happens in +// `test/engine/ExistsJoinTest.cpp`. 
+TEST(ExistsExpression, basicFunctionality) { ExistsExpression exists{ParsedQuery{}}; auto var = exists.variable(); TestContext context; From 608d0ea55ab442602671d2221f4151a7ce55a4ed Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Fri, 14 Feb 2025 21:58:01 +0100 Subject: [PATCH 29/30] Re-insert the `baseIri_` declaration in `SparqlQleverVisitor.h` It got lost in the merge conflict resolution. --- src/parser/sparqlParser/SparqlQleverVisitor.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.h b/src/parser/sparqlParser/SparqlQleverVisitor.h index 395303d2f5..412f2677f6 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.h +++ b/src/parser/sparqlParser/SparqlQleverVisitor.h @@ -88,6 +88,9 @@ class SparqlQleverVisitor { // The map from prefixes to their full IRIs. PrefixMap prefixMap_{}; + // The `BASE` IRI of the query if any. + ad_utility::triple_component::Iri baseIri_{}; + // We need to remember the prologue (prefix declarations) when we encounter it // because we need it when we encounter a SERVICE query. When there is no // prologue, this string simply remains empty. 
From 092e0d91d22d3e541253dfcce728fd6cd5d5065e Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Sat, 15 Feb 2025 15:26:08 +0100 Subject: [PATCH 30/30] Revert changes in .github/workflows --- .github/workflows/sparql-conformance.yml | 2 +- .../workflows/upload-sparql-conformance.yml | 61 ------------------- 2 files changed, 1 insertion(+), 62 deletions(-) delete mode 100644 .github/workflows/upload-sparql-conformance.yml diff --git a/.github/workflows/sparql-conformance.yml b/.github/workflows/sparql-conformance.yml index 8ecdbfebb6..3e4bdfd63d 100644 --- a/.github/workflows/sparql-conformance.yml +++ b/.github/workflows/sparql-conformance.yml @@ -1,4 +1,4 @@ -name: sparql-test-suite +name: sparql-conformance on: push: diff --git a/.github/workflows/upload-sparql-conformance.yml b/.github/workflows/upload-sparql-conformance.yml deleted file mode 100644 index 390c4e446a..0000000000 --- a/.github/workflows/upload-sparql-conformance.yml +++ /dev/null @@ -1,61 +0,0 @@ -name: Upload conformance tests result - -on: - workflow_run: - # This has to be the `name:` of the workflow in `sparql-conformance.yml`. - # Start when this workflow has finished successfully. - workflows: [sparql-test-suite] - types: - - completed - -jobs: - upload: - runs-on: ubuntu-latest - if: > - github.event.workflow_run.event == 'pull_request' && - github.event.workflow_run.conclusion == 'success' - steps: - - name: 'Download artifact' - uses: actions/github-script@v6 - # The following script is taken from the link stated at the - # beginning of this file. It manually downloads an artifact - # from another workflow. 
- with: - script: | - var artifacts = await github.rest.actions.listWorkflowRunArtifacts({ - owner: context.repo.owner, - repo: context.repo.repo, - run_id: ${{github.event.workflow_run.id }}, - }); - var matchArtifact = artifacts.data.artifacts.filter((artifact) => { - return artifact.name == "conformance-report" - })[0]; - var download = await github.rest.actions.downloadArtifact({ - owner: context.repo.owner, - repo: context.repo.repo, - artifact_id: matchArtifact.id, - archive_format: 'zip', - }); - var fs = require('fs'); - fs.writeFileSync('${{github.workspace}}/conformance-report.zip', Buffer.from(download.data)); - - run: unzip conformance-report.zip - # Read the metadata into environment variables. - - name: "Read PR number" - run: echo "pr_number=`cat pr`" >> $GITHUB_ENV - - name: "Read Github Ref" - run: echo "original_github_ref=`cat github_ref`" >> $GITHUB_ENV; - - name: "Read Github SHA" - run: echo "commit_sha=`cat sha`" >> $GITHUB_ENV; - - name: "Read Github Repository" - run: echo "original_github_repository=`cat github_repository`" >> $GITHUB_ENV; - # We have to check out the source code from the PR, otherwise Codecov - # won't process the upload properly. We first check it out into a - # subdirectory `qlever-source`, otherwise the coverage report will - # be overwritten. We then move all the files back into the working - # directory such that Codecov will pick them up properly. - - name: "Submit data to server" - env: - SERVER_URL: ${{ secrets.SPARQL_CONFORMANCE_SERVER_URL }} - API_KEY: ${{ secrets.SPARQL_CONFORMANCE_SERVER_KEY }} - run: | - curl -H "x-api-key: $API_KEY" -H "event: ${{github.event.workflow_run.event}}" -H "sha: ${{env.commit_sha}}" -H "pr-number: ${{env.pr_number}}" -H "repo: ${{env.original_github_repository}}" -F "file=@${{env.commit_sha}}.json.bz2" $SERVER_URL/upload \ No newline at end of file